In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the training data from a CSV file in the current working directory.
df_train_set = pd.read_csv('train_aggregated.csv')

In [None]:
# Preview the first five rows of the raw training data.
df_train_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,Bus,49,1.0
1,5437,19-11-17,7:12,Migori,Bus,49,1.0
2,5710,26-11-17,7:05,Keroka,Bus,49,1.0
3,5777,27-11-17,7:10,Homa Bay,Bus,49,5.0
4,5778,27-11-17,7:12,Migori,Bus,49,31.0


In [None]:
# ride_id is removed so that it is not carried into the training features.
df_train_set = df_train_set.drop(columns=["ride_id"])

In [None]:
# Convert travel_date to datetime and derive the day of the year.
# The raw strings are day-first (e.g. "17-10-17" is 17 Oct 2017), so this is
# stated explicitly: without it an ambiguous value such as "05-12-17" can be
# read month-first, which silently corrupts travel_day.
# infer_datetime_format is dropped because it is deprecated in pandas 2.x.
df_train_set["travel_date"] = pd.to_datetime(df_train_set["travel_date"], dayfirst=True)
df_train_set["travel_day"] = df_train_set["travel_date"].dt.dayofyear

In [None]:
# Encode car_type as integer codes; the category labels are kept so the test
# set can be encoded with the same mapping.
car_type_cat = pd.Categorical(df_train_set["car_type"])
car_type_categories = car_type_cat.categories
df_train_set["car_type"] = car_type_cat.codes

In [None]:
# Encode travel_from as integer codes; the category labels are kept so the
# test set can be encoded with the same mapping.
travel_from_cat = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = travel_from_cat.categories
df_train_set["travel_from"] = travel_from_cat.codes

In [None]:
# Express the "H:MM" departure time as fractional hours (e.g. 7:15 -> 7.25).
train_time_parts = df_train_set["travel_time"].str.split(':')
df_train_set["travel_time"] = train_time_parts.apply(
    lambda hm: int(hm[0]) + (int(hm[1]) / 60)
)

In [None]:
# Number of rows recorded at each departure time.
# NOTE: this count is deliberately NOT written into "number_of_tickets".
# That column is the regression target (it is log-transformed in a later cell);
# replacing it with a per-time row count would train the model on the wrong
# quantity. The recorded outputs confirm the target kept its original values.
ticks = df_train_set.groupby("travel_time")["number_of_tickets"].count()

In [None]:
# Inspect the frame after encoding the categorical columns and converting travel_time.
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_day
0,2017-10-17,7.25,9,0,49,1.0,290
1,2017-11-19,7.2,9,0,49,1.0,323
2,2017-11-26,7.083333,4,0,49,1.0,330
3,2017-11-27,7.166667,1,0,49,5.0,331
4,2017-11-27,7.2,9,0,49,31.0,331


In [None]:
# Log-transform the target with log1p; predictions are mapped back with expm1
# when the error is computed and when the submission is built.
# Re-running this cell applies the transform a second time.
df_train_set['number_of_tickets'] = np.log1p(df_train_set['number_of_tickets'])

In [None]:
# Move travel_date into the index so it is no longer a regular column of the frame.
df_train_set = df_train_set.set_index('travel_date')
df_train_set.head()

Unnamed: 0_level_0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_day
travel_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-10-17,7.25,9,0,49,0.693147,290
2017-11-19,7.2,9,0,49,0.693147,323
2017-11-26,7.083333,4,0,49,0.693147,330
2017-11-27,7.166667,1,0,49,1.791759,331
2017-11-27,7.2,9,0,49,3.465736,331


In [None]:
# NOTE(review): this only builds a GroupBy object; it is neither aggregated nor
# assigned, so df_train_set is left unchanged by this cell.
df_train_set.groupby(['travel_from', 'travel_time', 'travel_day'], sort=False)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x080DD350>

In [None]:
# Create the period column as NaN; the following cell fills it with a time-of-day label.
df_train_set['period'] = np.nan

In [None]:
# Label each ride with a time-of-day bucket based on its departure hour:
# em: before 7, am: [7, 11), mid: [11, 15), eve: [15, 19), pm: [19, 24].
train_hours = df_train_set.travel_time
train_period_rules = [
    ('em', train_hours < 7),
    ('am', (train_hours >= 7) & (train_hours < 11)),
    ('mid', (train_hours >= 11) & (train_hours < 15)),
    ('eve', (train_hours >= 15) & (train_hours < 19)),
    ('pm', (train_hours >= 19) & (train_hours <= 24)),
]
for period_label, in_period in train_period_rules:
    df_train_set.loc[in_period, 'period'] = period_label

In [None]:
# Encode the period labels as integer codes and keep the label order.
period_cat = pd.Categorical(df_train_set["period"])
period_categories = period_cat.categories
df_train_set["period"] = period_cat.codes

In [None]:
# Number of training rows in each period, keyed by the integer period code.
tcount = dict(df_train_set["period"].value_counts())
tcount

{0: 4888, 1: 988, 3: 286, 2: 87}

In [None]:
# Frequency-encode the period: log1p of the number of rows sharing that period.
# Despite the column name, the count is per period bucket, not per clock hour.
df_train_set["hourly_travelers"] = np.log1p(df_train_set["period"].map(tcount))
df_train_set.head()

Unnamed: 0_level_0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_day,period,hourly_travelers
travel_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-10-17,7.25,9,0,49,0.693147,290,0,8.494743
2017-11-19,7.2,9,0,49,0.693147,323,0,8.494743
2017-11-26,7.083333,4,0,49,0.693147,330,0,8.494743
2017-11-27,7.166667,1,0,49,1.791759,331,0,8.494743
2017-11-27,7.2,9,0,49,3.465736,331,0,8.494743


In [None]:
# Frequency-encode the day of year: log1p of the number of rows on that day.
# tcount is rebound here, so the per-period counts from the earlier cell are
# no longer available under that name afterwards.
tcount = dict(df_train_set["travel_day"].value_counts())
df_train_set["daily_travelers"] = np.log1p(df_train_set["travel_day"].map(tcount))
df_train_set.head()

Unnamed: 0_level_0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_day,period,hourly_travelers,daily_travelers
travel_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-10-17,7.25,9,0,49,0.693147,290,0,8.494743,0.693147
2017-11-19,7.2,9,0,49,0.693147,323,0,8.494743,0.693147
2017-11-26,7.083333,4,0,49,0.693147,330,0,8.494743,0.693147
2017-11-27,7.166667,1,0,49,1.791759,331,0,8.494743,3.871201
2017-11-27,7.2,9,0,49,3.465736,331,0,8.494743,3.871201


In [None]:
# Sanity check: number of rows and columns after feature engineering.
df_train_set.shape

(6249, 9)

In [None]:
""" Separate the data into features and targets """
# targets stays a one-column DataFrame because target_var is a list.
target_var = ['number_of_tickets']
targets = df_train_set[target_var]
features = df_train_set.drop(columns=target_var)

In [None]:
# Hold out 20% of the rows for validation; random_state makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(features, targets, test_size=0.2, random_state=0)

# Random forest model

In [None]:
# Short aliases for the training split used by the model cells below.
X, y = train_X, train_y

In [None]:
#identifying the optimum max_leaf_nodes
def get_mae(max_leaf_nodes, X, val_X, y, val_y):
    """Fit a random forest with the given leaf limit and return its validation MAE.

    Parameters
    ----------
    max_leaf_nodes : int
        Value passed to RandomForestRegressor(max_leaf_nodes=...).
    X, y :
        Training features and log1p-transformed training target.
    val_X, val_y :
        Validation features and log1p-transformed validation target.

    Returns
    -------
    float
        Mean absolute error on the validation data, computed after mapping
        both target and predictions back to ticket counts with expm1.
    """
    # NOTE: recent scikit-learn releases spell this criterion 'absolute_error'.
    fmod = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=42, criterion='mae', n_estimators=1000, n_jobs=-1)
    # The targets arrive as one-column frames; flatten them so fit() receives
    # the 1-D target it expects instead of emitting a DataConversionWarning.
    fmod.fit(X, np.ravel(y))
    preds = fmod.predict(val_X)
    ma_e = mean_absolute_error(np.expm1(np.ravel(val_y)), np.expm1(preds))
    return ma_e

In [None]:
# Compare validation MAE across a handful of leaf-node limits.
leaf_node_candidates = [10, 100, 120, 140, 200, 400]
for max_leaf_nodes in leaf_node_candidates:
    my_mae = get_mae(max_leaf_nodes, X, val_X, y, val_y)
    print(f"Max_leaf_nodes: {max_leaf_nodes}, \t MAE: {my_mae}")

In [None]:
# Final model configuration.
# NOTE: recent scikit-learn releases spell the 'mae' criterion 'absolute_error'.
model = RandomForestRegressor(
    n_estimators=1500,
    criterion='mae',
    max_depth=100,
    max_leaf_nodes=900,
    min_samples_split=8,
    min_samples_leaf=3,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# y is a one-column DataFrame; flatten it so fit() receives a 1-D target
# rather than a column vector (which triggers a DataConversionWarning).
model.fit(X, np.ravel(y))

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=100,
           max_features='auto', max_leaf_nodes=900,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [None]:
# Despite the variable name, these are predictions for the validation split (val_X).
preds_train_set = model.predict(val_X)

In [None]:
# Validation MAE in ticket counts (both sides mapped back from log1p space).
val_mae = mean_absolute_error(np.expm1(val_y), np.expm1(preds_train_set))
print(val_mae)

3.273359441266504


# Predictions for test set

In [None]:
# Load the test data from a CSV file in the current working directory.
df_test_set = pd.read_csv('test_questions.csv')

In [None]:
# travel_to is not among the training features, so it is removed from the test frame.
df_test_set = df_test_set.drop(columns=['travel_to'])

In [None]:
# Convert travel_date to datetime and derive the day of the year.
# NOTE(review): the raw date format of the test file is not visible here. If it
# is day-first like the training file, ambiguous dates may be parsed month-first.
# Confirm the format and pass dayfirst/format explicitly if so.
df_test_set["travel_date"] = pd.to_datetime(df_test_set["travel_date"],infer_datetime_format=True)
df_test_set["travel_day"] = df_test_set["travel_date"].dt.dayofyear

In [None]:
# Encode car_type with the categories learned from the training data, so a
# given code means the same car type in both frames.
car_type_test_cat = pd.Categorical(df_test_set["car_type"], categories=car_type_categories)
df_test_set["car_type"] = car_type_test_cat.codes

In [None]:
# Encode travel_from with the categories learned from the training data, so a
# given code means the same origin in both frames.
travel_from_test_cat = pd.Categorical(df_test_set["travel_from"], categories=travel_from_categories)
df_test_set["travel_from"] = travel_from_test_cat.codes

In [None]:
# Express the "H:MM" departure time as fractional hours, as done for the training data.
test_time_parts = df_test_set["travel_time"].str.split(':')
df_test_set["travel_time"] = test_time_parts.apply(
    lambda hm: int(hm[0]) + int(hm[1]) / 60
)

In [None]:
# Move travel_date into the index, mirroring the training frame.
df_test_set = df_test_set.set_index('travel_date')
df_test_set.head()

Unnamed: 0_level_0,ride_id,travel_time,travel_from,car_type,max_capacity,travel_day
travel_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-05-07,247,7.1,7,0,49,127
2018-05-06,256,11.133333,7,1,11,126
2018-05-04,275,5.0,7,1,11,124
2018-05-04,285,9.166667,7,1,11,124
2018-05-04,286,9.333333,7,1,11,124


In [None]:
# NOTE(review): this only builds a GroupBy object; it is neither aggregated nor
# assigned, so df_test_set is left unchanged by this cell.
df_test_set.groupby(['travel_time', 'travel_from', 'travel_day'])

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0C9C1AB0>

In [None]:
# Create the period column as NaN; the following cell fills it with a time-of-day label.
df_test_set['period'] = np.nan

In [None]:
# Label each ride with a time-of-day bucket based on its departure hour:
# em: before 7, am: [7, 11), mid: [11, 15), eve: [15, 19), pm: [19, 24].
test_hours = df_test_set.travel_time
test_period_rules = [
    ('em', test_hours < 7),
    ('am', (test_hours >= 7) & (test_hours < 11)),
    ('mid', (test_hours >= 11) & (test_hours < 15)),
    ('eve', (test_hours >= 15) & (test_hours < 19)),
    ('pm', (test_hours >= 19) & (test_hours <= 24)),
]
for period_label, in_period in test_period_rules:
    df_test_set.loc[in_period, 'period'] = period_label

In [None]:
# Number of test rows in each period, keyed by the period label
# (the period column has not been converted to codes yet at this point).
pcount = dict(df_test_set['period'].value_counts())
pcount

{'am': 791, 'em': 145, 'pm': 123, 'mid': 52}

In [None]:
# Frequency-encode the period using counts taken from the test set itself.
# NOTE(review): the training column was built from training-set counts, so the
# two columns are on different scales. Confirm this is intended.
df_test_set['hourly_travelers'] = np.log1p(df_test_set['period'].map(pcount))

In [None]:
# Frequency-encode the day of year: log1p of the number of test rows on that day.
dcount = dict(df_test_set["travel_day"].value_counts())
df_test_set["daily_travelers"] = np.log1p(df_test_set["travel_day"].map(dcount))
df_test_set.head()

Unnamed: 0_level_0,ride_id,travel_time,travel_from,car_type,max_capacity,travel_day,period,hourly_travelers,daily_travelers
travel_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-05-07,247,7.1,7,0,49,127,am,6.674561,1.94591
2018-05-06,256,11.133333,7,1,11,126,mid,3.970292,2.564949
2018-05-04,275,5.0,7,1,11,124,em,4.983607,4.094345
2018-05-04,285,9.166667,7,1,11,124,am,6.674561,4.094345
2018-05-04,286,9.333333,7,1,11,124,am,6.674561,4.094345


In [None]:
# Encode period with the categories learned from the training data, exactly as
# is done for car_type and travel_from. Building the categories from the test
# labels would assign codes by whichever labels happen to be present here, so
# the same code could mean a different period than it did during training.
# Labels never seen in training get code -1. period_categories is left
# untouched so it keeps describing the training encoding.
df_test_set["period"] = pd.Categorical(df_test_set["period"], categories=period_categories)
df_test_set["period"] = df_test_set.period.cat.codes

Now let's calculate predictions using the random forest model we trained.

In [None]:
# Drop the identifier and select the columns in the order the model was fitted
# on. scikit-learn does not reorder columns by name, so a different order would
# either feed the wrong feature to each split or fail inside predict().
# Selecting by name also raises a KeyError here if a training feature is missing.
X_test = df_test_set.drop(['ride_id'], axis=1)[list(train_X.columns)]
test_set_predictions = model.predict(X_test)

And finally let's create a csv file with predictions. 

In [None]:
# Map predictions back from log1p space and round to whole tickets.
predicted_tickets = np.round(np.expm1(test_set_predictions), 0)
d = {'ride_id': df_test_set.ride_id, 'number_of_ticket': predicted_tickets}
df_predictions = pd.DataFrame(data=d)

In [None]:
# Preview the submission frame; it is still indexed by travel_date at this point.
df_predictions.head()

Unnamed: 0_level_0,ride_id,number_of_ticket
travel_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-05-07,247,3.0
2018-05-06,256,1.0
2018-05-04,275,1.0
2018-05-04,285,9.0
2018-05-04,286,9.0


In [None]:
# Save to a CSV file; index=False leaves the travel_date index out, so only
# ride_id and number_of_ticket are written.
df_predictions.to_csv('zindi_submission_file.csv', index=False) #save to csv file