In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error


In [2]:
%matplotlib inline  
pd.set_option('display.max_columns', None)


In [3]:
train = pd.read_csv("../data/Train.csv")
test = pd.read_csv("../data/Test.csv")

In [4]:
for time_column in ['Placement - Time', 'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time', 'Arrival at Destination - Time']:
    train.loc[:, time_column] = pd.to_datetime(train.loc[:, time_column]).dt.strftime('%H:%M:%S')
for time_column in ['Placement - Time', 'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time']:
    test.loc[:, time_column] = pd.to_datetime(test.loc[:, time_column]).dt.strftime('%H:%M:%S')

In [5]:
train.loc[:, "Day of Month"] = train["Arrival at Pickup - Day of Month"]
train.loc[:, "Weekday (Mo = 1)"] = train["Arrival at Pickup - Weekday (Mo = 1)"]
train.drop(['Placement - Day of Month', 'Placement - Weekday (Mo = 1)', 'Confirmation - Day of Month',
            'Confirmation - Weekday (Mo = 1)', 'Arrival at Pickup - Day of Month',
            'Arrival at Pickup - Weekday (Mo = 1)', 'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)',
           'Arrival at Destination - Day of Month', 'Arrival at Destination - Weekday (Mo = 1)'], inplace=True, axis=1)
test.loc[:, "Day of Month"] = test["Arrival at Pickup - Day of Month"]
test.loc[:, "Weekday (Mo = 1)"] = test["Arrival at Pickup - Weekday (Mo = 1)"]
test.drop(['Placement - Day of Month', 'Placement - Weekday (Mo = 1)', 'Confirmation - Day of Month',
            'Confirmation - Weekday (Mo = 1)', 'Arrival at Pickup - Day of Month',
            'Arrival at Pickup - Weekday (Mo = 1)', 'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)'], inplace=True, axis=1)


In [9]:
[i for i in train.columns if i not in test.columns]

['Arrival at Destination - Time', 'Time from Pickup to Arrival', 'avgSpeed']

I am going to use different training targets to hopefully form a decent ensemble at the end of the day

In [10]:
train.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Time,Confirmation - Time,Arrival at Pickup - Time,Pickup - Time,Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival,Day of Month,Weekday (Mo = 1),avgSpeed
0,Order_No_4211,User_Id_633,Bike,3,Business,09:35:46,09:40:10,10:04:47,10:27:30,10:39:55,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745,9,5,19.328859
1,Order_No_25375,User_Id_2285,Bike,3,Personal,11:16:16,11:23:21,11:40:22,11:44:09,12:17:22,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993,12,5,28.901154
2,Order_No_1899,User_Id_265,Bike,3,Business,12:39:25,12:42:44,12:49:34,12:53:03,13:00:38,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455,30,2,23.736264
3,Order_No_9336,User_Id_1402,Bike,3,Business,09:25:34,09:26:05,09:37:56,09:43:06,10:05:27,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341,15,5,24.161074
4,Order_No_27883,User_Id_1737,Bike,1,Personal,09:55:18,09:56:18,10:03:53,10:05:23,10:25:37,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214,13,1,26.688633


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201 entries, 0 to 21200
Data columns (total 22 columns):
Order No                         21201 non-null object
User Id                          21201 non-null object
Vehicle Type                     21201 non-null object
Platform Type                    21201 non-null int64
Personal or Business             21201 non-null object
Placement - Time                 21201 non-null object
Confirmation - Time              21201 non-null object
Arrival at Pickup - Time         21201 non-null object
Pickup - Time                    21201 non-null object
Arrival at Destination - Time    21201 non-null object
Distance (KM)                    21201 non-null int64
Temperature                      16835 non-null float64
Precipitation in millimeters     552 non-null float64
Pickup Lat                       21201 non-null float64
Pickup Long                      21201 non-null float64
Destination Lat                  21201 non-null float64
Destinatio

In [12]:
train.loc[:, 'Platform Type'] = train.loc[:, 'Platform Type'].astype('object')
train.loc[:, 'Weekday (Mo = 1)'] = train['Weekday (Mo = 1)'].astype('object')
train.loc[:, 'Day of Month'] = train['Day of Month'].astype('object')

In [13]:
# train.drop(['Order No', 'User Id', 'Temperature', 'Precipitation in millimeters', 'Vehicle Type'], axis=1, inplace=True)

Trying minimal features

In [14]:
selected_columns = ['Distance (KM)', 'Pickup - Time', 'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long', 'Day of Month', 'Weekday (Mo = 1)' ]
train_x = train.loc[:,selected_columns]

In [15]:
train_duration = train.loc[:, 'Time from Pickup to Arrival']
train_end_time = train.loc[:, 'Arrival at Destination - Time']

In [16]:
s = (train_x.dtypes == 'object')
object_cols = list(s[s].index)
del s

In [17]:
train_x, val_x, train_duration, val_duration, train_end_time, val_end_time = train_test_split(train_x, train_duration, train_end_time, random_state = 42)

In [21]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_x[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_x[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = train_x.index
OH_cols_valid.index = val_x.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_x.drop(object_cols, axis=1)
num_X_valid = val_x.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


In [27]:
random_forest_regressor = RandomForestRegressor(n_estimators=10)

In [28]:
random_forest_regressor.fit(OH_X_train, train_duration)

random_forest_regressor.score(OH_X_valid, val_duration)

0.31440075785572585

In [29]:
train.loc[:, 'avgSpeed'] =train['Distance (KM)'] / (train['Time from Pickup to Arrival'] / 3600 ) 

In [34]:
fast_train = train[train.avgSpeed > 60]
useful_train = train.drop(fast_train.index)

In [35]:
selected_columns = ['Distance (KM)', 'Pickup - Time', 'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long', 'Day of Month', 'Weekday (Mo = 1)' ]
train_x = useful_train.loc[:,selected_columns]

train_duration = useful_train.loc[:, 'Time from Pickup to Arrival']
train_end_time = useful_train.loc[:, 'Arrival at Destination - Time']

s = (train_x.dtypes == 'object')
object_cols = list(s[s].index)
del s

train_x, val_x, train_duration, val_duration, train_end_time, val_end_time = train_test_split(train_x, train_duration, train_end_time, random_state = 42)

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_x[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_x[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = train_x.index
OH_cols_valid.index = val_x.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_x.drop(object_cols, axis=1)
num_X_valid = val_x.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)




MemoryError: Unable to allocate array with shape (12161, 5028) and data type float64

In [40]:
random_forest_regressor = RandomForestRegressor(n_estimators=10)

random_forest_regressor.fit(OH_X_train, train_duration)

random_forest_regressor.score(OH_X_valid, val_duration)

0.328144462005191

In [42]:
from sklearn import linear_model

In [43]:
ridge_regressor = linear_model.Ridge()

In [44]:
ridge_regressor.fit(OH_X_train, train_duration)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [46]:
ridge_regressor.score(OH_X_valid, val_duration)

0.31190025723025216