In [None]:
#Load all required packages
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.correlation import plot_corr
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [None]:
# Read the three CSV inputs (train, test, riders); in each file the first
# column is used as the DataFrame index.
train_df, test_df, riders_df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('Train_Zindi.csv', 'Test_Zindi.csv', 'Riders_Zindi.csv')
)

In [None]:
#Function to convert time rows to seconds after midnight
def time_fn(row):
    """Convert a 12-hour clock string to seconds after midnight.

    Parameters
    ----------
    row : str
        A time such as ``'9:35:46 AM'`` or ``'12:05:00 PM'``: colon-separated
        integer fields (hours, minutes, optionally seconds) followed by
        whitespace and an AM/PM marker. The marker is matched
        case-insensitively and surrounding/repeated whitespace is ignored.

    Returns
    -------
    int
        Number of seconds elapsed since midnight.

    Raises
    ------
    ValueError
        If the marker is neither AM nor PM, or a clock field is not an integer.
    IndexError
        If the marker or the minutes field is missing.
    """
    tokens = row.split()
    clock, meridiem = tokens[0], tokens[1].upper()
    if meridiem not in ('AM', 'PM'):
        # Previously every token other than the exact string 'AM' was
        # silently treated as PM, shifting e.g. '9:00:00 am' by 12 hours.
        raise ValueError("Unrecognised AM/PM marker in time: {!r}".format(row))

    fields = [int(part) for part in clock.split(':')]

    # 12-hour clock: PM adds 12 hours, except that hour 12 wraps around
    # (12 AM is hour 0, 12 PM is hour 12).
    hour_offset = 12 if meridiem == 'PM' else 0
    if fields[0] == 12:
        hour_offset -= 12

    # Conversion to seconds: hours * 3600, minutes * 60; any remaining field
    # (the seconds) is already in seconds.
    fields[0] = (fields[0] + hour_offset) * 60 * 60
    fields[1] = fields[1] * 60
    return sum(fields)

In [None]:
# Conversion of the clock-time columns to seconds past midnight.
# Each listed column is overwritten in place with the output of time_fn.
for time_col in ['Placement - Time',
                 'Confirmation - Time',
                 'Arrival at Pickup - Time',
                 'Pickup - Time',
                 'Arrival at Destination - Time']:
    train_df[time_col] = train_df[time_col].apply(time_fn)

In [None]:
# Remove rows whose values differ between the columns at positions 4, 7 and 16.
# NOTE(review): presumably these are the placement, confirmation and a later
# day-of-month column -- the columns are addressed by position, so confirm
# against the CSV header which days are actually being compared.
first_day = train_df.iloc[:, 4]
second_day = train_df.iloc[:, 7]
third_day = train_df.iloc[:, 16]
days_differ = (first_day != second_day) | (second_day != third_day)
train_df = train_df.drop(train_df[days_differ].index)

In [None]:
# Remove rows with anomalous (out-of-order) timestamps.
# A row is kept only when ALL of the following hold:
#   placement happens before confirmation,
#   arrival at pickup happens before pickup,
#   pickup happens before arrival at destination.
# The checks must be combined with `&`: with `|` a row survives as soon as any
# single pair is in order, so out-of-order rows are almost never removed.
# NOTE(review): the comparisons are strict, so rows where two of the compared
# times are equal are dropped as well -- confirm that this is intended.
in_time_order = (
    (train_df['Placement - Time'] < train_df['Confirmation - Time'])
    & (train_df['Arrival at Pickup - Time'] < train_df['Pickup - Time'])
    & (train_df['Pickup - Time'] < train_df['Arrival at Destination - Time'])
)
train_df = train_df[in_time_order]

In [None]:
# Histogram (20 bins) of the response, 'Time from Pickup to Arrival'
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(train_df['Time from Pickup to Arrival'], bins=20)
plt.show()

In [None]:
# Outlier removal based on the histogram: drop rows whose response is
# below 500 or above 5000 (cut-offs chosen arbitrarily so far).
response = train_df['Time from Pickup to Arrival']
is_outlier = (response < 500) | (response > 5000)
train_df = train_df.drop(train_df[is_outlier].index)

In [None]:
# Join train_df with the riders dataframe (currently disabled: the merge below is commented out)
# train_df = pd.merge(train_df,riders_df,how='left',on=['Rider Id','Rider Id'])

In [None]:
# Choose the model features and the model target.
# 'Arrival at Destination - Time' is deliberately not among the features.
feature_columns = [
    'Platform Type',
    'Personal or Business',
    'Placement - Day of Month',
    'Placement - Weekday (Mo = 1)',
    'Placement - Time',
    'Confirmation - Time',
    'Arrival at Pickup - Time',
    'Pickup - Time',
    'Distance (KM)',
    'Temperature',
    'Precipitation in millimeters',
    # Currently disabled features:
    # 'No_Of_Orders',
    # 'Age',
    # 'Average_Rating',
    # 'No_of_Ratings',
]
target_column = 'Time from Pickup to Arrival'

X = train_df.loc[:, feature_columns]
Y = train_df.loc[:, target_column]
# Y_arr = train_df.loc[:,'Arrival at Destination - Time']

In [None]:
# Fill missing values in the weather features:
#   'Temperature'                  -> column mean
#   'Precipitation in millimeters' -> constant 0
imputing_col = ['Temperature', 'Precipitation in millimeters']
imputer_avg = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_zero = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

# Only 'Temperature' is mean-imputed; every other listed column is zero-filled.
mean_imputed = {'Temperature': imputer_avg}
for col in imputing_col:
    chosen_imputer = mean_imputed.get(col, imputer_zero)
    X[col] = chosen_imputer.fit_transform(X[[col]])

In [None]:
# One-hot encode the columns that are treated as nominal categories;
# all remaining columns of X are carried over unchanged.
categorical_columns = [
    'Platform Type',
    'Personal or Business',
    'Placement - Weekday (Mo = 1)',
    'Placement - Day of Month',
]
OH_X = pd.get_dummies(data=X, columns=categorical_columns)

In [None]:
# Check for linearity: one scatter panel per encoded feature against Y.
# The grid is sized from the number of columns in OH_X. A fixed 9x3 grid only
# has 27 axes, and indexing past it raises IndexError once one-hot encoding
# produces more columns than that.
n_grid_cols = 3
n_features = len(OH_X.columns)
n_grid_rows = max(1, math.ceil(n_features / n_grid_cols))

# 98 / 9 inches per row keeps the panel height of the original 9-row figure.
fig, axs = plt.subplots(n_grid_rows, n_grid_cols,
                        figsize=(14, n_grid_rows * 98 / 9),
                        squeeze=False)
fig.subplots_adjust(hspace = 0.5, wspace=.2)
axs = axs.ravel()

# NOTE(review): Y is 'Time from Pickup to Arrival' as selected earlier in the
# notebook; the panel title wording is left unchanged here.
for index, column in enumerate(OH_X.columns):
    axs[index].set_title("{} vs. Ar at dest".format(column))
    axs[index].scatter(x=OH_X[column],y=Y,c='blue',edgecolor='k')

# Hide the panels left over when the column count is not a multiple of 3.
for unused_ax in axs[n_features:]:
    unused_ax.set_visible(False)

In [None]:
# Check for collinear variables: plot the pairwise correlation matrix
# of the encoded features.
corr = OH_X.corr()
fig, axz = plt.subplots(figsize=(14, 10))
fig = plot_corr(corr, xnames=corr.columns, ax=axz)

In [None]:
# Separate the data into training, validation and test sets.
# First hold out 20% as the test set, then take 12.5% of the remaining 80%
# (i.e. 10% of all rows) as the validation set; both splits use seed 42.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    OH_X, Y, test_size=0.2, random_state=42
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainval, Y_trainval, test_size=0.125, random_state=42
)

In [None]:
# Feature Scaling (Feature scale numeric data on training set)
# sc = StandardScaler()
# numeric_columns = ['Placement - Time',\
#                     'Confirmation - Time',\
#                     'Arrival at Pickup - Time',\
#                     'Pickup - Time',\
#                     'Distance (KM)',\
#                     'Temperature',\
# #                     'No_Of_Orders',\
# #                     'Age',\
# #                     'Average_Rating',\
# #                     'No_of_Ratings'
#                   ]
# X_train.loc[:,numeric_columns] = sc.fit_transform(X_train.loc[:,numeric_columns])

In [None]:
# Regression model: fit a LinearRegression on the training split
from sklearn.linear_model import LinearRegression

LM_model = LinearRegression()
LM_model.fit(X=X_train, y=Y_train)

In [None]:
# Predict the response for the held-out test features
Y_pred = LM_model.predict(X=X_test)

In [None]:
# Test-set error: mean squared error and its square root (RMSE)
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
rmse = math.sqrt(mse)

In [None]:
# Show the value of `mse` as this cell's output
mse

In [None]:
# Show the value of `rmse` as this cell's output
rmse

In [None]:
# Recompute the RMSE directly from the residuals
residuals = Y_test - Y_pred
math.sqrt((residuals ** 2).mean())