In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import itertools
import warnings

warnings.filterwarnings('ignore')

In [2]:
# test_df = pd.read_csv("bikerslogistics/test.csv")
df = pd.read_csv("bikerslogistics/train.csv")

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
User_ID,UserLogger_7277,UserLogger_1281,UserLogger_2566,UserLogger_11805,UserLogger_15199
Tranport_Vehicle,Motorcycle,Motorcycle,Motorcycle,Motorcycle,Motorcycle
Platform,P3,P3,P3,P3,P3
Purpose,Commercial,Commercial,Commercial,Commercial,Commercial
Order_Day_of_Month,27,25,12,28,12
Order_Week_of_Month,1,1,6,4,2
Time_of_Order,9:18:22 AM,12:07:04 PM,1:20:11 PM,11:13:53 AM,1:41:02 PM
Order_Confirm_Day_of_Month,27,25,12,28,12
Order_Confirm_Day_of_Weekday,1,1,6,4,2
Time_of_Confirmation,9:18:42 AM,12:08:36 PM,1:34:44 PM,11:14:07 AM,1:41:57 PM


### Data Cleaning
#### Drop Irrelevant columns

In [6]:
print(df.shape)
# df.head().T

(14840, 27)


In [7]:
## Drop duplicated columns
combinations = list(itertools.combinations(df.columns,2))
remove = []
for f1, f2 in combinations:
    if(f1 not in remove) & (f2 not in remove):
        if df[f1].equals(df[f2]):
            remove.append(f1)

print(remove)
df['dom'] = df['Order_Confirm_Day_of_Month']
df['wom'] = df['Order_Confirm_Day_of_Weekday']
df.drop(remove, axis=1, inplace=True)

['Order_Confirm_Day_of_Month', 'Order_Confirm_Day_of_Weekday', 'Arrival_At_Pickup_MonthDay', 'Arrival_At_Pickup_Weekday', 'Pickup_MonthDay', 'Pickup_WeekDay']


In [8]:
## Drop irrelevant and 0 variance columns
drp_list = ['User_ID', 'Tranport_Vehicle', 'Latitude_Pickup', 'Longitude_Pickup', 'Latitude_Destination', 'Longitude_Destination']
df.drop(columns=drp_list, inplace = True)

In [9]:
## fill Missing values
df['Temperation'] = df['Temperation'].fillna(round(df['Temperation'].mean(), 1))

#-> However on Precipitation, I'm guessing if the record was not collected then it meant it was of low quantity, it did not rain at all. So I will make this into a factor - Raining or not.datetime
def cat_rain(value):
    if value > 7.9:
        return 1 # Raining
    else:
        return 0 # Not Raining

df['Precipitation_in_millimeters_CAT'] = df.apply(lambda df: cat_rain(df['Precipitation_in_millimeters']), axis = 1)
df.drop(columns = 'Precipitation_in_millimeters', inplace=True)


In [10]:
df['Purpose'].value_counts()

Commercial    12144
Personal       2696
Name: Purpose, dtype: int64

In [12]:
lst = ['Time_of_Order', 'Time_of_Confirmation',	'Arrival_at_Pickup_Time', 'Pickup_Time', 'Delivery_Time']

for i in lst:
    j = i + '_in_hours'
    df[j] = df[i]
    df.drop(columns = i, inplace = True)
    df[j] = pd.to_datetime(df[j]) 
    ## corce to hours past midnight
    df[j] = df[j].apply(lambda x: round((((x.value -1628122036000000000)/ 10**9)/60),2) )


In [13]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
df['Purpose_CAT'] = LE.fit_transform(df['Purpose']) 
df['Platform_CAT'] = LE.fit_transform(df['Platform'])

df.drop(columns = ['Purpose', 'Platform'], inplace = True)

In [14]:
new_Columns_list = ['Order_Day_of_Month', 'Order_Week_of_Month', 'Delivery_MonthDay',
       'Delivery_Weekday', 'DistanceCovered_KM', 'Temperation',
       'Precipitation_in_millimeters_CAT', 'Time_of_Order_in_hours',
       'Time_of_Confirmation_in_hours', 'Arrival_at_Pickup_Time_in_hours',
       'Pickup_Time_in_hours', 'Delivery_Time_in_hours', 'Purpose_CAT',
       'Platform_CAT',
       'Time_Elapsed_from_Pickup_to_Delivery_in_Min']

df = df[new_Columns_list]

In [15]:
df.head().T

Unnamed: 0,0,1,2,3,4
Order_Day_of_Month,27.0,25.0,12.0,28.0,12.0
Order_Week_of_Month,1.0,1.0,6.0,4.0,2.0
Delivery_MonthDay,27.0,25.0,12.0,28.0,12.0
Delivery_Weekday,1.0,1.0,6.0,4.0,2.0
DistanceCovered_KM,6.0,9.0,9.0,9.0,7.0
Temperation,20.9,29.9,28.5,26.8,30.1
Precipitation_in_millimeters_CAT,0.0,0.0,0.0,0.0,0.0
Time_of_Order_in_hours,1991.1,2159.8,2232.92,2106.62,2253.77
Time_of_Confirmation_in_hours,1991.43,2161.33,2247.47,2106.85,2254.68
Arrival_at_Pickup_Time_in_hours,2019.12,2194.0,2267.58,2118.27,2313.55


## Build Model

In [16]:
## Import Libraries

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [17]:
X = df.drop(columns='Time_Elapsed_from_Pickup_to_Delivery_in_Min').values
y = df['Time_Elapsed_from_Pickup_to_Delivery_in_Min'].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=10)

print('Size of x_train = ', X_train.shape)
print('Size of x_test  = ', X_test.shape)
print('Size of y_train = ', y_train.shape)
print('Size of y_test  = ', y_test.shape)

Size of x_train =  (11130, 14)
Size of x_test  =  (3710, 14)
Size of y_train =  (11130,)
Size of y_test  =  (3710,)


In [18]:
## Normalizing: Feature Scaling
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [19]:
## Model Selection 
# -> To save computing time two models thatdid not perform well were commented 

models = []
models.append(('LR', LinearRegression()))
# models.append(('RF', RandomForestRegressor())) 
models.append(('RDG', Ridge()))
# models.append(('GBR', GradientBoostingRegressor()))

Due to the numeric structure of our data the accuracy would be evaluated using the __Root Mean Squared Error__ [RMSE] which produces the standard deviation of Predicted values from the Auctual Values, basically the lower this score the better.

In [20]:
## Model Evaluation
results = []
names = []
for name, model in models:
    fit_model = model.fit(X_train, y_train)
    y_pred = fit_model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    ## Cross Validation
    cv = cross_val_score(model, X, y, cv=7)

    results.append((r2_score, rmse))
    names.append(name)
    print()
    print(cv)
    print()
    print('{}:R2 {} - RMSE {}'.format(name, round(r2, 3), rmse))



[0.99999992 0.99999992 0.99999991 0.99999992 0.99999992 0.99999991
 0.99999992]

LR:R2 1.0 - RMSE 0.004719740406735618

[0.99999992 0.99999992 0.99999991 0.99999992 0.99999992 0.99999991
 0.99999992]

RDG:R2 1.0 - RMSE 0.3079498416861402


In [21]:
### Hyper Parameter Tuning
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [3, 2, 1.5, 1, .75],
    'normalize': [True, False],
    'max_iter': [500, 3000, 1000],
    'tol': [3, 2, 1.5, 1, .75],
    'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

RDG = Ridge()
RDG_cv = GridSearchCV(estimator=RDG, param_grid=param_grid, verbose=0)
RDG_cv.fit(X_train, y_train)


GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [3, 2, 1.5, 1, 0.75],
                         'max_iter': [500, 3000, 1000],
                         'normalize': [True, False],
                         'solver': ['auto', 'svd', 'cholesky', 'lsqr',
                                    'sparse_cg', 'sag', 'saga'],
                         'tol': [3, 2, 1.5, 1, 0.75]})

In [22]:
RDG_params = RDG_cv.best_params_
print(RDG_params)

{'alpha': 0.75, 'max_iter': 500, 'normalize': False, 'solver': 'svd', 'tol': 3}


## Model Evaluation

In [23]:
## Model Evaluation Function
def evaluate_model(model, X_test, y_test, modelName, DataImb):
    print('------------------------------------------------')
    print("Model ", modelName, end="\n")
    print("Data Balancing Type ", DataImb)
    ### Model must be ran outside the function
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print("R2 Score", r2)
    print("RMSE", rmse)
    return[modelName, DataImb, r2, rmse]

In [24]:
# model = Ridge(alpha = RDG_params['alpha'],
#                 max_iter = RDG_params['max_iter'],
#                 normalize = RDG_params['normalize'],
#                 solver = RDG_params['solver'],
#                 tol = RDG_params['tol'])

# #-> Fitting Model
# model.fit(X_train, y_train)
# #=> Evaluate Model
# evaluate_model(model, X_test, y_test, 'Tuned Ridge', "Auctual Data")


------------------------------------------------
Model  Ridge
Data Balancing Type  Auctual Data
R2 Score 0.9997809926347809
RMSE 0.23259080092227752


['Ridge', 'Auctual Data', 0.9997809926347809, 0.23259080092227752]

In [25]:
# model_2 = Ridge()
# model_2.fit(X_train, y_train)
# evaluate_model(model_2, X_test, y_test, 'Ordinary Ridge Regressor', "Auctual Data")

------------------------------------------------
Model  Ridge Regressor
Data Balancing Type  Auctual Data
R2 Score 0.9996160862892679
RMSE 0.3079498416861402


['Ridge Regressor', 'Auctual Data', 0.9996160862892679, 0.3079498416861402]

In [26]:
param_grid = {
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'copy_X': [True, False],
    'n_jobs': [True, False],
    'positive': [True, False]
}

LR = LinearRegression()
LR_cv = GridSearchCV(estimator=LR, param_grid=param_grid, verbose=1)
LR_cv.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


GridSearchCV(estimator=LinearRegression(),
             param_grid={'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'n_jobs': [True, False], 'normalize': [True, False],
                         'positive': [True, False]},
             verbose=1)

In [27]:
LR_params = LR_cv.best_params_
print(LR_params)

### Pipeline
# from sklearn.pipeline import make_pipeline
LR_model = LinearRegression(fit_intercept = LR_params['fit_intercept'],
                            normalize = LR_params['normalize'],
                            copy_X = LR_params['copy_X'],
                            n_jobs = LR_params['n_jobs'],
                            positive = LR_params['positive'])
#=> Fitting Model
LR_model.fit(X_train, y_train)
#=> Evaluate Model
evaluate_model(LR_model, X_test, y_test, 'Tuned_Linear Regressor', "Auctual Data")

{'copy_X': True, 'fit_intercept': True, 'n_jobs': True, 'normalize': False, 'positive': False}
------------------------------------------------
Model  Linear Regressor
Data Balancing Type  Auctual Data
R2 Score 0.9999999098200736
RMSE 0.004719740406735618


['Linear Regressor', 'Auctual Data', 0.9999999098200736, 0.004719740406735618]

In [28]:
model_4 = LinearRegression()
model_4.fit(X_train, y_train)
evaluate_model(model_4, X_test, y_test, 'ORDINARY_LR', "Auctual Data")

------------------------------------------------
Model  LR
Data Balancing Type  Auctual Data
R2 Score 0.9999999098200736
RMSE 0.004719740406735618


['LR', 'Auctual Data', 0.9999999098200736, 0.004719740406735618]

The result from Linear and Ridge Regression are astounding, let's see how they can be ensembled.

I have tuned and ensembled several models - the tuned Simple Linear Regression performs best. 

Now the model would be saved/pickled for reference

In [29]:
#=> Save/Pickle Model
import pickle

model_name = "model.pk_dsn_0805"
pickle.dump(model, open(model_name, "wb"))

## Prepping Submission Work
1. Checking the model's accuracy on the validation/test set
1. Saving the result for submission

In [36]:
## Validation

df_2 = pd.read_csv("bikerslogistics/test.csv")

data_id = df_2['User_ID']

## Functions
def cat_rain(value):
    if value > 7.9:
        return 1 # Raining
    else:
        return 0 # Not Raining

df_2['Precipitation_in_millimeters_CAT'] = df_2.apply(lambda df_2: cat_rain(df_2['Precipitation_in_millimeters']), axis=1)
df_2.drop(columns = 'Precipitation_in_millimeters', inplace = True)

def data_preproc(df_2):
    df_2['Temperation'] = df_2['Temperation'].fillna(round(df_2['Temperation'].mean(), 1))

    lst = ['Time_of_Order', 'Time_of_Confirmation',	'Arrival_at_Pickup_Time', 'Pickup_Time', 'Delivery_Time']

    for i in lst:
        j = i + '_in_hours'
        df_2[j] = df_2[i]
        df_2.drop(columns = i, inplace = True)
        df_2[j] = pd.to_datetime(df_2[j]) 
        ## corce to hours past midnight
        df_2[j] = df_2[j].apply(lambda x: round((((x.value -1628122036000000000)/ 10**9)/3600),2) )

    LE = LabelEncoder()
    df_2['Purpose_CAT'] = LE.fit_transform(df_2['Purpose'])
    df_2['Platform_CAT'] = LE.fit_transform(df_2['Platform'])


    drp_list = ['User_ID', 'Tranport_Vehicle',
                    'Latitude_Pickup', 'Longitude_Pickup',
                    'Latitude_Destination', 'Longitude_Destination',
                    'Order_Confirm_Day_of_Month', 'Order_Confirm_Day_of_Weekday',
                    'Arrival_At_Pickup_MonthDay', 'Arrival_At_Pickup_Weekday',
                    'Pickup_MonthDay', 'Pickup_WeekDay',
                    'Purpose', 'Platform']


    new_Columns_list = ['Order_Day_of_Month', 'Order_Week_of_Month', 'Delivery_MonthDay',
                            'Delivery_Weekday', 'DistanceCovered_KM', 'Temperation',
                            'Precipitation_in_millimeters_CAT', 'Time_of_Order_in_hours',
                            'Time_of_Confirmation_in_hours', 'Arrival_at_Pickup_Time_in_hours',
                            'Pickup_Time_in_hours', 'Delivery_Time_in_hours', 'Purpose_CAT',
                            'Platform_CAT']


    df_2.drop(columns=drp_list, inplace=True)

    df_2 = df_2[new_Columns_list]

data_preproc(df_2)

In [37]:
sc = StandardScaler()
df_2 = sc.fit_transform(df_2)

In [38]:
predict = LR_model.predict(df_2)

In [39]:
index_list = []
for i in data_id:
    index_list.append(i)

submission_dta = pd.DataFrame(list(zip(index_list, predict)), columns = ['User_ID','Time_Elapsed_from_Pickup_to_Delivery_in_Min'])

submission_dta['Time_Elapsed_from_Pickup_to_Delivery_in_Min'] = round(submission_dta['Time_Elapsed_from_Pickup_to_Delivery_in_Min'], 2)

submission_dta.head()

Unnamed: 0,User_ID,Time_Elapsed_from_Pickup_to_Delivery_in_Min
0,UserLogger_6224,35.05
1,UserLogger_16205,28.98
2,UserLogger_17099,0.28
3,UserLogger_9892,56.37
4,UserLogger_8487,15.62


In [40]:
submission_dta.to_csv("submission/submission_AMAH_AD.csv", index=False)

In [41]:
submission_dta.head()

Unnamed: 0,User_ID,Time_Elapsed_from_Pickup_to_Delivery_in_Min
0,UserLogger_6224,35.05
1,UserLogger_16205,28.98
2,UserLogger_17099,0.28
3,UserLogger_9892,56.37
4,UserLogger_8487,15.62


In [42]:
submission_dta.shape

(6361, 2)

In [None]:
## COMPLIMENTARY

This work has been productionize and a real-time application can be found on this [webApp](https://delivery-stuber-dsn.herokuapp.com/)