# EDSA 2021: Sendy Logistics Challenge
Predict the estimated time of arrival (ETA) for motorbike deliveries in Nairobi

# Overview

### - Introduction
### - Importing libraries and data
### - Exploratory Data Analysis
### - Data Cleaning and Formatting
### - Feature Engineering
### - Train/Test Split
### - Modeling
### - Making Predictions
### - Submission

# Introduction

In this notebook we use historic delivery data to build a machine learning model that predicts the delivery time of a package, from the moment the rider picks it up to its arrival at the final destination. Accurate arrival-time predictions help businesses improve their logistics and communicate reliable delivery times to their customers.

# Importing Python libraries

In [1]:
!pip install lightgbm



In [2]:
!pip install xgboost



In [3]:
#Linear algebra
import numpy as np

#Data processing
import pandas as pd

#Date library
import datetime as dt

#Data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#Metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV 
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.metrics import mean_squared_error

#Algorithms
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

#import xgboost as xgb
import lightgbm as lgb

sns.set(style='white', context='notebook', palette='deep')

# Importing the dataset

In [4]:
#Train_Masked has extra columns: Delivery destination (day, month, time)

# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\T" and "\R".
train_df = pd.read_csv(r"D:\Temp\Train.csv")
test_df = pd.read_csv(r"D:\Temp\Test.csv")
rider_df = pd.read_csv(r"D:\Temp\Riders.csv")
variable_df = pd.read_csv('VariableDefinitions.csv')

# Report the shape of every frame that was just loaded.
print(train_df.shape, test_df.shape, rider_df.shape, variable_df.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Temp\\Train.csv'

# Exploratory Data Analysis

## Understanding the train and test datasets

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.describe()

In [None]:
test_df.describe()

### Check which cols are in train and not in test data

In [None]:
# Print the columns that exist only in the training data. The frames loaded
# by this notebook are named train_df and test_df.
test_columns = set(test_df.columns)
for col in train_df.columns:
    if col not in test_columns:
        print(col)

### Match the number of train columns to the number of test columns and also separate the target variable

In [None]:
# save the columns from the test dataframe into a new list called testcols
testcols = test_df.columns

# Restrict the train dataframe to the columns shared with the test data, but
# keep the target: later cells read 'Time from Pickup to Arrival' from train1.
target_col = 'Time from Pickup to Arrival'
train1 = train_df[[col for col in testcols if col != target_col] + [target_col]]

# Separate the target variable from the predictor variables
y = np.array(train_df[target_col]).reshape(-1, 1)

In [None]:
# The arrival columns are not available in the test data (pickup time + label
# = arrival time), so they are removed from the training frame.
arrival_columns = [
    'Arrival at Destination - Day of Month',
    'Arrival at Destination - Weekday (Mo = 1)',
    'Arrival at Destination - Time',
]
train_df = train_df.drop(columns=arrival_columns)

In [None]:
# Print the column index of train1 and of test_df one after the other so the
# two can be compared by eye.
print(train1.columns)
print(test_df.columns)

## Missing values

In [None]:
# Heat map of the null mask of train1 (train1.isnull()), drawn with a
# reversed cubehelix colour map on a 20x10 figure.
plt.figure(figsize=(20, 10))
cmap = sns.cubehelix_palette(as_cmap=True, reverse=True)
sns.heatmap(train1.isnull(), cmap=cmap)


In [None]:
train1.isnull().sum()

In [None]:
test_df.isnull().sum()

In [None]:
#Percentage of the missing values
def missing_values(df, threshold=0):
    """Return the share of missing values per feature as a one-column frame.

    Only features whose share of nulls is strictly greater than ``threshold``
    are listed. The result is indexed by feature name and has the single
    column '% of Missing values'; the values are fractions between 0 and 1.
    """
    null_share = df.isnull().mean()
    flagged = {name: share for name, share in null_share.items() if share > threshold}
    return pd.DataFrame(data=flagged, index=['% of Missing values']).T

In [None]:
missing_values(train1)

In [None]:
missing_values(test_df)

## Distribution

In [None]:
# Histogram (50 bins) of every numeric column in the training frame
numeric_train = train1.select_dtypes(include='number')
numeric_train.hist(bins=50, figsize=(20, 10))
plt.tight_layout()
plt.show()

In [None]:
# Column skewness of the training dataset. Only numeric columns are used:
# skewness cannot be computed for the string-valued columns.
train1.select_dtypes(include='number').skew().plot.bar(figsize=(10, 5))
plt.show()

In [None]:
## Univariate Analysis
# Kurtosis of the numeric columns of the training dataset
train1.select_dtypes(include='number').kurt().plot.bar(figsize=(10, 5))
plt.show()

In [None]:
# Pairwise correlation between the numeric columns of the training data;
# string columns are excluded because they cannot be correlated.
CorrMatrix_train = train1.select_dtypes(include='number').corr()
CorrMatrix_train.head(5)

In [None]:
style.use('ggplot')
# Plot the correlation matrix itself; calling .corr() on it again would show
# correlations between correlation profiles rather than between features.
cm = CorrMatrix_train
colormap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(20, 10))
# Correlation coefficients lie in [-1, 1], so pin the colour scale to that range.
sns.heatmap(cm, annot=True, cmap=colormap, cbar_kws={'shrink': .9},
            linewidths=0.5, vmin=-1.0, vmax=1.0, linecolor='white',
            annot_kws={'fontsize': 12})
plt.title('Pearson Correlation of Features', y=1.05, size=50)

### Personal or Business

In [None]:
train1['Personal or Business'].value_counts().plot(kind= 'bar' , figsize = [10,5])

### Platform type

In [None]:
train1['Platform Type'].value_counts().plot(kind= 'bar' , figsize = [10,5])

### Vehicle Type

In [None]:
train1['Vehicle Type'].value_counts().plot(kind= 'bar' , figsize = [10,5])

In [None]:
#Outliers Overview for numeric features
plt.figure(figsize=(20, 8))
sns.boxplot(train1['Temperature'])
plt.show()

In [None]:
# Numeric columns of the training data, selected through the public
# select_dtypes API instead of the private _get_numeric_data helper.
Numeric_Training = train1.select_dtypes(include='number')
Numeric_Training.head()

In [None]:
train1['Time from Pickup to Arrival'].value_counts().plot(kind= 'bar' , figsize = [10,5])

In [None]:
# Box plot of every numeric training column, laid out on a grid.
# `l` holds column names (not row values) so each entry can index train1.
l = train1.select_dtypes(include='number').columns.values
number_of_columns = 12
# Whole rows needed beyond the first one; "+ 1" below adds that first row.
number_of_rows = (len(l) - 1) // number_of_columns
plt.figure(figsize=(number_of_columns, 5 * (number_of_rows + 1)))
sns.set_style('whitegrid')
for i, column in enumerate(l):
    plt.subplot(number_of_rows + 1, number_of_columns, i + 1)
    sns.boxplot(y=train1[column], color='green')
plt.tight_layout()

In [None]:
# Distribution plot of every numeric training column, laid out on a grid.
# The grid variables are defined here because no visible earlier cell
# creates them.
nu_tra = train1.select_dtypes(include='number').columns.values
number_of_columns_Test = 12
number_of_rows_Test = (len(nu_tra) - 1) // number_of_columns_Test
plt.figure(figsize=(number_of_columns_Test, 8 * (number_of_rows_Test + 1)))
for i in range(0, len(nu_tra)):
    plt.subplot(number_of_rows_Test + 1, number_of_columns_Test, i + 1)
    chart = sns.distplot(train1[nu_tra[i]], kde=True)
    chart.set_xticklabels(chart.get_xticklabels(), rotation=-180,
                          horizontalalignment='right')
plt.show()

### Target

In [None]:
#Useful statistics about our target column

train1['Time from Pickup to Arrival'].describe()

In [None]:
train1

In [None]:
(train1['Time from Pickup to Arrival']/60).hist(bins=50)
plt.title("Delivery time distribution")
plt.xlabel("Delivery time (minutes)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Outliers overview for the numeric features of the test data. Only numeric
# columns are melted: a box plot cannot be drawn from string values.
plt.figure(figsize=(20, 8))
numeric_test = test_df.select_dtypes(include='number')
sns.boxplot(x="variable", y="value", data=pd.melt(numeric_test))
plt.show()

In [None]:
# Distribution plot of every numeric training column on a grid. The grid is
# rebuilt here from train1 so the cell does not depend on an undefined `df`.
dist_columns = train1.select_dtypes(include='number').columns.values
dist_grid_columns = 12
dist_grid_rows = (len(dist_columns) - 1) // dist_grid_columns + 1
plt.figure(figsize=(2 * dist_grid_columns, 5 * dist_grid_rows))
for i, column in enumerate(dist_columns):
    plt.subplot(dist_grid_rows, dist_grid_columns, i + 1)
    sns.distplot(train1[column], kde=True)

# Data Cleaning and Formatting

## Creating Full_df

In [None]:
# Build full_df by stacking train on top of test so that column-level changes
# are applied to both at once. Rows must not be shuffled, dropped or added:
# `border` marks where the training rows end and is used to split them again.
# Be careful of information leakage.

border = len(train1)
test_df['Time from Pickup to Arrival'] = np.nan
full_df = pd.concat([train1, test_df], axis=0, ignore_index=True)

train1.shape, test_df.shape, full_df.shape

### Renaming columns

In [None]:
# Shorter, space-free, standardised column names (original name -> new name)
new_names = {
    "Order No": "Order_No",
    "User Id": "User_Id",
    "Vehicle Type": "Vehicle_Type",
    "Personal or Business": "Personal_Business",
    "Placement - Day of Month": "Pla_Mon",
    "Placement - Weekday (Mo = 1)": "Pla_Weekday",
    "Placement - Time": "Pla_Time",
    "Confirmation - Day of Month": "Con_Day_Mon",
    "Confirmation - Weekday (Mo = 1)": "Con_Weekday",
    "Confirmation - Time": "Con_Time",
    "Arrival at Pickup - Day of Month": "Arr_Pic_Mon",
    "Arrival at Pickup - Weekday (Mo = 1)": "Arr_Pic_Weekday",
    "Arrival at Pickup - Time": "Arr_Pic_Time",
    "Platform Type": "Platform_Type",
    "Pickup - Day of Month": "Pickup_Mon",
    "Pickup - Weekday (Mo = 1)": "Pickup_Weekday",
    "Pickup - Time": "Pickup_Time",
    "Distance (KM)": "Distance(km)",
    "Precipitation in millimeters": "Precipitation(mm)",
    "Pickup Lat": "Pickup_Lat",
    "Pickup Long": "Pickup_Lon",
    "Destination Lat": "Destination_Lat",
    "Destination Long": "Destination_Lon",
    "Rider Id": "Rider_Id",
    "Time from Pickup to Arrival": "Time_Pic_Arr",
}

full_df = full_df.rename(columns=new_names)
full_df.columns

### Convert Time

In [None]:
#Convert Time from 12H to 24H

def convert_to_24hrs(fulldf):
    """Rewrite every column whose name ends in "Time" as 24-hour strings.

    Values are parsed as 12-hour clock times ('%I:%M:%S %p') and stored back
    as "%H:%M:%S" text. The frame is modified in place and also returned.
    """
    time_columns = [name for name in fulldf.columns if name.endswith("Time")]
    for name in time_columns:
        parsed = pd.to_datetime(fulldf[name], format='%I:%M:%S %p')
        fulldf[name] = parsed.dt.strftime("%H:%M:%S")
    return fulldf

full_df = convert_to_24hrs(full_df)

full_df[['Pla_Time', 'Con_Time' , 'Arr_Pic_Time', 'Pickup_Time']][3:6]


### Filling Missing Values

In [None]:
# Fill missing temperature and precipitation readings with the column mean.
# The result is assigned back (rather than calling fillna(inplace=True) on a
# column selection) so that full_df itself is guaranteed to be updated.
for weather_col in ['Temperature', 'Precipitation(mm)']:
    full_df[weather_col] = full_df[weather_col].fillna(full_df[weather_col].mean())

In [None]:
full_df.head()

### Traversing Month and Weekday

In [None]:
# The actual dates are not given. Check whether the day-of-month columns of a
# row (the columns renamed to end in "Mon") ever disagree with each other.

month_cols = [col for col in full_df.columns if col.endswith("Mon")]
weekday_cols = [col for col in full_df.columns if col.endswith("Weekday")]

# Rows whose day-of-month columns do not all hold the same value, found with a
# vectorised comparison instead of a Python-level loop over every row.
differs = full_df[month_cols].nunique(axis=1, dropna=False) > 1
count = int(differs.sum())
instances_of_different_days = full_df.loc[differs, month_cols].values.tolist()
print(count)
instances_of_different_days

In [None]:
month_cols

In [None]:
weekday_cols

### Creating Month and Weekday columns

In [None]:
# Copy the first day-of-month column and the first weekday column into
# single, generically named columns.
full_df['Day_of_Month'] = full_df[month_cols[0]]
full_df['Day_of_Week'] = full_df[weekday_cols[0]]

### Dropping redundant columns

In [None]:
# Every vehicle is a bike, so Vehicle_Type carries no information.
# Day and weekday values are repeated in all rows except 2, so the per-event
# copies are removed and only one of each is retained.
redundant_columns = [*month_cols, *weekday_cols, 'Vehicle_Type']
full_df.drop(columns=redundant_columns, inplace=True)

full_df.head(3)

In [None]:
full_df.head()

In [None]:
full_df.columns

### Variable Datatypes

In [None]:
# Sort every column of full_df into one of three groups by dtype and name.
numeric_cols = []   # numeric columns, with the target Time_Pic_Arr left out
object_cols = []    # remaining non-numeric columns
time_cols = []      # non-numeric columns whose name ends in "Time"
for k, v in full_df.dtypes.items():
    # is_numeric_dtype also rejects dedicated string dtypes, which a plain
    # comparison against `object` would misfile as numeric.
    if pd.api.types.is_numeric_dtype(v):
        if k != "Time_Pic_Arr":
            numeric_cols.append(k)
    elif k.endswith("Time"):
        time_cols.append(k)
    else:
        object_cols.append(k)

In [None]:
full_df[numeric_cols].head(3)

In [None]:
full_df[time_cols].head(3)

In [None]:
full_df[object_cols].head(3)

### Convert object types to numeric

In [None]:
# Encode the Personal_Business category as integer labels

le = LabelEncoder()
full_df['Personal_Business'] = le.fit_transform(full_df['Personal_Business'])
full_df['Personal_Business'][:2]


In [None]:
full_df.head()

# Feature Engineering

### Feature Selection

In [None]:
# Model inputs: every numeric column plus the encoded Personal_Business column
features = [*numeric_cols, 'Personal_Business']

data_df = full_df[features]

# Rows before `border` are the training rows; the rest are the test rows
train, test = data_df[:border], data_df[border:]
y = full_df[:border]['Time_Pic_Arr']

train.head()

In [None]:
print(full_df.shape,data_df.shape,train.shape,test.shape,y.shape)

In [None]:
train

In [None]:
test

In [None]:
y

# Train/Test split

In [None]:
# Hold out 20% of the training rows for validation. The seed is fixed (same
# value as the `rs` seed used for the models) so the split, and every score
# computed from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, shuffle=True, random_state=3)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Modeling

### Cross validation

In [None]:
rs = 3
kfold = KFold(n_splits=10, random_state=rs, shuffle=True)

# Candidate models paired with the short name shown in the results table, so
# the labels cannot drift out of step with the estimators.
# XGBoost is left out because its import is commented out in this notebook.
named_regressors = [
    ("SVR", SVR()),
    ("GBR", GradientBoostingRegressor(random_state=rs)),
    # `rs` is a random seed; passing it as n_estimators built only 3 trees.
    ("EXR", ExtraTreesRegressor(random_state=rs)),
    ("RFR", RandomForestRegressor(random_state=rs)),
    ("LGBM", lgb.LGBMRegressor(random_state=rs)),
]
regressors = [model for _, model in named_regressors]

# The scorer returns negated MSE (scores to be minimised are negated), so the
# sign is flipped before taking the root to obtain the per-fold RMSE.
cv_results = [
    np.sqrt(-cross_val_score(regressor, X_train, y=y_train,
                             scoring='neg_mean_squared_error', cv=kfold))
    for regressor in regressors
]

cv_means = [fold_rmse.mean() for fold_rmse in cv_results]
cv_stds = [fold_rmse.std() for fold_rmse in cv_results]

cv_res = pd.DataFrame({
    "Algorithm": [name for name, _ in named_regressors],
    "CrossValMeans": cv_means, "CrossValErrors": cv_stds,
})

cv_res = cv_res.sort_values("CrossValMeans", ascending=True)
print(cv_res)

# Making predictions

## Random Forest

In [None]:
# Random forest fitted through GridSearchCV with a single-point parameter
# grid, cross-validated on `kfold`.
RFC = RandomForestRegressor(random_state=rs)
rf_param = {"max_depth": [None], "max_features": [3], "min_samples_split": [10],
            "min_samples_leaf": [3], "n_estimators": [300]}
rsearch = GridSearchCV(RFC, cv=kfold, scoring='neg_mean_squared_error', param_grid=rf_param)
rfm = rsearch.fit(X_train, y_train)

# best_score_ is a negated MSE here; convert it to an RMSE.
r_score = np.sqrt(abs(rfm.best_score_))
r_params = rfm.best_params_
print(r_score, r_params)

In [None]:
# Predict with the fitted random forest search and write the result to
# "firstforest.csv".
# NOTE(review): this cell held R code (data.frame / write.csv) referring to
# PassengerId and Survived columns; the output file name is kept, and the
# column names are assumed from the LightGBM submission cells - confirm.
rf_y = rfm.predict(test)
rf_output = pd.DataFrame({"Order No": test_df['Order No'],
                          "Time from Pickup to Arrival": rf_y})
rf_output.to_csv("firstforest.csv", index=False)

In [None]:
# NOTE(review): `lgbm` is only assigned in the LightGBM cells further down, so
# this cell cannot run before them. It builds the same two-column frame as
# the final submission cell but writes it to a relative path.
lgbm_y = lgbm.predict(test, num_iteration=lgbm.best_iteration)
lgbm_output = pd.DataFrame({"Order No":test_df['Order No'], 
                           "Time from Pickup to Arrival": lgbm_y })
lgbm_output.to_csv("submission.csv", index=False)

In [None]:
y_pred = lr.predict(test_new)

In [None]:
# Copy the id column so the prediction column is added to an independent
# frame rather than to a slice of test_df1.
# NOTE(review): test_df1 is not defined in any visible cell - confirm its source.
submission_df = test_df1[['Order No']].copy()
submission_df['Time_Pic_Arr'] = y_pred

In [None]:
#submission_df.to_csv('D:/Temp/LRImproved.csv', index = False)

## Lightgbm

### Parameter Tuning

In [None]:
# Single-point search grid; the commented lists record other candidate values.
params = {
    'n_estimators': [75],          # [75, 95]
    'num_leaves': [15],            # [12, 15, 17]
    'reg_alpha': [0.02],           # [0.02, 0.05]
    'min_data_in_leaf': [300],     # [250, 280, 300]
    'learning_rate': [0.1],        # [0.05, 0.1, 0.25]
    'objective': ['regression'],   # ['regression', None]
}

lsearch = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=rs),
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=kfold,
)
lgbm = lsearch.fit(X_train, y_train)

# Best parameter set and its RMSE (root of the absolute best score)
l_params = lgbm.best_params_
l_score = np.sqrt(abs(lgbm.best_score_))
print(l_params, l_score)

## Training and making a prediction

In [None]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# The number of boosting rounds is given once, as num_boost_round below;
# keeping 'n_estimators' in the params as well makes the two settings compete.
lparams = {
    'learning_rate': 0.1, 'min_data_in_leaf': 300,
    'num_leaves': 20, 'random_state': rs,
    'objective': 'regression', 'reg_alpha': 0.02,
    'feature_fraction': 0.9, 'bagging_fraction': 0.9}

# Early stopping is requested through the callback API, which replaces the
# early_stopping_rounds keyword that newer LightGBM releases no longer accept.
lgbm = lgb.train(lparams, lgb_train, valid_sets=[lgb_eval], num_boost_round=75,
                 callbacks=[lgb.early_stopping(stopping_rounds=20)])

# Predict on the held-out split with the best iteration found.
lpred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)

print("The RMSE of prediction is ", mean_squared_error(y_test, lpred)**0.5)


# Submission

In [None]:
# Predict the test rows and pair each prediction with its order number.
lgbm_y = lgbm.predict(test, num_iteration=lgbm.best_iteration)
lgbm_output = pd.DataFrame({"Order No": test_df['Order No'],
                            "Time from Pickup to Arrival": lgbm_y})
# Raw string: "\T" and "\s" are not valid escapes, so keep the backslashes literal.
lgbm_output.to_csv(r"D:\Temp\submission.csv", index=False)