# Simple Starter Notebook for:
## UmojaHack Africa 2021 #2: Sendy - Delivery Rider Response Challenge by UmojaHack Africa

Can you predict who is the best delivery rider for an order placed via logistics company Sendy?

![Umoja Hack](https://zindpublic.blob.core.windows.net/public/uploads/competition/image/152/thumb_c5ec4e2a-e000-4176-a93c-dd1143c2b60f.png)

The objective of this challenge is to create a machine learning model that will predict whether a rider will accept, decline or ignore an order sent to them.

In [91]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [83]:
# Read the four competition CSVs from the working directory, in this order:
# training orders, test orders, rider attributes, sample submission.
train, test, riders, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'Riders.csv', 'SampleSubmission.csv')
)

In [24]:
# (rows, columns) of every loaded frame, in load order
tuple(frame.shape for frame in (train, test, riders, ss))

((179867, 21), (76791, 20), (2632, 4), (76791, 2))

In [25]:
# Show the first five rows of the training orders
train.head(n=5)

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,...,rider_license_status,rider_carrier_type,rider_amount,rider_lat,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long,target
0,ID_SCUW21PVAU,4435,27,6,09:02:54,593630,Business,0,2,Bike,...,0,1,1080,-42.698343,-17.228539,-42.692371,-17.248305,-42.687442,-17.424682,1
1,ID_2HA7X30JMN,32711,30,7,13:01:37,837729,Personal,0,1,Bike,...,0,1,730,-42.787317,-17.288252,-42.784046,-17.290121,-42.673267,-17.234595,2
2,ID_IAJWDTBY6M,8712,14,2,10:01:00,695129,Personal,0,2,Bike,...,1,1,490,-42.74918,-17.287848,-42.765204,-17.293784,-42.813953,-17.294805,1
3,ID_LKSVPNYMTR,44869,22,3,14:11:16,1504660,Personal,0,2,Bike,...,1,1,510,-42.836266,-17.31192,-42.831913,-17.315311,-42.812409,-17.265441,2
4,ID_O7N8Y918YH,57590,27,5,16:11:38,36869,Business,0,2,Bike,...,0,0,400,-42.828195,-17.322818,-42.836056,-17.318111,-42.828517,-17.302052,0


In [26]:
# Show the first five rows of the test orders
test.head(n=5)

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,rider_id,rider_license_status,rider_carrier_type,rider_amount,rider_lat,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long
0,ID_3B4D2Q2DSI,28636,2,6,12:10:59,593630,Business,0,2,Bike,57224,1,1,450,-42.912776,-17.265738,-42.881861,-17.280865,-42.880476,-17.23024
1,ID_7MPWFJ9XFI,60465,11,4,17:10:03,2378775,Personal,1,2,Bike,106329,1,1,560,-42.831932,-17.28414,-42.829545,-17.279453,-42.85192,-17.314558
2,ID_5VVT3Q3M5B,25249,30,3,12:02:01,2121681,Personal,0,2,Bike,90551,0,0,450,-42.806437,-17.285166,-42.810915,-17.291086,-42.807165,-17.25386
3,ID_C2GTVS1H7K,21224,27,2,11:10:32,38801,Business,0,2,Bike,90482,1,1,440,-42.828448,-17.301909,-42.828517,-17.302052,-42.795516,-17.287921
4,ID_0YGC8V3PFT,58287,14,6,16:12:55,153111,Business,0,2,Bike,101959,1,1,500,-42.808912,-17.259955,-42.805154,-17.262939,-42.706475,-17.257454


In [27]:
# Show the first five rows of the rider attributes table
riders.head(n=5)

Unnamed: 0,Rider ID,Active Rider Age,Average Partner Rating,Number of Ratings
0,16261,308,21.05,321
1,8832,224,10.0,27
2,53866,238,17.76,25
3,46368,343,24.56,320
4,45609,399,14.97,214


In [84]:
# Attach rider attributes to every order in the train and test sets.
# A left join keeps every order even when its rider is missing from `riders`.
# validate='many_to_one' raises pandas.errors.MergeError if `riders` contains a
# duplicated 'Rider ID'; without it such a duplicate would silently duplicate
# order rows and leave `test` with more rows than the sample submission.
train = train.merge(
    riders,
    how='left',
    left_on='rider_id',
    right_on='Rider ID',
    validate='many_to_one',
)
test = test.merge(
    riders,
    how='left',
    left_on='rider_id',
    right_on='Rider ID',
    validate='many_to_one',
)

# Preview merged dataframe
train.head()

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,...,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long,target,Rider ID,Active Rider Age,Average Partner Rating,Number of Ratings
0,ID_SCUW21PVAU,4435,27,6,09:02:54,593630,Business,0,2,Bike,...,-17.228539,-42.692371,-17.248305,-42.687442,-17.424682,1,30153,11,10.0,1
1,ID_2HA7X30JMN,32711,30,7,13:01:37,837729,Personal,0,1,Bike,...,-17.288252,-42.784046,-17.290121,-42.673267,-17.234595,2,20884,68,24.13,229
2,ID_IAJWDTBY6M,8712,14,2,10:01:00,695129,Personal,0,2,Bike,...,-17.287848,-42.765204,-17.293784,-42.813953,-17.294805,1,33143,273,24.92,123
3,ID_LKSVPNYMTR,44869,22,3,14:11:16,1504660,Personal,0,2,Bike,...,-17.31192,-42.831913,-17.315311,-42.812409,-17.265441,2,96531,168,23.76,175
4,ID_O7N8Y918YH,57590,27,5,16:11:38,36869,Business,0,2,Bike,...,-17.322818,-42.836056,-17.318111,-42.828517,-17.302052,0,103546,95,24.53,42


In [85]:
# Inspect the merged training frame: column names, non-null counts and dtypes
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179867 entries, 0 to 179866
Data columns (total 25 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      179867 non-null  object 
 1   order_id                179867 non-null  int64  
 2   dispatch_day            179867 non-null  int64  
 3   dispatch_day_of_week    179867 non-null  int64  
 4   dispatch_time           179867 non-null  object 
 5   client_id               179867 non-null  int64  
 6   client_type             179867 non-null  object 
 7   order_license_status    179867 non-null  int64  
 8   order_carrier_type      179867 non-null  int64  
 9   vendor_type             179867 non-null  object 
 10  rider_id                179867 non-null  int64  
 11  rider_license_status    179867 non-null  int64  
 12  rider_carrier_type      179867 non-null  int64  
 13  rider_amount            179867 non-null  int64  
 14  rider_lat           

In [41]:
# Parse the dispatch_time strings as datetimes for inspection only. Nothing is
# converted to integers here: neither result is assigned, so `train` and `test`
# are left unchanged, and only the last expression (the parsed test column) is
# displayed as the cell output.
pd.to_datetime(train['dispatch_time'])
pd.to_datetime(test['dispatch_time'])

0       2021-03-27 12:10:59
1       2021-03-27 17:10:03
2       2021-03-27 12:02:01
3       2021-03-27 11:10:32
4       2021-03-27 16:12:55
                ...        
76786   2021-03-27 10:02:02
76787   2021-03-27 12:08:30
76788   2021-03-27 11:02:59
76789   2021-03-27 09:02:34
76790   2021-03-27 12:02:54
Name: dispatch_time, Length: 76791, dtype: datetime64[ns]

In [86]:
# Remove the raw dispatch_time string and the ID column from both frames
train = train.drop(columns=['dispatch_time', 'ID'])
test = test.drop(columns=['dispatch_time', 'ID'])

In [68]:
# Count how many training rows fall into each client_type category
train.loc[:, 'client_type'].value_counts()

Business    145902
Personal     33965
Name: client_type, dtype: int64

In [87]:
# One-hot encode the categorical columns of both frames.
train = pd.get_dummies(train, columns=['client_type', 'vendor_type'])
test = pd.get_dummies(test, columns=['client_type', 'vendor_type'])

# Each frame is encoded on its own, so a category present in only one of them
# would yield different dummy columns. Align test to train's feature columns
# (everything except the target): columns missing from test are added and
# filled with 0, columns unseen in train are dropped, and the column order
# follows train. When both frames already agree this leaves test unchanged.
test = test.reindex(columns=train.columns.drop('target'), fill_value=0)

In [70]:
# Display the target column of the training frame
train.loc[:, 'target']

0         1
1         2
2         1
3         2
4         0
         ..
179862    2
179863    0
179864    1
179865    0
179866    2
Name: target, Length: 179867, dtype: int64

In [88]:
# Features are every column except the target; labels are the target column
y = train.loc[:, 'target']
X = train.drop(columns='target')


In [89]:
# Hold out 20% of the training rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3031,
)

# Print the shape of each split, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(143893, 23)
(35974, 23)
(143893,)
(35974,)


In [50]:
from sklearn.ensemble import GradientBoostingClassifier

In [72]:
# Gradient boosting classifier with many shallow-ish trees and a small learning rate.
# max_features=5 makes each split consider a random subset of features, so the
# fit is stochastic; random_state pins it (same seed as the train/test split)
# so the reported score can be reproduced on a fresh run.
grb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=7,
    max_features=5,
    random_state=3031,
)

In [100]:
# Fit the gradient boosting classifier on the training split
grb.fit(X=X_train, y=y_train)

GradientBoostingClassifier(learning_rate=0.01, max_depth=7, max_features=5,
                           n_estimators=500)

In [92]:
# LightGBM with default hyper-parameters and a fixed seed, fitted on the training split
model = LGBMClassifier(random_state=3031).fit(X_train, y_train)
model

LGBMClassifier(random_state=3031)

In [8]:
# Feature-subset baseline: LightGBM trained without identifier-like columns.
#
# This cell deliberately uses its own variable names. It used to rebind X, y,
# X_train, X_test, y_train, y_test, model and y_pred with a reduced feature
# set, so when the notebook is run top to bottom the later cells that call
# model.predict(test) or grb.predict(X_test) would be given a different set of
# columns than those models were fitted on.

# Keep every column except identifiers, the target and the raw time string.
# Index.difference ignores names that are no longer present in `train`.
main_cols = train.columns.difference(
    ['ID', 'order_id', 'rider_id', 'Rider ID', 'target', 'dispatch_time', 'client_id']
).tolist()
X_main = train[main_cols]
y_main = train['target']
X_main_train, X_main_test, y_main_train, y_main_test = train_test_split(
    X_main, y_main, test_size=0.2, random_state=3031
)

# Train a model
main_model = LGBMClassifier(random_state=3031)
main_model.fit(X_main_train, y_main_train)

# Make predictions
y_main_pred = main_model.predict(X_main_test)

# Check score
accuracy_score(y_main_test, y_main_pred)

0.6144437649413466

In [93]:
# Predict classes for the hold-out split with the LightGBM model
y_pred = model.predict(X=X_test)

In [94]:
# Hold-out accuracy of the LightGBM predictions
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6255629065436148

In [96]:
# Make predictions on the test set and prepare the submission file
predictions = model.predict(test)
sub_file = ss.copy()
# Bracket assignment always writes the column. Attribute assignment
# (sub_file.target = ...) only works when 'target' already exists as a column;
# otherwise it sets a plain Python attribute and the CSV is written without
# the predictions.
sub_file['target'] = predictions
sub_file.to_csv('Baseline.csv', index=False)

In [97]:
from catboost import CatBoostClassifier
# CatBoost classifier: many iterations at a small learning rate
cat_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.01,
    max_depth=7,
)
# The hold-out split is passed as eval_set so its loss is reported alongside the
# training loss; verbose=200 logs every 200 iterations.
cat_model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    verbose=200,
)

0:	learn: 1.0958024	test: 1.0957679	best: 1.0957679 (0)	total: 495ms	remaining: 16m 29s
200:	learn: 0.9327506	test: 0.9318528	best: 0.9318528 (200)	total: 35.9s	remaining: 5m 21s
400:	learn: 0.9037376	test: 0.9039521	best: 0.9039521 (400)	total: 1m 9s	remaining: 4m 35s
600:	learn: 0.8866441	test: 0.8880620	best: 0.8880620 (600)	total: 1m 41s	remaining: 3m 56s
800:	learn: 0.8739270	test: 0.8765248	best: 0.8765248 (800)	total: 2m 14s	remaining: 3m 21s
1000:	learn: 0.8638343	test: 0.8677257	best: 0.8677257 (1000)	total: 2m 47s	remaining: 2m 46s
1200:	learn: 0.8553160	test: 0.8605034	best: 0.8605034 (1200)	total: 3m 25s	remaining: 2m 16s
1400:	learn: 0.8477892	test: 0.8543650	best: 0.8543650 (1400)	total: 4m 2s	remaining: 1m 43s
1600:	learn: 0.8412418	test: 0.8493518	best: 0.8493518 (1600)	total: 4m 40s	remaining: 1m 9s
1800:	learn: 0.8353864	test: 0.8450704	best: 0.8450704 (1800)	total: 5m 13s	remaining: 34.7s
1999:	learn: 0.8300985	test: 0.8414504	best: 0.8414504 (1999)	total: 5m 47s	rem

<catboost.core.CatBoostClassifier at 0x19421b90190>

In [98]:
# Predict classes for the hold-out split with the CatBoost model
catboost_pred = cat_model.predict(data=X_test)

In [99]:
# Hold-out accuracy of the CatBoost predictions
accuracy_score(y_true=y_test, y_pred=catboost_pred)

0.6182798687941291

In [101]:
# Predict classes for the hold-out split with the gradient boosting model
grb_pred = grb.predict(X=X_test)

In [102]:
# Hold-out accuracy of the gradient boosting predictions
accuracy_score(y_true=y_test, y_pred=grb_pred)

0.6142213821093012

In [103]:
# Predict the test set with the CatBoost model and write the submission file
predictions = cat_model.predict(test)
sub_file = ss.copy()
# Bracket assignment always writes the column. Attribute assignment
# (sub_file.target = ...) only works when 'target' already exists as a column;
# otherwise it sets a plain Python attribute and the CSV is written without
# the predictions.
sub_file['target'] = predictions
sub_file.to_csv('Catboost_Model.csv', index=False)

In [104]:
# Predict the test set with the gradient boosting model and write the submission file
predictions = grb.predict(test)
sub_file = ss.copy()
# Bracket assignment always writes the column. Attribute assignment
# (sub_file.target = ...) only works when 'target' already exists as a column;
# otherwise it sets a plain Python attribute and the CSV is written without
# the predictions.
sub_file['target'] = predictions
sub_file.to_csv('GRB_Model.csv', index=False)