In [3]:
#importing packages and data
import numpy as np
import pandas as pd
import geopy.distance
# Event timestamp columns are parsed to datetime while reading the file.
timestamp_cols = ['request_ts', 'accept_ts', 'pickup_ts', 'dropoff_ts', 'cancel_ts']
df = pd.read_csv('rides.csv', parse_dates=timestamp_cols)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,ride_id,user_id,driver_id,request_ts,accept_ts,pickup_location,dropoff_location,pickup_ts,dropoff_ts,cancel_ts,status,statuscom
0,3000023,106891,105286.0,2021-05-27 19:38:00,2021-05-27 19:40:00,40.6851859 -73.99472165,40.83142658 -73.91271123,2021-05-27 19:48:00,2021-05-27 21:10:00,NaT,completed,completed
1,3000024,116375,,2021-12-05 00:02:00,NaT,40.81098464 -74.11502434,40.80982049 -73.80320195,NaT,NaT,2021-12-05 00:15:00,requested,completed
2,3000025,104571,109087.0,2021-07-09 09:06:00,2021-07-09 09:16:00,40.84414807 -73.84599412,40.8662361 -73.97788948,2021-07-09 09:28:00,2021-07-09 09:55:00,NaT,completed,completed
3,3000026,109497,,2021-07-19 17:03:00,NaT,40.6581083 -73.90199317,40.7820038 -74.1057497,NaT,NaT,2021-07-19 17:08:00,requested,completed
4,3000288,116687,,2021-12-12 08:57:00,NaT,40.76639545 -73.877075,40.67157145 -73.88681784,NaT,NaT,2021-12-12 09:05:00,requested,completed


In [5]:
# Column dtypes and non-null counts, to see which fields have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385477 entries, 0 to 385476
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ride_id           385477 non-null  int64         
 1   user_id           385477 non-null  int64         
 2   driver_id         248379 non-null  float64       
 3   request_ts        385477 non-null  datetime64[ns]
 4   accept_ts         248379 non-null  datetime64[ns]
 5   pickup_location   385477 non-null  object        
 6   dropoff_location  385477 non-null  object        
 7   pickup_ts         223652 non-null  datetime64[ns]
 8   dropoff_ts        223652 non-null  datetime64[ns]
 9   cancel_ts         161825 non-null  datetime64[ns]
 10  status            385477 non-null  object        
 11  statuscom         385477 non-null  object        
dtypes: datetime64[ns](5), float64(1), int64(2), object(4)
memory usage: 35.3+ MB


In [6]:
#encode the target variable: 1.0 when the request has an accept timestamp, 0.0 otherwise
# Cast to float so the labels stay 0.0 / 1.0.
df['accepted'] = df['accept_ts'].notna().astype('float64')

In [7]:
#split the "lat long" strings into latitude and longitude and convert to float
# Split on whitespace only. The previous ' -' separator consumed the minus sign of
# the longitude (so western longitudes were stored as positive numbers) and would
# fail for any pair whose second value is not negative or whose latitude is negative.
for prefix in ('pickup', 'dropoff'):
    coords = df[f'{prefix}_location'].str.split(expand=True).astype('float')
    df[[f'{prefix}_lat', f'{prefix}_long']] = coords

In [8]:
#geodesic distance in km between the pickup and dropoff point of every ride
pickup_points = zip(df['pickup_lat'], df['pickup_long'])
dropoff_points = zip(df['dropoff_lat'], df['dropoff_long'])
distance = [
    geopy.distance.geodesic(start, end).km
    for start, end in zip(pickup_points, dropoff_points)
]
df['distance'] = distance

In [9]:
#calendar features derived from the request timestamp
request_time = df['request_ts'].dt
df['month'] = request_time.month
df['day_of_week'] = request_time.day_of_week  # Monday=0 ... Sunday=6
df['hour'] = request_time.hour

In [10]:
#drop the raw columns that are not used as features, then separate target (y) from predictors (X)
unused_columns = [
    'driver_id', 'accept_ts', 'pickup_ts', 'dropoff_ts', 'cancel_ts', 'request_ts',
    'status', 'statuscom', 'pickup_location', 'dropoff_location',
]
df_clean = df.drop(columns=unused_columns)
y = df_clean['accepted']
X = df_clean.drop(columns='accepted')

In [11]:
#hold out 20% of the data for testing, splitting by user_id: users have repeated
#observations, so all rides of one user must land on the same side to avoid leakage
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(n_splits=1, test_size=.20, random_state=8)
# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices.
train_inds, test_inds = next(splitter.split(X, y, groups=X['user_id']))

X_train, X_test = X.iloc[train_inds], X.iloc[test_inds]
y_train, y_test = y.iloc[train_inds], y.iloc[test_inds]

In [12]:
#keep the training users as the grouping key for cross-validation, then drop the id columns
# Everything is derived from X and the split indices rather than from X_train itself,
# so the cell is idempotent: the earlier version removed 'user_id' from X_train and
# raised a KeyError on X_train['user_id'] when the cell was executed a second time.
id_columns = ['ride_id', 'user_id']
group = X.iloc[train_inds]['user_id']
X_train = X.iloc[train_inds].drop(columns=id_columns)
X_test = X.iloc[test_inds].drop(columns=id_columns)

In [12]:
#knn model on min-max scaled features, tuned by randomized search; takes several minutes to complete
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

knn = KNeighborsClassifier()
scaler = MinMaxScaler()
# Scaling lives inside the pipeline so it is re-fitted on each CV training fold.
pipeline = Pipeline(steps=[('scaler', scaler), ('knn', knn)])
param_distributions = {
    'knn__n_neighbors': np.arange(3, 31),
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['auto', 'ball_tree'],
}
# Group-aware folds: `group` (user ids of the training rows) keeps each user in one fold side.
cv = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=8)
knn_cv = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    cv=cv,
    random_state=8,
    refit=True,
    n_jobs=-1,
)
knn_cv.fit(X_train, y_train, groups=group)
# refit=True: predictions come from the best pipeline re-trained on all of X_train.
y_pred = knn_cv.predict(X_test)
print(knn_cv.best_estimator_)
print(knn_cv.best_params_)
print(knn_cv.best_score_)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=30, weights='distance'))])
{'knn__weights': 'distance', 'knn__n_neighbors': 30, 'knn__algorithm': 'auto'}
0.6395640861436934


In [47]:
#confusion matrix and per-class metrics for the current y_pred on the holdout set
from sklearn.metrics import classification_report, confusion_matrix

for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

[[ 4309 23589]
 [ 4413 44985]]
              precision    recall  f1-score   support

         0.0       0.49      0.15      0.24     27898
         1.0       0.66      0.91      0.76     49398

    accuracy                           0.64     77296
   macro avg       0.58      0.53      0.50     77296
weighted avg       0.60      0.64      0.57     77296



In [48]:
#decision tree classifier, tuned by randomized search with group-aware CV folds
from sklearn.tree import DecisionTreeClassifier

cv = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=8)
param_distributions = {
    'max_depth': np.arange(3, 30),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'random_state': [8],  # single value: fixes the tree's seed for every sampled candidate
    'min_samples_leaf': np.arange(2, 10),
}
dt = DecisionTreeClassifier()
dt_cv = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_distributions,
    cv=cv,
    random_state=8,
    refit=True,
)
dt_cv.fit(X_train, y_train, groups=group)
y_pred = dt_cv.predict(X_test)

best_tree = dt_cv.best_estimator_
print(best_tree)
print(dt_cv.best_params_)
print(dt_cv.best_score_)
# Importances are listed in the column order of X_train.
print(best_tree.feature_importances_)

DecisionTreeClassifier(criterion='log_loss', max_depth=6, min_samples_leaf=5,
                       random_state=8)
{'random_state': 8, 'min_samples_leaf': 5, 'max_depth': 6, 'criterion': 'log_loss'}
0.6549000648156054
[8.36296449e-03 2.08060697e-03 6.37792768e-03 3.09280768e-03
 1.40852692e-02 9.63896273e-01 1.16860838e-03 9.35542618e-04]


In [49]:
# Confusion matrix and per-class metrics for the current y_pred
# (the decision tree's predictions when the notebook is run in order).
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

[[ 4524 23374]
 [ 3105 46293]]
              precision    recall  f1-score   support

         0.0       0.59      0.16      0.25     27898
         1.0       0.66      0.94      0.78     49398

    accuracy                           0.66     77296
   macro avg       0.63      0.55      0.52     77296
weighted avg       0.64      0.66      0.59     77296



In [50]:
#share of requests accepted in each calendar month (mean of the 0/1 target)
df['accepted'].groupby(df['month']).mean()

month
1     0.720027
2     0.617181
3     0.444691
4     0.434475
5     0.508081
6     0.571565
7     0.615088
8     0.650815
9     0.671278
10    0.702547
11    0.714486
12    0.732870
Name: accepted, dtype: float64

In [58]:
#random forest with fixed hyper-parameters, scored by accuracy on the holdout set
from sklearn.ensemble import RandomForestClassifier

rf_settings = dict(
    n_estimators=500,
    criterion='log_loss',
    max_depth=6,
    min_samples_leaf=5,
    max_features='sqrt',
    bootstrap=False,  # every tree is fitted on the full training set
    random_state=8,
)
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Last expression of the cell: mean accuracy on the holdout set.
rf.score(X_test, y_test)

0.6390757607120678

In [68]:
# Confusion matrix and per-class metrics for the current y_pred.
# zero_division=0 reports precision/F1 as 0.00 for a class that is never predicted
# (e.g. when a model predicts only class 1) instead of emitting an
# UndefinedMetricWarning for each affected metric; the reported numbers are unchanged.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

[[    0 27898]
 [    0 49398]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     27898
         1.0       0.64      1.00      0.78     49398

    accuracy                           0.64     77296
   macro avg       0.32      0.50      0.39     77296
weighted avg       0.41      0.64      0.50     77296



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
