In [1]:
import numpy as np
import pandas as pd

In [2]:
cab_df=pd.read_csv("cab_rides.csv",delimiter=',')
weather_df=pd.read_csv("weather.csv",delimiter=',')

In [6]:
cab_df['date_time']=pd.to_datetime(cab_df['time_stamp']/1000,unit='s')
weather_df['date_time']=pd.to_datetime(weather_df['time_stamp'],unit='s')
cab_df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date_time
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890000128
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.676999936
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.197999872
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749000192
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223000064


## Data Preprocessing

In [10]:
cab_df['merge_date'] = cab_df.source.astype(str) +" - "+ cab_df.date_time.dt.date.astype("str") +" - "+ cab_df.date_time.dt.hour.astype("str")
weather_df['merge_date'] = weather_df.location.astype(str) +" - "+ weather_df.date_time.dt.date.astype("str") +" - "+ weather_df.date_time.dt.hour.astype("str")

In [11]:
weather_df.index=weather_df['merge_date']

In [12]:
final_dataframe=cab_df.join(weather_df,on=['merge_date'],rsuffix='_w')
final_dataframe=final_dataframe.dropna(axis=0)

In [13]:
final_dataframe['day'] = final_dataframe.date_time.dt.dayofweek
final_dataframe['hour'] = final_dataframe.date_time.dt.hour

In [14]:
surge_dataframe=final_dataframe[final_dataframe.surge_multiplier<3]

## Feature Selection

In [15]:
x=surge_dataframe[['day','hour','temp','clouds', 'pressure','humidity', 'wind', 'rain']]
y=surge_dataframe['surge_multiplier']

## Label Encoding and Data Splitting

In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
le=LabelEncoder()
le.fit([1,1.25,1.5,1.75,2.,2.25,2.5])
y=le.transform(y)

feature_list=list(x.columns)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

## Feature Sampling

In [17]:
# Before SMOTE
unique,counts=np.unique(y_train,return_counts=True)
print(dict(zip(unique,counts)))

{0: 129210, 1: 2155, 2: 976, 3: 438, 4: 410, 6: 24}


In [18]:
# After SMOTE
from imblearn.over_sampling import SMOTE
sm=SMOTE(random_state=42)
train_features,train_labels=sm.fit_resample(x_train,y_train)

## Model Training

In [19]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_jobs=-1,random_state=42,class_weight="balanced")
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

## Feature Importance

In [20]:
importances=list(model.feature_importances_)
feature_importances=[(feature,round(importance,2)) for feature,importance in zip(feature_list,importances)]
feature_importances=sorted(feature_importances,key=lambda x:x[1],reverse=True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: wind                 Importance: 0.22
Variable: temp                 Importance: 0.21
Variable: pressure             Importance: 0.18
Variable: rain                 Importance: 0.17
Variable: humidity             Importance: 0.08
Variable: hour                 Importance: 0.07
Variable: clouds               Importance: 0.05
Variable: day                  Importance: 0.02


[None, None, None, None, None, None, None, None]

## Evaluation of the built model

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [31]:
accuracy_score(y_test,y_pred)

0.42452532754151195

In [26]:
f1_score(y_test,y_pred,average='weighted')

0.5780149048458073

In [27]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, y_pred, average="weighted")

0.9589943499435803

In [29]:
recall_score(y_test, y_pred, average="micro")

0.42452532754151195

In [33]:
pd.crosstab(le.inverse_transform(y_test), le.inverse_transform(y_pred),rownames=['Actual'],colnames=['Predicted'])

Predicted,1.00,1.25,1.50,1.75,2.00,2.50
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,23542,10482,7398,7044,5810,1019
1.25,175,323,156,162,120,31
1.5,43,39,177,88,46,14
1.75,25,24,23,107,50,5
2.0,9,23,14,28,84,22
2.5,1,0,0,2,2,4
