# Cab Fare Prediction

In [1]:
#Load libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
os.chdir("C:\\Users\\SAGAR\\Downloads\\Data Science\\Cab_Fare_Prediction")

In [3]:
df=pd.read_csv("train_clean.csv")
df1=pd.read_csv("test_clean.csv")

In [4]:
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,H_Distance,Year,Month,Date,Day of Week,Hour
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0,1.030764,2009,6,15,0,17
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0,8.450134,2010,1,5,1,16
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0,1.389525,2011,8,18,3,0
3,7.7,-73.98376,40.758209,-73.985714,40.75814,1.0,0.164784,2012,4,21,5,4
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0,1.999157,2010,3,9,1,7


In [5]:
#Sepearating our training data into training and validation data

from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

In [6]:
cols=['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','H_Distance','Year','Month','Day of Week','Hour','Date']

y=train['fare_amount']
valid=test['fare_amount']
X_train=train[cols]
X_test=test[cols]

In [7]:
#Defining a function for calculating error metrics 
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred)
    rmse=np.sqrt(mse)
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true))*100
    
    #calculating AIC
    resid = y_true - y_pred
    sse = sum(resid**2)
    k=11
    n=len(y_pred)

    AIC = 2*k + n*np.log(sse/n)
    
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MAPE: ', round(mape,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(rmse,4))
    print('AIC: ', round(AIC,4))

## Multiple Linear Regression
Creating a baseline multiple linear regression model 

In [41]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [42]:
#Prediction on train data
pred=model.predict(X_train)
regression_results(y,pred)

explained_variance:  0.7841
mean_squared_log_error:  0.0698
r2:  0.7841
MAE:  2.2746
MAPE:  22.1855
MSE:  17.9309
RMSE:  4.2345
AIC:  36865.5946


In [43]:
#Prediction on test data
pred=model.predict(X_test)
regression_results(valid,pred)

explained_variance:  0.816
mean_squared_log_error:  0.069
r2:  0.816
MAE:  2.3126
MAPE:  22.3736
MSE:  17.0545
RMSE:  4.1297
AIC:  9075.8371


The baseline RMSE we got is 4.2345 for train data and 4.1297 for validation data. Our aim is to build a model that has better RMSE score than this. Also, the variance between these two RMSEs matter and should be minimum, because if these are far apart, it would indicate overfitting of the model which we must avoid.

## Lasso Regression 
We will try Lasso regression to see if we get better results.

In [11]:
from sklearn.linear_model import Lasso
alphas =[1e-5,1e-3, 1e-2, 0.02, 0.04,0.08,0.1]

In [12]:
def error(y, y_pred):
    return np.sqrt(metrics.mean_squared_error(y,y_pred))

In [13]:
for alpha in alphas:
    lasso = Lasso(alpha = alpha)
    lasso.fit(X_train, y)
    y_train_pred = lasso.predict(X_train)
    rmse =error(y, y_train_pred)
    print("alpha : {%.5f} RMSE : {%.9f}" %(alpha,rmse))

alpha : {0.00001} RMSE : {4.234486650}
alpha : {0.00100} RMSE : {4.235201450}
alpha : {0.01000} RMSE : {4.264633136}
alpha : {0.02000} RMSE : {4.276417532}
alpha : {0.04000} RMSE : {4.276617069}
alpha : {0.08000} RMSE : {4.277088342}
alpha : {0.10000} RMSE : {4.277415902}


Using lasso regression is not feasible as alpha=1e-5 gives us least RMSE of 4.2344 which is almost as good as multiple linear regression model's RMSE according to above analysis.But if we use such a small value of alpha for lasso, it will reduce lasso to linear regression model (as alpha=0 for lasso gives linear regression)

Thus, we won't use Lasso for our further analysis.

## Decision Tree

In [14]:
from sklearn.tree import DecisionTreeRegressor

In [15]:
for i in range(1,50):
        reg = DecisionTreeRegressor(max_depth = i)
        reg.fit(X_train,y)
        pred3=reg.predict(X_train)
        error=np.sqrt(metrics.mean_squared_error(pred3,y))                
        print("max_depth : {%.5f} RMSE : {%.9f}" %(i,error))

max_depth : {1.00000} RMSE : {6.160608440}
max_depth : {2.00000} RMSE : {4.735452722}
max_depth : {3.00000} RMSE : {4.355844074}
max_depth : {4.00000} RMSE : {3.996920353}
max_depth : {5.00000} RMSE : {3.769246323}
max_depth : {6.00000} RMSE : {3.557881668}
max_depth : {7.00000} RMSE : {3.301452129}
max_depth : {8.00000} RMSE : {2.993678532}
max_depth : {9.00000} RMSE : {2.715696504}
max_depth : {10.00000} RMSE : {2.533465901}
max_depth : {11.00000} RMSE : {2.349116735}
max_depth : {12.00000} RMSE : {2.160502322}
max_depth : {13.00000} RMSE : {1.882338596}
max_depth : {14.00000} RMSE : {1.677890361}
max_depth : {15.00000} RMSE : {1.463058718}
max_depth : {16.00000} RMSE : {1.286234332}
max_depth : {17.00000} RMSE : {1.109640228}
max_depth : {18.00000} RMSE : {0.927789925}
max_depth : {19.00000} RMSE : {0.784919764}
max_depth : {20.00000} RMSE : {0.655052318}
max_depth : {21.00000} RMSE : {0.546507394}
max_depth : {22.00000} RMSE : {0.440270655}
max_depth : {23.00000} RMSE : {0.35028724

In [16]:
for i in range(1,50):
        reg = DecisionTreeRegressor(max_depth = i)
        reg.fit(X_train,y)
        pred3=reg.predict(X_test)
        error=np.sqrt(metrics.mean_squared_error(pred3,valid))                
        print("max_depth : {%.5f} RMSE : {%.9f}" %(i,error))

max_depth : {1.00000} RMSE : {6.125934374}
max_depth : {2.00000} RMSE : {4.698628091}
max_depth : {3.00000} RMSE : {4.424872229}
max_depth : {4.00000} RMSE : {4.395464899}
max_depth : {5.00000} RMSE : {4.392095178}
max_depth : {6.00000} RMSE : {4.340045058}
max_depth : {7.00000} RMSE : {4.461531575}
max_depth : {8.00000} RMSE : {4.454774274}
max_depth : {9.00000} RMSE : {4.878973168}
max_depth : {10.00000} RMSE : {4.891075709}
max_depth : {11.00000} RMSE : {5.067620985}
max_depth : {12.00000} RMSE : {4.941043935}
max_depth : {13.00000} RMSE : {5.186779060}
max_depth : {14.00000} RMSE : {5.280532345}
max_depth : {15.00000} RMSE : {5.347885037}
max_depth : {16.00000} RMSE : {5.420074458}
max_depth : {17.00000} RMSE : {5.589974681}
max_depth : {18.00000} RMSE : {5.535756966}
max_depth : {19.00000} RMSE : {5.389503830}
max_depth : {20.00000} RMSE : {5.491632379}
max_depth : {21.00000} RMSE : {5.543631717}
max_depth : {22.00000} RMSE : {5.370702771}
max_depth : {23.00000} RMSE : {5.47741079

We can see the RMSE value for prediction on training data goes down to 0 when max depth of the decision tree is more than 30. This means that our model with max depth 30 and above will exactly predict the training data i.e. overfitting. But,when we iterate our model on validation data, we see that it does not perform that well for max depth of 30.

The minimum RMSE we obtain for prediction on validation data is for max depth 6 which is 4.3400 which is slightly more than previous two models. Thus, we will not use this model.

## Random Forest
Let's try Random forst algorithm as it is based on ensemble method and we will see whether it gives better result than the decision tree.

In [17]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 200) #(n_estimators = 100)
rf.fit(X_train, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [18]:
#Prediction on train data
pred=rf.predict(X_train)
regression_results(y,pred)

explained_variance:  0.9758
mean_squared_log_error:  0.0104
r2:  0.9758
MAE:  0.7456
MAPE:  7.8455
MSE:  2.008
RMSE:  1.417
AIC:  8920.2752


In [19]:
#Prediction on test data
pred=rf.predict(X_test)
regression_results(valid,pred)

explained_variance:  0.8224
mean_squared_log_error:  0.0684
r2:  0.8224
MAE:  2.1168
MAPE:  22.4607
MSE:  16.4626
RMSE:  4.0574
AIC:  8963.0772


Although, the RMSE for train and validation data are the least of all above models.But,it is observed that RMSEs of train and validation data are way off. The variance is more than 2. This clearly indicates overfitting of the model and thus, we shall not use it.

## Light Gradient Boosting

In [20]:
import lightgbm as lgbm
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': -1,
        'verbose': 0,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'subsample_freq': 1,
        'colsample_bytree': 0.6,
        'reg_aplha': 1,
        'reg_lambda': 0.001,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1     
    }

In [21]:
pred_test_y = np.zeros(X_test.shape[0])

In [22]:
train_set = lgbm.Dataset(X_train,y, silent=True)
model = lgbm.train(params, train_set = train_set, num_boost_round=300)

In [23]:
#On train data
pred_test_y = model.predict(X_train, num_iteration = model.best_iteration)
regression_results(pred_test_y,y)

explained_variance:  0.9105
mean_squared_log_error:  0.0374
r2:  0.9105
MAE:  1.5297
MAPE:  15.5951
MSE:  6.465
RMSE:  2.5426
AIC:  23844.705


In [24]:
#On test data
pred_test_y = model.predict(X_test, num_iteration = model.best_iteration)
regression_results(pred_test_y,valid)

explained_variance:  0.8189
mean_squared_log_error:  0.0612
r2:  0.8189
MAE:  1.9799
MAPE:  19.0218
MSE:  14.0545
RMSE:  3.7489
AIC:  8458.2802


Light gradient boosting model seems to perform better than all previous models with least RMSEs in both training and validation and variance between train and validation is 1.2. But before we fix this model for our test data, lets try XGBoost algorithm to see if it performs better than this.

## Xtreme Gradient Boosting

In [25]:
import xgboost as xgb 
dtrain = xgb.DMatrix(X_train, label=y)
dtest = xgb.DMatrix(X_test)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [45]:
params = {'max_depth':7,
          'silent':1,
          'objective':'reg:linear',
          'eval_metric':'rmse',
          'learning_rate':0.05
         }
num_rounds = 50

In [27]:
xb = xgb.train(params, dtrain, num_rounds)

In [28]:
#Prediction on train data
y_pred_xgb = xb.predict(dtrain)
regression_results(y_pred_xgb,y)

explained_variance:  0.8524
mean_squared_log_error:  0.0455
r2:  0.8402
MAE:  1.7686
MAPE:  18.5575
MSE:  9.5186
RMSE:  3.0852
AIC:  28782.496


In [29]:
#Prediction on test data
y_pred_xgb = xb.predict(dtest)
regression_results(y_pred_xgb,valid)

explained_variance:  0.729
mean_squared_log_error:  0.0679
r2:  0.7158
MAE:  2.1353
MAPE:  21.1107
MSE:  18.0905
RMSE:  4.2533
AIC:  9264.0709


This model gives us RMSE of 4.2533 for validation data which is slightly more than our baseline model of MLR and thus, we shall not use it. 

#### Out of all the models we have implemented, LGBoost  performed the best.  LGBoost provides with RMSE of 3.9813 which is better than any other model implemented.

#### Thus we shall select LGBoost for prediction of our test data. 

## Predicting test values with LGBoost

In [30]:
test_cab=df1.copy() 

In [31]:
X_train.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,H_Distance,Year,Month,Day of Week,Hour,Date
5635,-73.968423,40.755178,-73.96266,40.798307,5.0,4.820215,2011,4,2,23,27
5739,-73.950422,40.723102,-73.945342,40.721027,1.0,0.486323,2013,6,6,4,9
13180,-74.000806,40.730698,-73.978723,40.744827,1.0,2.435147,2014,4,3,19,3
11512,-73.94864,40.782205,-73.97284,40.759347,3.0,3.257802,2013,5,5,10,4
1236,-73.97683,40.762966,-73.974205,40.778412,1.0,1.731647,2015,3,2,15,18


In [32]:
test_cab.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,H_Distance,Year,Month,Day of Week,Hour,Date
0,-73.97332,40.763805,-73.98143,40.743835,1,2.323259,2015,1,1,13,27
1,-73.986862,40.719383,-73.998886,40.739201,1,2.425353,2015,1,1,13,27
2,-73.982524,40.75126,-73.979654,40.746139,1,0.618628,2011,10,5,11,8
3,-73.98116,40.767807,-73.990448,40.751635,1,1.961033,2012,12,5,21,1
4,-73.966046,40.789775,-73.988565,40.744427,1,5.387301,2012,12,5,21,1


In [33]:
import lightgbm as lgbm
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': -1,
        'verbose': 0,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'subsample_freq': 1,
        'colsample_bytree': 0.6,
        'reg_aplha': 1,
        'reg_lambda': 0.001,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1     
    }

In [34]:
pred_test_y = np.zeros(X_test.shape[0])

In [35]:
train_set = lgbm.Dataset(X_train,y, silent=True)
model = lgbm.train(params, train_set = train_set, num_boost_round=300)

In [36]:
pred_test_y = model.predict(test_cab, num_iteration = model.best_iteration)

In [37]:
pred_test_y

array([10.70176578,  9.9577318 ,  5.36207275, ..., 51.1751039 ,
       21.49016281,  6.70014005])

In [38]:
test_cab['predicted_fare']=pred_test_y

In [39]:
test_cab.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,H_Distance,Year,Month,Day of Week,Hour,Date,predicted_fare
0,-73.97332,40.763805,-73.98143,40.743835,1,2.323259,2015,1,1,13,27,10.701766
1,-73.986862,40.719383,-73.998886,40.739201,1,2.425353,2015,1,1,13,27,9.957732
2,-73.982524,40.75126,-73.979654,40.746139,1,0.618628,2011,10,5,11,8,5.362073
3,-73.98116,40.767807,-73.990448,40.751635,1,1.961033,2012,12,5,21,1,8.560943
4,-73.966046,40.789775,-73.988565,40.744427,1,5.387301,2012,12,5,21,1,14.556343


In [44]:
test_cab.to_csv('Final Predictions.csv',index=False)

#### Thus, we have successfully predicted cab fares for our test dataset using Light Gradient Boosting Algorithm.