In [1]:
# For Loading and Manipulating data
import pandas as pd
import numpy as np

# To display all the columns and the rows ( regardless of their number or their width )
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# For splitting, encoding and scaling the data respectively
from sklearn.model_selection import train_test_split
from category_encoders import MEstimateEncoder

# Models
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# For visualization purposes
import matplotlib.pyplot as plt
import seaborn as sns

## Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import  r2_score
from time import time

%matplotlib inline

# To change the style of the plots ( so that we all can see the same thing :) )
plt.style.use('seaborn')

# To remove annoying warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('cleanest.csv')

In [3]:
df.head()

Unnamed: 0,Train,Departure_delay,Motifs,Arrival_delay,Holiday,Code_cir,Direction,Weekday_sin,Weekday_cos,Month_sin,Month_cos,Season_sin,Season_cos,Destination_Len,Nbr_Stops,dist_bet_Stops
0,803,0.0,Unknown,0.0,0,1.0,0,0.0,1.0,0.0,1.0,0.0,1.0,76.3,26.0,2.934615
1,805,5.0,Att rame,2.0,0,1.0,0,0.0,1.0,0.0,1.0,0.0,1.0,76.3,26.0,2.934615
2,807,0.0,Unknown,8.0,0,1.0,0,0.0,1.0,0.0,1.0,0.0,1.0,76.3,26.0,2.934615
3,809,25.0,Att rame,19.0,0,1.0,0,0.0,1.0,0.0,1.0,0.0,1.0,76.3,26.0,2.934615
4,811,0.0,Unknown,3.0,0,1.0,0,0.0,1.0,0.0,1.0,0.0,1.0,76.3,26.0,2.934615


#### 1- First let's split the feature ( X ) from the target ( Y )

In [4]:
x = df.drop(columns=['Arrival_delay'])
y = df['Arrival_delay']

In [5]:
df.shape, x.shape, y.shape

((9562, 16), (9562, 15), (9562,))

#### 2- Splitting the data to training and testing sets

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((7649, 15), (7649,), (1913, 15), (1913,))

In [8]:
x_train.head()

Unnamed: 0,Train,Departure_delay,Motifs,Holiday,Code_cir,Direction,Weekday_sin,Weekday_cos,Month_sin,Month_cos,Season_sin,Season_cos,Destination_Len,Nbr_Stops,dist_bet_Stops
9179,512079,22.0,Att rame,1,0.857143,0,0.0,1.0,-0.5,0.8660254,0.0,1.0,162.222,32.0,5.069438
3915,804,0.0,Unknown,1,1.0,1,-0.781831,0.62349,0.866025,-0.5,1.0,6.123234000000001e-17,76.3,26.0,2.934615
2577,513055,0.0,Unknown,1,1.0,0,-0.433884,-0.900969,1.0,6.123234000000001e-17,1.0,6.123234000000001e-17,269.249,41.0,6.567049
7653,818,0.0,Unknown,1,0.285714,1,-0.974928,-0.222521,-1.0,-1.83697e-16,-1.0,-1.83697e-16,76.3,26.0,2.934615
7167,5087,0.0,Unknown,1,1.0,1,0.0,1.0,-0.866025,-0.5,-1.0,-1.83697e-16,269.249,41.0,6.567049


_Target Encoding_

In [9]:
te = MEstimateEncoder()
x_train = te.fit_transform(x_train, y_train)
x_test  = te.transform(x_test)

In [10]:
x_train.head()

Unnamed: 0,Train,Departure_delay,Motifs,Holiday,Code_cir,Direction,Weekday_sin,Weekday_cos,Month_sin,Month_cos,Season_sin,Season_cos,Destination_Len,Nbr_Stops,dist_bet_Stops
9179,38.752982,22.0,53.33905,1,0.857143,0,0.0,1.0,-0.5,0.8660254,0.0,1.0,162.222,32.0,5.069438
3915,2.471044,0.0,16.69026,1,1.0,1,-0.781831,0.62349,0.866025,-0.5,1.0,6.123234000000001e-17,76.3,26.0,2.934615
2577,71.378324,0.0,16.69026,1,1.0,0,-0.433884,-0.900969,1.0,6.123234000000001e-17,1.0,6.123234000000001e-17,269.249,41.0,6.567049
7653,11.593571,0.0,16.69026,1,0.285714,1,-0.974928,-0.222521,-1.0,-1.83697e-16,-1.0,-1.83697e-16,76.3,26.0,2.934615
7167,58.577327,0.0,16.69026,1,1.0,1,0.0,1.0,-0.866025,-0.5,-1.0,-1.83697e-16,269.249,41.0,6.567049


#### 4- Building our Models

_Random Forest_

In [11]:
def Random_Forest(n_estimators, max_depth):
    # fitting
    start = time()
    rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    rf_model.fit(x_train, y_train)
    end = time()
    
    time_taken = end-start
    ## Testing
    results = []                # Saving it into variables to compare the results later
    
    rf_r2_score_train = rf_model.score(x_train, y_train)
    rf_r2_score_test = rf_model.score(x_test, y_test)

    print(f'Random Forest R-squared for the Training set: {rf_r2_score_train}')
    print(f'Random Forest R-squared for the Test set: {rf_r2_score_test}' )
    results.append(rf_r2_score_test)
    
    print('-'*80)
    
    pred = lambda x: rf_model.predict(x)
    
    rf_rmse_score_train = np.sqrt(mean_squared_error(y_train, pred(x_train)))
    rf_rmse_score_test  = np.sqrt(mean_squared_error(y_test, pred(x_test)))
    
    print(f'Random Forest RMSE for the Training set: {rf_rmse_score_train}')
    print(f'Random Forest RMSE for the Test set: {rf_rmse_score_test}' )
    results.append(rf_rmse_score_test)
    
    print('-'*80)
    
    rf_mae_score_train = mean_absolute_error(y_train, pred(x_train))
    rf_mae_score_test  = mean_absolute_error(y_test, pred(x_test))
    
    print(f'Random Forest MAE for the Training set : {rf_mae_score_train}')
    print(f'Random Forest MAE for the Test set: {rf_mae_score_test}' )
    results.append(rf_mae_score_test)
    
    print('='*80)
    
    return results, time_taken

In [12]:
%time rf_results, rf_time_taken = Random_Forest(30, 6)

Random Forest R-squared for the Training set: 0.8270156359664796
Random Forest R-squared for the Test set: 0.7499530221466921
--------------------------------------------------------------------------------
Random Forest RMSE for the Training set: 17.08648397482774
Random Forest RMSE for the Test set: 20.708377568904442
--------------------------------------------------------------------------------
Random Forest MAE for the Training set : 9.70364783662365
Random Forest MAE for the Test set: 10.561143404821529
Wall time: 695 ms


_SVM_

In [13]:
def SVM(C):
    # fitting
    start = time()
    svm_model = SVR(C=C)
    svm_model.fit(x_train, y_train)
    end = time()
    
   
    
    time_taken = end-start
    ## Testing
    results = []                # Saving it into variables to compare the results later
    
    svm_r2_score_train = svm_model.score(x_train, y_train)
    svm_r2_score_test = svm_model.score(x_test, y_test)

    print(f'SVM R-squared for the Training set: {svm_r2_score_train}')
    print(f'SVM R-squared for the Test set: {svm_r2_score_test}' )
    results.append(svm_r2_score_test)
    
    print('-'*80)
    
    pred = lambda x: svm_model.predict(x)
    
    svm_rmse_score_train = np.sqrt(mean_squared_error(y_train, pred(x_train)))
    svm_rmse_score_test  = np.sqrt(mean_squared_error(y_test, pred(x_test)))
    
    print(f'SVM RMSE for the Training set: {svm_rmse_score_train}')
    print(f'SVM RMSE for the Test set: {svm_rmse_score_test}' )
    results.append(svm_rmse_score_test)
    
    print('-'*80)
    
    svm_mae_score_train = mean_absolute_error(y_train, pred(x_train))
    svm_mae_score_test  = mean_absolute_error(y_test, pred(x_test))
    
    print(f'SVM MAE for the Training set : {svm_mae_score_train}')
    print(f'SVM MAE for the Test set: {svm_mae_score_test}' )
    results.append(svm_mae_score_test)
    
    print('='*80)
    
    return results, time_taken

In [14]:
%time svm_results, svm_time_taken = SVM(100)

SVM R-squared for the Training set: 0.7800396986965978
SVM R-squared for the Test set: 0.7269131961201138
--------------------------------------------------------------------------------
SVM RMSE for the Training set: 19.26732539182375
SVM RMSE for the Test set: 21.64141369797655
--------------------------------------------------------------------------------
SVM MAE for the Training set : 9.969347433349283
SVM MAE for the Test set: 10.497192664883988
Wall time: 56 s


_XGB Regressor_

In [15]:
def XGBoost(learning_rate, num_leaves, n_estimators):
    # Fitting
    start = time()
    xgb_model = XGBRegressor(learning_rate=learning_rate, num_leaves=num_leaves, n_estimators=n_estimators)
    xgb_model.fit(x_train,y_train)
    end = time()
    
    
    
    time_taken = end-start
    ## Testing
    results = []                # Saving it into variables to compare the results later
    
    xgb_r2_score_train = xgb_model.score(x_train, y_train)
    xgb_r2_score_test = xgb_model.score(x_test, y_test)

    print(f'XGBoost R-squared for the Training set: {xgb_r2_score_train}')
    print(f'XGBoost R-squared for the Test set: {xgb_r2_score_test}' )
    results.append(xgb_r2_score_test)
    
    print('-'*80)
    
    pred = lambda x: xgb_model.predict(x)
    
    xgb_rmse_score_train = np.sqrt(mean_squared_error(y_train, pred(x_train)))
    xgb_rmse_score_test  = np.sqrt(mean_squared_error(y_test, pred(x_test)))
    
    print(f'XGBoost RMSE for the Training set: {xgb_rmse_score_train}')
    print(f'XGBoost RMSE for the Test set: {xgb_rmse_score_test}' )
    results.append(xgb_rmse_score_test)
    
    print('-'*80)
    
    xgb_mae_score_train = mean_absolute_error(y_train, pred(x_train))
    xgb_mae_score_test  = mean_absolute_error(y_test, pred(x_test))
    
    print(f'XGBoost MAE for the Training set : {xgb_mae_score_train}')
    print(f'XGBoost MAE for the Test set: {xgb_mae_score_test}' )
    results.append(xgb_mae_score_test)
    
    print('='*80)
    
    return results, time_taken

In [16]:
%time xgb_results, xgb_time_taken = XGBoost(0.1, 10, 75)

Parameters: { num_leaves } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


XGBoost R-squared for the Training set: 0.8770834194096064
XGBoost R-squared for the Test set: 0.7363096152646986
--------------------------------------------------------------------------------
XGBoost RMSE for the Training set: 14.403051515642627
XGBoost RMSE for the Test set: 21.265833864948597
--------------------------------------------------------------------------------
XGBoost MAE for the Training set : 8.186686747080985
XGBoost MAE for the Test set: 10.26734213790405
Wall time: 574 ms


_LightGBM_

In [17]:
def LightGBM(learning_rate, num_leaves, n_estimators):
    # Fitting
    start = time()
    lgbm_model = LGBMRegressor(learning_rate=learning_rate, num_leaves=num_leaves, n_estimators=n_estimators)
    lgbm_model.fit(x_train,y_train)
    end = time()
    
    
    
    time_taken = end-start
    ## Testing
    results = []                # Saving it into variables to compare the results later
    
    lgbm_r2_score_train = lgbm_model.score(x_train, y_train)
    lgbm_r2_score_test = lgbm_model.score(x_test, y_test)

    print(f'LightGBM R-squared for the Training set: {lgbm_r2_score_train}')
    print(f'LightGBM R-squared for the Test set: {lgbm_r2_score_test}' )
    results.append(lgbm_r2_score_test)
    
    print('-'*80)
    
    pred = lambda x: lgbm_model.predict(x)
    
    lgbm_rmse_score_train = np.sqrt(mean_squared_error(y_train, pred(x_train)))
    lgbm_rmse_score_test  = np.sqrt(mean_squared_error(y_test, pred(x_test)))
    
    print(f'LightGBM RMSE for the Training set: {lgbm_rmse_score_train}')
    print(f'LightGBM RMSE for the Test set: {lgbm_rmse_score_test}' )
    results.append(lgbm_rmse_score_test)
    
    print('-'*80)
    
    lgbm_mae_score_train = mean_absolute_error(y_train, pred(x_train))
    lgbm_mae_score_test  = mean_absolute_error(y_test, pred(x_test))
    
    print(f'LightGBM MAE for the Training set : {lgbm_mae_score_train}')
    print(f'LightGBM MAE for the Test set: {lgbm_mae_score_test}' )
    results.append(lgbm_mae_score_test)
    
    print('='*80)
    
    return results, time_taken

In [18]:
%time lgbm_results, lgbm_time_taken = LightGBM(0.1, 10, 75)

LightGBM R-squared for the Training set: 0.8269436619348141
LightGBM R-squared for the Test set: 0.7562678934803143
--------------------------------------------------------------------------------
LightGBM RMSE for the Training set: 17.090038212959044
LightGBM RMSE for the Test set: 20.44521306376
--------------------------------------------------------------------------------
LightGBM MAE for the Training set : 9.454758692617114
LightGBM MAE for the Test set: 10.183918505802334
Wall time: 242 ms


# Conclusion 

In [19]:
import joblib

In [20]:
ann_r2_score_test   = joblib.load('ann_r2_score_test.sav')
ann_rmse_score_test = joblib.load('ann_rmse_score_test.sav')
ann_mae_score_test  = joblib.load('ann_mae_score_test.sav')
ann_time_taken      = joblib.load('ann_time_taken.sav')

ann_results = [ann_r2_score_test, ann_rmse_score_test, ann_mae_score_test]

In [21]:
Regressors = ["SVM", "RF", "LGBM",  "XGB", "ANN"]

results_array    = np.array([svm_results, rf_results, lgbm_results,  xgb_results, ann_results])

time_taken_array = np.array([svm_time_taken, rf_time_taken, lgbm_time_taken,  xgb_time_taken, ann_time_taken])

results_time_df = pd.DataFrame({'R-Squared':results_array[:, 0], 'RMSE':results_array[:, 1], 'MAE':results_array[:, 2],
                                'Time Taken':time_taken_array}, index=Regressors)

# Modifying the scale of the first column
results_time_df['R-Squared'] = results_time_df['R-Squared']*100

results_time_df.to_csv('results_10.csv')