In [1]:
# For Loading and Manipulating data
import pandas as pd
import numpy as np

# To display all the columns and the rows ( regardless of their number or their width )
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# For splitting, encoding and scaling the data respectively
from sklearn.model_selection import train_test_split
from category_encoders import MEstimateEncoder, TargetEncoder

# Models
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

# For visualization purposes
import matplotlib.pyplot as plt
import seaborn as sns

## Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, classification_report 
from time import time

%matplotlib inline

# To change the style of the plots ( so that we all can see the same thing :) )
plt.style.use('seaborn')

# To remove annoying warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('cleanest.csv')

In [3]:
df.head()

Unnamed: 0,Train,Departure_delay,Arriving_delay,Interval,Code_cir,Holiday_Yes,Weekday_Mon,Weekday_Sat,Weekday_Sun,Weekday_Thur,Weekday_Tues,Weekday_Wed,Month_Aug,Month_Dec,Month_Feb,Month_Jan,Month_Jul,Month_Jun,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,Season_v2_Spring,Season_v2_Summer,Season_v2_Winter,Destination_Len,Nbr_Stops,dist_bet_Stops,Direction_Odd
0,803,0.0,0.0,0_5,1.0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,76.3,26.0,2.934615,1
1,805,5.0,2.0,0_5,1.0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,76.3,26.0,2.934615,1
2,807,0.0,8.0,6_15,1.0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,76.3,26.0,2.934615,1
3,809,25.0,19.0,16_30,1.0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,76.3,26.0,2.934615,1
4,811,0.0,3.0,0_5,1.0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,76.3,26.0,2.934615,1


In [4]:
intervals_num = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
intervals_cat = ['6_10', '11_15', '16_20', '21_25', '26_30', '31_35', '36_40', '41_45', '46_50', '51_55', '56_60']

for i in range(1, 12):
    mask = (df['Arriving_delay'] >= intervals_num[i-1]+1)&(df['Arriving_delay'] <= intervals_num[i])
    df.loc[mask, 'Interval'] = intervals_cat[i-1]

In [5]:
df['Interval'].unique()

array(['0_5', '6_10', '16_20', '11_15', '>60', '51_55', '56_60', '21_25',
       '31_35', '26_30', '41_45', '36_40', '46_50'], dtype=object)

_Classification Model_

In [6]:
x_class = df.drop(columns=['Interval', 'Arriving_delay'])
y_class, _ = df['Interval'].factorize()

te = TargetEncoder()
x_class = te.fit_transform(x_class, y_class)

lgbm_model = LGBMClassifier(max_depth=10, learning_rate=0.35, num_leaves=60)
lgbm_model.fit(x_class,y_class)

new_feature = lgbm_model.predict(x_class)

In [7]:
accuracy_score(y_class, new_feature)

0.7660246289864224

_Regression Model_

In [8]:
x = df.drop(columns=['Arriving_delay'])
x['Interval'] = new_feature
y = df['Arriving_delay']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

te = MEstimateEncoder()
x_train = te.fit_transform(x_train, y_train)
x_test  = te.transform(x_test)

#### 4- Building our Models

_LightGBM_

In [9]:
def LightGBM():
    # Fitting
    start = time()
    lgbm_model = LGBMRegressor(max_depth=5, learning_rate=0.1, num_leaves=20)
    lgbm_model.fit(x_train,y_train)
    end = time()
    
    
    
    time_taken = end-start
    ## Testing
    results = []                # Saving it into variables to compare the results later
    
    lgbm_r2_score_train = lgbm_model.score(x_train, y_train)
    lgbm_r2_score_test = lgbm_model.score(x_test, y_test)

    print(f'LightGBM R-squared for the Training set: {lgbm_r2_score_train}')
    print(f'LightGBM R-squared for the Test set: {lgbm_r2_score_test}' )
    results.append(lgbm_r2_score_test)
    
    print('-'*80)
    
    pred = lambda x: lgbm_model.predict(x)
    
    lgbm_rmse_score_train = np.sqrt(mean_squared_error(y_train, pred(x_train)))
    lgbm_rmse_score_test  = np.sqrt(mean_squared_error(y_test, pred(x_test)))
    
    print(f'LightGBM RMSE for the Training set: {lgbm_rmse_score_train}')
    print(f'LightGBM RMSE for the Test set: {lgbm_rmse_score_test}' )
    results.append(lgbm_rmse_score_test)
    
    print('-'*80)
    
    lgbm_mae_score_train = mean_absolute_error(y_train, pred(x_train))
    lgbm_mae_score_test  = mean_absolute_error(y_test, pred(x_test))
    
    print(f'LightGBM MAE for the Training set : {lgbm_mae_score_train}')
    print(f'LightGBM MAE for the Test set: {lgbm_mae_score_test}' )
    results.append(lgbm_mae_score_test)
    
    print('='*80)
    
    return results, time_taken

In [10]:
%time lgbm_results, lgbm_time_taken = LightGBM()

LightGBM R-squared for the Training set: 0.8672014961119674
LightGBM R-squared for the Test set: 0.8461508771258692
--------------------------------------------------------------------------------
LightGBM RMSE for the Training set: 12.74514905942148
LightGBM RMSE for the Test set: 13.433966253682245
--------------------------------------------------------------------------------
LightGBM MAE for the Training set : 6.485290801824094
LightGBM MAE for the Test set: 6.677781371491487
Wall time: 322 ms
