In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Resolve the dataset location relative to the notebook's working directory
script_dir = os.path.abspath('')
relative_path = '../dataset/retard_ferie_vacation_greves.csv'
csv_path = os.path.join(script_dir, relative_path)

# Load the comma-separated file, then parse the 'date' column into datetimes
data = pd.read_csv(csv_path, sep=",")
data = data.assign(date=pd.to_datetime(data['date']))

# Data Preprocessing

In [13]:
# Date handling and column clean-up.
# Rows are ordered chronologically, and the year and month are derived from the
# date so that seasonal or yearly patterns can be picked up.
# The free-text comment columns are removed: they were judged not relevant for
# the model (as seen in data exploration).
comments_columns = ['commentaire_annulation', 'commentaire_retards_depart', 'commentaires_retard_arrivee']

data = (
    data
    .sort_values(by='date')
    .assign(
        annee=lambda frame: frame['date'].dt.year,
        mois=lambda frame: frame['date'].dt.month,
    )
    .drop(columns=comments_columns)
)

data.head()

Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,nb_train_depart_retard,retard_moyen_depart,retard_moyen_tous_trains_depart,...,prct_cause_prise_en_charge_voyageurs,annee,mois,nombre_jour_ferie,jours_vacances,total_jours,ratio_vacances,Nombre de grèves,Motif exprimé,Ratio taux gréviste
0,2018-01-01,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,289,11.247809,3.693179,...,0.840336,2018,1,1.0,24.0,31.0,0.774194,0.0,0,0.0
95,2018-01-01,National,LYON PART DIEU,MARSEILLE ST CHARLES,107,539,7,271,16.212423,8.403415,...,5.333333,2018,1,1.0,24.0,31.0,0.774194,0.0,0,0.0
94,2018-01-01,National,AVIGNON TGV,PARIS LYON,159,538,0,109,11.687309,2.113476,...,3.529412,2018,1,1.0,24.0,31.0,0.774194,0.0,0,0.0
93,2018-01-01,National,ANNECY,PARIS LYON,224,198,0,12,8.070833,0.489141,...,4.761905,2018,1,1.0,24.0,31.0,0.774194,0.0,0,0.0
92,2018-01-01,National,MARSEILLE ST CHARLES,TOURCOING,299,31,0,13,10.769231,4.334946,...,0.0,2018,1,1.0,24.0,31.0,0.774194,0.0,0,0.0


In [14]:
# Columns derived from the arrival delay we want to predict. They would not be
# available when forecasting a future period, so they must not be used as features.
# Kept on purpose (not listed below): 'retard_moyen_depart' and
# 'retard_moyen_tous_trains_depart'; 'retard_moyen_arrivee' is the target and is
# separated from the features later.
columns_forbiden = [
    'nb_train_retard_arrivee',
    'retard_moyen_tous_trains_arrivee',
    'nb_train_retard_sup_15',
    'retard_moyen_trains_retard_sup15',
    'nb_train_retard_sup_30',
    'nb_train_retard_sup_60',
    'prct_cause_externe',
    'prct_cause_infra',
    'prct_cause_gestion_trafic',
    'prct_cause_materiel_roulant',
    'prct_cause_gestion_gare',
    'prct_cause_prise_en_charge_voyageurs',
    'Motif exprimé',
]

# Remove the leaking columns and preview the result
data = data.drop(columns=columns_forbiden)
data.head()

Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,nb_train_depart_retard,retard_moyen_depart,retard_moyen_tous_trains_depart,retard_moyen_arrivee,annee,mois,nombre_jour_ferie,jours_vacances,total_jours,ratio_vacances,Nombre de grèves,Ratio taux gréviste
0,2018-01-01,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,289,11.247809,3.693179,28.436735,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
95,2018-01-01,National,LYON PART DIEU,MARSEILLE ST CHARLES,107,539,7,271,16.212423,8.403415,39.738889,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
94,2018-01-01,National,AVIGNON TGV,PARIS LYON,159,538,0,109,11.687309,2.113476,31.968561,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
93,2018-01-01,National,ANNECY,PARIS LYON,224,198,0,12,8.070833,0.489141,37.246053,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
92,2018-01-01,National,MARSEILLE ST CHARLES,TOURCOING,299,31,0,13,10.769231,4.334946,88.333333,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0


In [15]:
# Chronological hold-out: rows dated on or after (latest date - 6 months) form
# the test set, everything strictly earlier is used for training.
# NOTE(review): the boundary is inclusive; if the dates are monthly snapshots this
# may select seven periods rather than six — confirm against the intent.
split_cutoff = data['date'].max() - pd.DateOffset(months=6)
is_train = data['date'] < split_cutoff
is_test = data['date'] >= split_cutoff

# The date column is no longer needed once the split is done
train_data = data[is_train].drop(columns=['date'])
test_data = data[is_test].drop(columns=['date'])

In [22]:
# Create a pipeline on the numeric features of the dataset
# We use the StandardScaler to normalize the data
# We use the SimpleImputer to replace missing values by the mean of the column
# We use the PolynomialFeatures to create new features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer

# Numeric preprocessing steps, applied in this order
numeric_steps = [
    # absolute value of every column (the author is interested in the delay, not the advance)
    ('abs_transformer', FunctionTransformer(np.abs, validate=True)),
    # mean imputation; per the data exploration there are no missing values, so this is a safeguard
    ('imputer', SimpleImputer(strategy="mean")),
    # standardisation of each column
    ('std_scaler', StandardScaler()),
    # polynomial expansion up to degree 3, without the constant column
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
]

num_pipeline = Pipeline(steps=numeric_steps)

In [23]:
# We create a pipeline for the categorical features
# We use the OneHotEncoder to encode the categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups used by the preprocessing steps below
categorical_features = ['gare_depart', 'gare_arrivee', 'service']
numeric_features = list(train_data.drop(columns=categorical_features).columns)

# NOTE(review): `numeric_features` still contains the target column
# 'retard_moyen_arrivee', and `data_transformer` is not applied anywhere in the
# cells visible here — exclude the target before ever fitting it on the features.
data_transformer = ColumnTransformer([
    ("num", num_pipeline, numeric_features),
    # ("cat", OneHotEncoder(), categorical_features)
])

# One-hot encode the categorical columns, dropping the first level of each one.
# The encoder is left on its default sparse output and densified explicitly:
# the `sparse=False` keyword is rejected by recent scikit-learn releases, while
# `.toarray()` on the sparse result works on every version.
encoder = OneHotEncoder(drop='first')

# Fit on the training rows only, then reuse the learned categories for the test rows
train_encoded = encoder.fit_transform(train_data[categorical_features]).toarray()
test_encoded = encoder.transform(test_data[categorical_features]).toarray()

# Wrap the encoded arrays in frames that share the index of their source rows,
# so that the concatenation below aligns row by row
encoded_columns = encoder.get_feature_names_out(categorical_features)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns, index=train_data.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_columns, index=test_data.index)

# Drop original categorical columns and concatenate the one-hot encoded columns
train_data_prepared = pd.concat([train_data.drop(columns=categorical_features), train_encoded_df], axis=1)
test_data_prepared = pd.concat([test_data.drop(columns=categorical_features), test_encoded_df], axis=1)

train_data.head()



Unnamed: 0,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,nb_train_depart_retard,retard_moyen_depart,retard_moyen_tous_trains_depart,retard_moyen_arrivee,annee,mois,nombre_jour_ferie,jours_vacances,total_jours,ratio_vacances,Nombre de grèves,Ratio taux gréviste
0,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,289,11.247809,3.693179,28.436735,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
95,National,LYON PART DIEU,MARSEILLE ST CHARLES,107,539,7,271,16.212423,8.403415,39.738889,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
94,National,AVIGNON TGV,PARIS LYON,159,538,0,109,11.687309,2.113476,31.968561,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
93,National,ANNECY,PARIS LYON,224,198,0,12,8.070833,0.489141,37.246053,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0
92,National,MARSEILLE ST CHARLES,TOURCOING,299,31,0,13,10.769231,4.334946,88.333333,2018,1,1.0,24.0,31.0,0.774194,0.0,0.0


In [24]:
# Separate the regression target from the predictors for both splits
target_column = "retard_moyen_arrivee"

x_train = train_data_prepared.drop(columns=[target_column])
x_test = test_data_prepared.drop(columns=[target_column])

# Independent copies of the target so later edits cannot touch the prepared frames
y_train = train_data_prepared[target_column].copy()
y_test = test_data_prepared[target_column].copy()

In [37]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# The models that we try.
# A fixed seed is given to the stochastic ensembles so that a fresh
# "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42
rf_model = RandomForestRegressor(random_state=RANDOM_STATE)
bagging_model = BaggingRegressor(random_state=RANDOM_STATE)
gradient_boosting_model = GradientBoostingRegressor(max_depth=5, random_state=RANDOM_STATE)
lasso_model = Lasso(alpha=0.1)
knn_model = KNeighborsRegressor()

models = [rf_model, bagging_model, gradient_boosting_model, lasso_model, knn_model]
model_names = ['Random Forest', 'Bagging', 'Gradient Boosting', 'Lasso', 'KNN']

# One list per metric, filled in the same order as `model_names`
model_metrics = {
    'Model': model_names,
    'MAE': [],
    'MSE': [],
    'RMSE': [],
    'R2': [],
    'Precision %': []
}

# Largest absolute error (in the unit of the target) for a prediction to count as correct
threshold = 5

# Evaluate and compare the models
for model, name in zip(models, model_names):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is derived from the MSE instead of calling mean_squared_error(..., squared=False):
    # that keyword is deprecated and then removed in recent scikit-learn releases.
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_test, y_pred)

    # Share of test rows whose absolute error is within the threshold
    correct_predictions = np.abs(np.asarray(y_pred) - np.asarray(y_test)) <= threshold
    precision_percentage = float(correct_predictions.mean()) * 100

    model_metrics['MAE'].append(mae)
    model_metrics['MSE'].append(mse)
    model_metrics['RMSE'].append(rmse)
    model_metrics['R2'].append(r2)
    model_metrics['Precision %'].append(precision_percentage)

    print(f"Model: {name}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2) Score: {r2}")
    print(f"Precision Percentage: {precision_percentage:.2f}%")
    print("-------------------")


Model: Random Forest
Mean Absolute Error (MAE): 8.225982158151977
Mean Squared Error (MSE): 244.70366458623707
Root Mean Squared Error (RMSE): 15.643006890819843
R-squared (R2) Score: 0.25130166294986844
Precision Percentage: 46.05%
-------------------
Model: Bagging
Mean Absolute Error (MAE): 8.860695909125605
Mean Squared Error (MSE): 273.9641803969284
Root Mean Squared Error (RMSE): 16.55186335120395
R-squared (R2) Score: 0.16177582946578017
Precision Percentage: 42.99%
-------------------
Model: Gradient Boosting
Mean Absolute Error (MAE): 7.998714769550899
Mean Squared Error (MSE): 245.36389728735955
Root Mean Squared Error (RMSE): 15.664095801780565
R-squared (R2) Score: 0.2492816068700694
Precision Percentage: 48.41%
-------------------
Model: Lasso
Mean Absolute Error (MAE): 8.844067673484197
Mean Squared Error (MSE): 259.1862439403083
Root Mean Squared Error (RMSE): 16.099262217266613
R-squared (R2) Score: 0.2069905853167484
Precision Percentage: 43.11%
-------------------
Mod

In [39]:
import matplotlib.pyplot as plt
from tabulate import tabulate

# Gather the per-model scores into a frame keyed by model name
df = pd.DataFrame(model_metrics).set_index('Model')

# Render the comparison as a plain-text table and print it
table = tabulate(df, tablefmt='simple', headers='keys')
print(table)

# No figure is created in this cell, so this call has nothing to display
plt.show()

Model                  MAE      MSE     RMSE        R2    Precision %
-----------------  -------  -------  -------  --------  -------------
Random Forest      8.22598  244.704  15.643   0.251302        46.0542
Bagging            8.8607   273.964  16.5519  0.161776        42.9918
Gradient Boosting  7.99871  245.364  15.6641  0.249282        48.4099
Lasso              8.84407  259.186  16.0993  0.206991        43.1095
KNN                9.36769  272.088  16.4951  0.167515        42.5206
