In [3]:
import pandas as pd
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

script_dir = os.path.abspath('')
relative_path = '../dataset/regularite-mensuelle-tgv-aqst.csv'
csv_path = os.path.join(script_dir, relative_path)
data = pd.read_csv(csv_path, delimiter=";")

# Convert the 'date' column to a datetime object (if not already)
data['date'] = pd.to_datetime(data['date'])

# Split the data into a training set and a test set (last six months as test set)
data = data.sort_values(by='date')
train_data = data[data['date'] < data['date'].max() - pd.DateOffset(months=6)]
test_data = data[data['date'] >= data['date'].max() - pd.DateOffset(months=6)]

# Select features for training and testing
features = ['nb_train_retard_arrivee', 'prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic', 'prct_cause_materiel_roulant', 'prct_cause_gestion_gare', 'prct_cause_prise_en_charge_voyageurs']

X_train = train_data[features]
X_test = test_data[features]

y_train = train_data['retard_moyen_arrivee']
y_test = test_data['retard_moyen_arrivee']

# Train an SVM regression model
svm_model = SVR(kernel='linear')  # You can choose a different kernel if needed
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2) Score: {r2}")

Mean Squared Error (MSE): 354.15143398727434
R-squared (R2) Score: -0.08356607629285495


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

script_dir = os.path.abspath('')
relative_path = '../dataset/regularite-mensuelle-tgv-aqst.csv'
csv_path = os.path.join(script_dir, relative_path)
data = pd.read_csv(csv_path, delimiter=";")

# Convert the 'date' column to a datetime object
data['date'] = pd.to_datetime(data['date'])

# Split the data into a training set and a test set (last six months as test set)
data = data.sort_values(by='date')
train_data = data[data['date'] < data['date'].max() - pd.DateOffset(months=6)]
test_data = data[data['date'] >= data['date'].max() - pd.DateOffset(months=6)]

# The features on which we train the models
features = ['nb_train_retard_arrivee', 'prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic', 'prct_cause_materiel_roulant', 'prct_cause_gestion_gare', 'prct_cause_prise_en_charge_voyageurs']

X_train = train_data[features]
X_test = test_data[features]

y_train = train_data['retard_moyen_arrivee']
y_test = test_data['retard_moyen_arrivee']

# The models that we try
svm_model = SVR(kernel='linear')
rf_model = RandomForestRegressor()
bagging_model = BaggingRegressor()
dt_model = DecisionTreeRegressor()
models = [svm_model, rf_model, bagging_model, dt_model]
model_names = ['SVM', 'Random Forest', 'Bagging', 'Decision Tree']
model_metrics = {
    'Model': model_names,
    'MAE': [],
    'MSE': [],
    'RMSE': [],
    'R2': [],
    'Precision %': []
}

# Evaluate and compare the models
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    threshold = 5 # precision for precision calcul
    correct_predictions = [abs(pred - true) <= threshold for pred, true in zip(y_pred, y_test)]
    precision_percentage = (sum(correct_predictions) / len(correct_predictions)) * 100

    model_metrics['MAE'].append(mae)
    model_metrics['MSE'].append(mse)
    model_metrics['RMSE'].append(rmse)
    model_metrics['R2'].append(r2)
    model_metrics['Precision %'].append(precision_percentage)

    print(f"Model: {name}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2) Score: {r2}")
    print(f"Precision Percentage: {precision_percentage:.2f}%")
    print("-------------------")


Model: SVM
Mean Absolute Error (MAE): 11.384179264754508
Mean Squared Error (MSE): 354.15143398727434
Root Mean Squared Error (RMSE): 18.818911604746816
R-squared (R2) Score: -0.08356607629285495
Precision Percentage: 33.22%
-------------------
Model: Random Forest
Mean Absolute Error (MAE): 10.784436481011696
Mean Squared Error (MSE): 320.9446901770438
Root Mean Squared Error (RMSE): 17.914929254034018
R-squared (R2) Score: 0.018033684837030717
Precision Percentage: 32.74%
-------------------
Model: Bagging
Mean Absolute Error (MAE): 11.24891573801107
Mean Squared Error (MSE): 335.5573749755851
Root Mean Squared Error (RMSE): 18.318225213584014
R-squared (R2) Score: -0.02667546501164275
Precision Percentage: 32.04%
-------------------
Model: Decision Tree
Mean Absolute Error (MAE): 13.99470156893508
Mean Squared Error (MSE): 438.82099688586857
Root Mean Squared Error (RMSE): 20.948054727966237
R-squared (R2) Score: -0.34262211065231907
Precision Percentage: 24.85%
-------------------


In [15]:
import matplotlib.pyplot as plt
from tabulate import tabulate

df = pd.DataFrame(model_metrics)

df.set_index('Model', inplace=True)

table = tabulate(df, headers='keys', tablefmt='simple')

print(table)

plt.show()


Model              MAE      MSE     RMSE          R2    Precision %
-------------  -------  -------  -------  ----------  -------------
SVM            11.3842  354.151  18.8189  -0.0835661        33.2155
Random Forest  10.7844  320.945  17.9149   0.0180337        32.7444
Bagging        11.2489  335.557  18.3182  -0.0266755        32.0377
Decision Tree  13.9947  438.821  20.9481  -0.342622         24.8528
