# Tardis Model Prediction

## Imports

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV

## Load the cleaned data

In [None]:
# Load the cleaned dataset from the working directory.
df = pd.read_csv("cleaned_dataset.csv")

# Columns named "Unnamed: ..." look like leftover row indexes from earlier CSV
# exports. They carry no signal, and as numeric columns they would otherwise
# appear in later summaries such as `df.corr(numeric_only=True)`.
df = df.drop(columns=df.columns[df.columns.str.startswith("Unnamed:")])

df.head()

Unnamed: 0.1,Unnamed: 0,Date,Service,Departure station,Arrival station,Average journey time,Number of scheduled trains,Number of cancelled trains,Number of trains delayed at departure,Average delay of late trains at departure,...,Number of trains delayed > 30min,Number of trains delayed > 60min,Pct delay due to external causes,Pct delay due to infrastructure,Pct delay due to traffic management,Pct delay due to rolling stock,Pct delay due to station management and equipment reuse,"Pct delay due to passenger handling (crowding, disabled persons, connections)",Average delay late departure in Hour,Average delay departure in Hour
0,0,2018-01,National,Bordeaux-St-Jean,Paris-Montparnasse,141.0,,5.0,289.0,11.247809,...,44.0,8.0,36.134454,31.092437,10.92437,15.966387,5.042017,75.91573,00:11,00:03
1,1,2018-01,National,La Rochelle-Ville,Paris-Montparnasse,165.0,222.0,,8.0,2.875,...,5.0,,15.384615,30.769231,38.461538,11.538462,3.846154,0.0,00:02,00:00
2,2,2018-01,National,Paris-Montparnasse,Quimper,220.0,248.0,1.0,37.0,9.501351,...,17.0,7.0,26.923077,38.461538,,19.230769,0.0,0.0,00:09,
3,3,2018-01,National,Paris-Montparnasse,St-Malo,156.0,102.0,0.0,12.0,19.9125,...,6.0,4.0,23.076923,218.650888,7.692308,15.384615,7.692308,,00:19,00:01
4,4,2018-01,National,Paris-Montparnasse,St-Pierre-des-Corps,61.0,391.0,2.0,61.0,,...,6.0,0.0,21.212121,42.424242,9.090909,21.212121,6.060606,0.0,,00:00


## Selecting relevant features for the model

#### The model is a Random Forest regressor. It uses the departure and arrival stations, the average journey time, the number of scheduled trains, the day of the week, and the two departure-delay columns as features.

#### Encoding station names as integers for easier learning

In [None]:
# Station columns whose names are replaced by integer codes.
labels_cols = ["Departure station", "Arrival station"]

# Keep one fitted encoder per column. Without this, each encoder is overwritten
# on the next loop iteration and the name -> integer mapping cannot be
# reproduced for new inputs or inverted for reporting. Persist this dict
# together with the trained model if predictions must be made from raw
# station names.
station_encoders = {}
for col in labels_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    station_encoders[col] = le

##### Defining features and the target

In [None]:
# Keep only the rows that have a value for the regression target.
df = df.dropna(subset=["Average delay of all trains at departure"])

# Parse the "YYYY-MM" strings into timestamps. Values that do not match the
# format become NaT (errors="coerce") and the corresponding rows are removed.
df = df.assign(Date=pd.to_datetime(df["Date"], format="%Y-%m", errors="coerce"))
df = df.dropna(subset=["Date"])

# NOTE(review): "Date" only carries year and month, so the parsed timestamp is
# presumably the first day of that month and `day_of_week` is the weekday of
# that day, not of any actual journey -- confirm this feature is intended.
df = df.assign(day_of_week=df["Date"].dt.day_name())

# Feature matrix: station codes, journey statistics, weekday name and the two
# "HH:MM" departure-delay columns.
X = df.loc[
    :,
    [
        "Departure station",
        "Arrival station",
        "Average journey time",
        "Number of scheduled trains",
        "day_of_week",
        "Average delay departure in Hour",
        "Average delay late departure in Hour",
    ],
]

# Regression target.
y = df.loc[:, "Average delay of all trains at departure"]

### Splitting the dataset

In [None]:
# 80/20 split with a fixed seed. The training cell below rebuilds X and y and
# performs this split again, so these four variables are overwritten there.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training the model

### Random Forest training

In [None]:
# Columns that the pipeline one-hot encodes.
categorical_features = [
    "Departure station",
    "Arrival station",
    "day_of_week",
]

# Columns handed to the regressor without encoding. The two "... in Hour"
# columns still hold "HH:MM" strings here and are converted to minutes below.
# NOTE(review): "Average delay departure in Hour" looks like the target
# ("Average delay of all trains at departure") re-expressed as HH:MM. If so,
# it leaks the target into the features and inflates R² -- confirm before
# trusting the reported score.
numerical_features = [
    "Average journey time",
    "Number of scheduled trains",
    "Average delay departure in Hour",
    "Average delay late departure in Hour",
]

# Work on independent copies so the conversions below do not touch `df`.
X = df.loc[:, [*categorical_features, *numerical_features]].copy()
y = df.loc[:, "Average delay of all trains at departure"].copy()

def time_to_minutes(t):
    """Convert an "HH:MM" duration string into a number of minutes.

    Parameters
    ----------
    t : object
        Value to convert. Only strings containing ":" are parsed.

    Returns
    -------
    int or float
        ``hours * 60 + minutes`` as an int for a well-formed string. A leading
        "-" negates the whole duration, so "-00:05" is -5 and "-01:30" is -90.
        ``np.nan`` is returned for non-strings, strings without ":", and
        strings that cannot be parsed as exactly two integers.
    """
    if not isinstance(t, str) or ":" not in t:
        return np.nan

    text = t.strip()
    # Take the sign from the whole string: int("-00") is 0, so a sign carried
    # only by the hour field would be lost for durations under one hour, and
    # would be applied to the hours alone otherwise.
    sign = -1 if text.startswith("-") else 1
    try:
        h, m = (int(part) for part in text.lstrip("+-").split(":"))
    except ValueError:
        # Wrong number of fields or non-numeric content: treat as missing,
        # like every other unparsable input.
        return np.nan
    return sign * (h * 60 + m)

# Convert the "HH:MM" delay strings into minutes. Direct column assignment is
# used instead of `X.loc[:, col] = ...`: with pandas >= 2.0 the `.loc` form
# writes the new values into the existing object-dtype column, so the features
# would stay non-numeric. The explicit cast guarantees a float column, with
# NaN for values that could not be parsed.
for delay_col in ("Average delay departure in Hour", "Average delay late departure in Hour"):
    X[delay_col] = X[delay_col].apply(time_to_minutes).astype(float)

# Keep only the rows where every feature and the target are present.
valid_idx = X.dropna().index.intersection(y.dropna().index)
X = X.loc[valid_idx]
y = y.loc[valid_idx]

# One-hot encode the categorical columns; categories not seen during fitting
# are ignored at prediction time. All other columns are passed through as-is.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ],
    remainder="passthrough"
)

# Random forest hyperparameters (noted as the best ones found; the search
# itself is not run in this cell).
best_params = {
    "n_estimators": 200,
    "max_depth": None,
    "min_samples_split": 5,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "bootstrap": True,
    "random_state": 42
}

# Preprocessing and regressor are bundled so they are fitted and saved together.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(**best_params))
])

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics.
print("Random Forest R²:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

Random Forest R²: 0.940873616251037
RMSE: 1.7633987691573132


#### Evaluation of the model

In [None]:
# Recompute the hold-out metrics from `y_test` and `y_pred`, both produced by
# the training cell above.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Random Forest R²: {r2:.4f}", f"RMSE: {rmse:.4f}", sep="\n")

# Correlation of every numeric column of `df` with the target, largest first.
corr_target = (
    df.corr(numeric_only=True)
    .loc[:, "Average delay of all trains at departure"]
    .sort_values(ascending=False)
)
print("\nCorrelation with Average Delay of All Trains at Departure:")
print(corr_target)

Random Forest R²: 0.9409
RMSE: 1.7634

Correlation with Average Delay of All Trains at Departure:
Average delay of all trains at departure                                         1.000000
Average delay of late trains at departure                                        0.116762
Average delay of all trains at arrival                                           0.084140
Number of trains delayed at departure                                            0.059871
Number of trains delayed > 60min                                                 0.042031
Number of trains delayed > 30min                                                 0.042029
Arrival station                                                                  0.039493
Number of trains delayed > 15min                                                 0.039463
Number of trains delayed at arrival                                              0.026829
Average journey time                                                             0.018529
Av

## Saving the model

In [None]:
# Persist the fitted pipeline (preprocessor + regressor) to disk.
# The pipeline was fitted on integer-encoded station columns, so callers must
# apply the same encoding to station names before calling `predict`.
joblib.dump(value=model, filename="model.pkl")

['model.pkl']