In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw train-delay dataset (path is relative to the notebook's working directory).
df = pd.read_csv("../data/raw/train_delay.csv")

# Regression target: the delay_minutes column.
y = df['delay_minutes']

# Features: every remaining column except the target and train_number
# (presumably an identifier with no predictive meaning — confirm).
X = df.drop(columns=['delay_minutes', 'train_number'])

In [3]:
# Feature groups routed through the ColumnTransformer below.
categorical_cols = ['train_type', 'source', 'destination', 'day_of_week']
numerical_cols = ['distance_km', 'month']

# One-hot encoder for the categorical group:
#   drop='first'            -> one indicator column per feature is omitted
#   handle_unknown='ignore' -> transform does not raise on categories unseen during fit
# NOTE(review): per the OneHotEncoder docs, with both options set an unseen category is
# encoded as all zeros, i.e. the same as the dropped first category — confirm this is
# acceptable for unseen sources/destinations.
categorical_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Numeric columns are passed through unchanged (no scaling). No `remainder` is given,
# so the ColumnTransformer default applies to columns of X not named in either list.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_cols),
        ('num', 'passthrough', numerical_cols),
    ]
)

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# Baseline model: shared preprocessing followed by ordinary least squares.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Without
# caching, Pipeline.fit fits the step objects it was given in place, so passing the
# shared `preprocessor` instance would make every pipeline point at the same fitted
# transformer, and refitting one pipeline would silently change the others.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', LinearRegression())
])

lr_pipeline.fit(X_train, y_train)

# Score on the held-out split only.
y_pred_lr = lr_pipeline.predict(X_test)

# RMSE is expressed in the target's units (delay_minutes); R2 is unitless.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

rmse_lr, r2_lr

(np.float64(15.437058412108176), 0.12648525133934396)

Linear Regression serves as the baseline model: on the held-out test set it reaches an RMSE of about 15.4 minutes and an R² of about 0.13.

In [7]:
from sklearn.tree import DecisionTreeRegressor

In [8]:
# Single decision tree on the same preprocessing; random_state=42 keeps the fitted
# tree reproducible across runs.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Without
# caching, Pipeline.fit fits the step objects it was given in place, so passing the
# shared `preprocessor` instance would make every pipeline point at the same fitted
# transformer, and refitting one pipeline would silently change the others.
dt_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', DecisionTreeRegressor(random_state=42))
])

dt_pipeline.fit(X_train, y_train)

# Score on the held-out split only.
y_pred_dt = dt_pipeline.predict(X_test)

# RMSE is expressed in the target's units (delay_minutes); R2 is unitless.
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
r2_dt = r2_score(y_test, y_pred_dt)

rmse_dt, r2_dt

(np.float64(14.286707108357755), 0.2518211452623831)

The Decision Tree can capture non-linear relationships and improves on the linear baseline (RMSE ≈ 14.3 vs. 15.4 minutes; R² ≈ 0.25 vs. 0.13).

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
# Random forest of 100 trees on the same preprocessing; random_state=42 keeps the
# fitted ensemble reproducible across runs.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Without
# caching, Pipeline.fit fits the step objects it was given in place, so passing the
# shared `preprocessor` instance would tie this pipeline (the one persisted to disk
# later) to a transformer object that other pipelines can refit.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestRegressor(
        n_estimators=100,
        random_state=42
    ))
])

rf_pipeline.fit(X_train, y_train)

# Score on the held-out split only.
y_pred_rf = rf_pipeline.predict(X_test)

# RMSE is expressed in the target's units (delay_minutes); R2 is unitless.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

rmse_rf, r2_rf

(np.float64(11.001121079235515), 0.5563759969883703)

In [11]:
# Side-by-side comparison of the held-out metrics, one row per model.
results = pd.DataFrame(
    [
        ('Linear Regression', rmse_lr, r2_lr),
        ('Decision Tree', rmse_dt, r2_dt),
        ('Random Forest', rmse_rf, r2_rf),
    ],
    columns=['Model', 'RMSE', 'R2 Score'],
)

results

Unnamed: 0,Model,RMSE,R2 Score
0,Linear Regression,15.437058,0.126485
1,Decision Tree,14.286707,0.251821
2,Random Forest,11.001121,0.556376


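Random Forest outperformed the other models (RMSE ≈ 11.0 minutes, R² ≈ 0.56), most likely because averaging many decision trees reduces the variance of a single tree while still capturing non-linear effects and feature interactions. These scores come from a single 80/20 split, so the ranking should be confirmed with cross-validation.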

In [12]:
import pickle
from pathlib import Path

# Persist the fitted Random Forest pipeline (preprocessing + model) for later inference.
# NOTE: unpickling can execute arbitrary code — only load this artifact from a
# trusted location.
model_path = Path("../Models/delay_prediction_model.pkl")

# Opening a file for writing raises FileNotFoundError when the target directory is
# missing (e.g. on a fresh checkout), so create it first.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as f:
    pickle.dump(rf_pipeline, f)