In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib

# --- Data -------------------------------------------------------------------
# Read the prepared CSV and pick out the model inputs and the regression target.
df = pd.read_csv("cleaned_dataset.csv")

# Feature columns, grouped by how the preprocessor treats them.
categorical = ["departure_station", "arrival_station"]
numeric = ["month", "year"]

x = df[categorical + numeric]
y = df["average_delay_of_all_trains_at_arrival"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# Station names are one-hot encoded (handle_unknown="ignore" so that stations
# absent from the training rows do not raise at prediction time); month and
# year have missing entries replaced by the column mean.
station_encoder = OneHotEncoder(handle_unknown="ignore")
numeric_imputer = SimpleImputer(strategy="mean")

# NOTE: every selected feature column is already routed to one of the two
# transformers, so remainder="passthrough" has nothing left to forward here.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", station_encoder, categorical),
        ("num", numeric_imputer, numeric),
    ],
    remainder="passthrough",
)

# Chain preprocessing and a 100-tree random forest into a single estimator so
# the same transformations are applied at fit and predict time.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", forest),
    ]
)

# --- Fit and evaluate -------------------------------------------------------
model.fit(X_train, y_train)
pred = model.predict(X_test)

# RMSE is the square root of the mean squared error on the held-out rows.
mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, pred)

print("RMSE:", rmse)
print("R²:", r2)

# --- Persist ----------------------------------------------------------------
# Serialize the whole fitted pipeline (preprocessing + forest) to disk.
joblib.dump(model, "delay_predictor.pkl")

RMSE: 13.88096960373666
R²: -0.0969490620622413


['delay_predictor.pkl']