In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the raw transport dataset into a DataFrame. The path is relative,
# so it resolves against the notebook's current working directory.
RAW_DATA_PATH = "../data/raw/synthetic_transport_data.csv"
df = pd.read_csv(RAW_DATA_PATH)


In [3]:
# Predictor columns shared by both models; the crowding model later extends
# this list with two additional columns. Order is preserved because it fixes
# the column order the fitted models expect.
features = [
    "hour", "day_of_week", "is_weekend",
    "route_id", "stop_sequence", "distance_to_next_stop_km",
    "traffic_level", "rain_flag",
]

# ETA regression task: inputs are the shared predictors, target is the
# recorded travel-time column.
ETA_TARGET = "actual_travel_time_min"
X_eta, y_eta = df[features], df[ETA_TARGET]


In [4]:
# Map each crowding_level label to an integer class id. The fitted encoder
# stays in `le` so it can be saved and used to translate predictions back.
le = LabelEncoder()
crowding_codes = le.fit_transform(df["crowding_level"])
df["crowding_encoded"] = crowding_codes

# Crowding classification task: shared predictors plus two occupancy columns.
# NOTE(review): confirm crowding_level is not computed directly from
# passenger_count / bus_capacity; if it is, these two inputs leak the target.
crowd_features = [*features, "passenger_count", "bus_capacity"]
X_crowd, y_crowd = df[crowd_features], df["crowding_encoded"]


In [5]:
# Hold out 20% of the rows for evaluating the ETA regressor; the fixed seed
# makes the split reproducible across runs.
X_eta_train, X_eta_test, y_eta_train, y_eta_test = train_test_split(
    X_eta, y_eta, test_size=0.2, random_state=42
)

# Same 80/20 split for the crowding classifier, but stratified on the encoded
# label so train and test keep the same class proportions. Without this, a
# less frequent class can be under-represented in the test set and distort
# the reported accuracy.
X_crowd_train, X_crowd_test, y_crowd_train, y_crowd_test = train_test_split(
    X_crowd, y_crowd, test_size=0.2, random_state=42, stratify=y_crowd
)


In [6]:
# Train the ETA regressor on the training split. The seed is fixed so the
# forest is reproducible; fit() returns the estimator itself.
rf_eta = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_eta_train, y_eta_train
)

# Evaluate on the held-out rows using mean absolute error.
eta_preds = rf_eta.predict(X_eta_test)
eta_mae = mean_absolute_error(y_true=y_eta_test, y_pred=eta_preds)

print(f"Baseline ETA MAE: {eta_mae:.2f} minutes")


Baseline ETA MAE: 0.43 minutes


In [7]:
# Train the crowding classifier on the training split (seeded for
# reproducibility); fit() returns the estimator itself.
rf_crowd = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_crowd_train, y_crowd_train
)

# Evaluate on the held-out rows: fraction of exactly matching class ids.
# NOTE(review): the recorded run reports 100.00% accuracy. A perfect score
# usually means an input determines the label (X_crowd includes
# passenger_count and bus_capacity) -- verify before relying on this metric.
crowd_preds = rf_crowd.predict(X_crowd_test)
crowd_acc = accuracy_score(y_true=y_crowd_test, y_pred=crowd_preds)

print(f"Crowding Accuracy: {crowd_acc * 100:.2f}%")


Crowding Accuracy: 100.00%


In [8]:
# Persist both fitted models and the label encoder. The encoder is saved too
# because it is needed to map predicted class ids back to crowding labels.
MODEL_DIR = Path("../models")

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here after
# all the training work has already been done.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(rf_eta, MODEL_DIR / "eta_model.pkl")
joblib.dump(rf_crowd, MODEL_DIR / "crowding_model.pkl")
joblib.dump(le, MODEL_DIR / "crowding_encoder.pkl")


['../models/crowding_encoder.pkl']