# ETA Baseline Model — Santiago Electric Bus Tracker

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
# Location of the sample bus records, given relative to the working directory.
DATA = Path("../data/sample_buses.csv")

# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA)
df.head(n=5)

In [None]:
import math
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometres.

    Applies the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance along the sphere's surface in kilometres. NaN inputs
        propagate to a NaN result.
    """
    earth_radius_km = 6371.0
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = (
        math.sin(dphi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2
    )
    # Rounding error can leave `a` a hair above 1 for (near-)antipodal points,
    # which would make asin raise ValueError. The comparison is False for NaN,
    # so missing coordinates still yield NaN rather than a bogus distance.
    if a > 1.0:
        a = 1.0
    return 2 * earth_radius_km * math.asin(math.sqrt(a))

# Distance from each bus's current position to its next stop, computed row by row.
df["haversine_km"] = df.apply(
    lambda row: haversine(
        row["latitude"],
        row["longitude"],
        row["next_stop_lat"],
        row["next_stop_lon"],
    ),
    axis=1,
)

# Preview the columns relevant to the ETA model.
df[
    [
        "bus_id",
        "route",
        "haversine_km",
        "speed_kph",
        "traffic_index",
        "eta_min",
    ]
].head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import pandas as pd, numpy as np

# Feature matrix: three numeric predictors plus one-hot encoded route.
X = df[["haversine_km", "speed_kph", "traffic_index"]].copy()
X = pd.concat([X, pd.get_dummies(df["route"], prefix="route")], axis=1)
# Target: ETA in minutes, as a plain NumPy array.
y = df["eta_min"].values

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)

# MAPE is undefined where the actual ETA is 0 (division by zero would turn the
# whole metric into inf/nan), so it is computed over non-zero actuals only.
# When no actual is zero this is identical to the plain MAPE.
nonzero = y_test != 0
if nonzero.any():
    mape = np.abs((y_test[nonzero] - pred[nonzero]) / y_test[nonzero]).mean() * 100
else:
    mape = float("nan")
print("MAE (min):", round(mae, 2))
print("MAPE (%):", round(mape, 2))

In [None]:
# Scatter of predictions against actual ETAs for the held-out rows.
plt.figure()
plt.scatter(y_test, pred, alpha=0.7)
plt.xlabel("Actual ETA (min)")
plt.ylabel("Predicted ETA (min)")
plt.title("Predicted vs Actual ETA")
plt.tight_layout()
plt.show()

# Histogram of residuals (actual minus predicted) in 15 bins.
errors = y_test - pred
plt.figure()
plt.hist(errors, bins=15)
plt.xlabel("Error (min)")
plt.ylabel("Frequency")
plt.title("Error Distribution")
plt.tight_layout()
plt.show()