In [1]:
import pandas as pd
# Quick look at the raw CSV (at most the first 1,000,000 rows); the frame is
# only displayed here and is not bound to a name.
pd.read_csv("../data/raw/train.csv", nrows=1_000_000)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.964630,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.782520,N,435
...,...,...,...,...,...,...,...,...,...,...,...
999995,id1514670,1,2016-02-07 10:09:55,2016-02-07 10:22:06,1,-73.978455,40.737133,-73.983627,40.693794,N,731
999996,id1598272,1,2016-03-01 09:58:24,2016-03-01 10:11:24,1,-73.959099,40.799408,-73.984138,40.770058,N,780
999997,id2854347,2,2016-01-13 22:36:51,2016-01-13 22:53:26,1,-74.003639,40.729366,-73.963127,40.774921,N,995
999998,id3045535,1,2016-03-24 22:35:54,2016-03-24 22:44:16,1,-73.989449,40.723061,-74.001213,40.717636,N,502


In [2]:
import os, time, json, math, random
import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Seed handed to the data splits, the subsampling generator and the model seeding below.
SEED = 42
# Upper bound on the number of CSV rows loaded into memory.
MAX_ROWS = 1_000_000

# Name of the column the models are trained to predict.
TARGET = "trip_duration"

DATA_PATH = "../data/raw/train.csv"  # notebook in notebooks/
# Output folder for saved metrics and tables; created up front so later writes
# do not fail on a missing directory.
ARTIFACTS_DIR = "../artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)


In [3]:
def seed_everything(seed=42):
    """Seed Python's, NumPy's and PyTorch's (CPU + CUDA) RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

seed_everything(SEED)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device


device(type='cuda')

In [4]:
# Load at most MAX_ROWS rows of the raw CSV; the shape is displayed as a sanity check.
df = pd.read_csv(DATA_PATH, nrows=MAX_ROWS)
df.shape


(1000000, 11)

In [5]:
# 50% holdout for final test (don't touch until final evaluation)
# NOTE(review): the output saved under this cell shows a 700,000 / 150,000 /
# 150,000 split, which does not match test_size=0.50 -- it looks stale;
# re-run the notebook to refresh it.
dev_df, test_df = train_test_split(df, test_size=0.50, random_state=SEED)
# Split dev into training (2/3) and validation (1/3)
train_df, val_df = train_test_split(dev_df, test_size=1/3, random_state=SEED)

# Report the row count of each split.
print(f"Train: {train_df.shape[0]:,}")
print(f"Val:   {val_df.shape[0]:,}")
print(f"Test:  {test_df.shape[0]:,} (holdout - do not use until final eval)")


((700000, 11), (150000, 11), (150000, 11))

In [6]:
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between points given in degrees.

    Operates element-wise, so scalars, NumPy arrays and pandas Series all work.

    Args:
        lat1, lon1: Latitude / longitude of the first point(s), in degrees.
        lat2, lon2: Latitude / longitude of the second point(s), in degrees.

    Returns:
        Distance(s) in km on a sphere of radius 6371 km, same shape as the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt/arcsin into NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * 6371.0 * np.arcsin(np.sqrt(a))

def build_features(df_in: pd.DataFrame) -> pd.DataFrame:
    """Derive the model's feature matrix from a raw trips frame.

    The input frame is not modified. The result holds, in order:
    passenger_count, pickup_hour, pickup_dow, pickup_month, is_weekend,
    is_rush_hour, haversine_km, store_and_fwd_flag_Y, followed by one
    ``vendor_<id>`` indicator column per vendor id present in ``df_in``.

    Args:
        df_in: Frame with pickup_datetime, pickup/dropoff latitude and
            longitude, passenger_count, store_and_fwd_flag and vendor_id columns.

    Returns:
        A new DataFrame sharing ``df_in``'s index.
    """
    pickup_ts = pd.to_datetime(df_in["pickup_datetime"])
    hour_of_day = pickup_ts.dt.hour
    day_of_week = pickup_ts.dt.dayofweek

    enriched = df_in.assign(
        # temporal
        pickup_hour=hour_of_day,
        pickup_dow=day_of_week,
        pickup_month=pickup_ts.dt.month,
        is_weekend=(day_of_week >= 5).astype(int),
        is_rush_hour=hour_of_day.isin([7,8,9,16,17,18]).astype(int),
        # straight-line pickup -> dropoff distance
        haversine_km=haversine_km(
            df_in["pickup_latitude"], df_in["pickup_longitude"],
            df_in["dropoff_latitude"], df_in["dropoff_longitude"]
        ),
        # context/proxies
        store_and_fwd_flag_Y=(df_in["store_and_fwd_flag"] == "Y").astype(int),
    )

    # vendor_id is an identifier, not a quantity: encode it as indicator columns.
    vendor_dummies = pd.get_dummies(
        enriched["vendor_id"].astype(str), prefix="vendor", drop_first=False
    )

    numeric_cols = [
        "passenger_count",
        "pickup_hour", "pickup_dow", "pickup_month",
        "is_weekend", "is_rush_hour",
        "haversine_km",
        "store_and_fwd_flag_Y",
    ]
    return pd.concat([enriched[numeric_cols], vendor_dummies], axis=1)


In [7]:
X_train = build_features(train_df)
# The vendor indicators are generated per split, so a vendor id missing from
# one split would silently change that split's column set/order. Align val and
# test to the training columns (absent indicators become 0) so the matrices
# taken from `.values` later are column-compatible.
X_val   = build_features(val_df).reindex(columns=X_train.columns, fill_value=0)
X_test  = build_features(test_df).reindex(columns=X_train.columns, fill_value=0)

# Raw targets as float64 arrays, one per split.
y_train = train_df[TARGET].values.astype(np.float64)
y_val   = val_df[TARGET].values.astype(np.float64)
y_test  = test_df[TARGET].values.astype(np.float64)

X_train.shape, X_val.shape, X_test.shape


((700000, 10), (150000, 10), (150000, 10))

In [8]:
# Cap trip distance at the training set's 99.9th percentile; the same cap is
# then applied to every split.
dcap = np.quantile(X_train["haversine_km"].to_numpy(), 0.999)
print("distance cap (km):", float(dcap))
for X in (X_train, X_val, X_test):
    X["haversine_km"] = np.minimum(X["haversine_km"].to_numpy(), dcap)


distance cap (km): 24.937536315636464


In [9]:
# Cap the target at the training 99.9th percentile, then move to log1p space.
ycap = np.quantile(y_train, 0.999)
print("target cap (sec):", float(ycap))

y_train_log, y_val_log, y_test_log = (
    np.log1p(np.minimum(durations, ycap))
    for durations in (y_train, y_val, y_test)
)

print("y_train_log min/max:", float(y_train_log.min()), float(y_train_log.max()))


target cap (sec): 85074.00100000005
y_train_log min/max: 0.6931471805599453 11.35128851112344


In [10]:
# Dense float64 copies of each split's feature frame.
X_train_np, X_val_np, X_test_np = (
    frame.values.astype(np.float64) for frame in (X_train, X_val, X_test)
)

# Standardisation statistics come from the training split only.
mu = X_train_np.mean(axis=0)
sigma = X_train_np.std(axis=0)
sigma = np.where(sigma == 0, 1.0, sigma)  # constant columns: avoid dividing by zero

X_train_s, X_val_s, X_test_s = (
    ((matrix - mu) / sigma).astype(np.float32)
    for matrix in (X_train_np, X_val_np, X_test_np)
)

print("Shapes:", X_train_s.shape, X_val_s.shape, X_test_s.shape)


Shapes: (700000, 10) (150000, 10) (150000, 10)


In [11]:
def rmse(y, yhat):
    """Root-mean-squared error between two array-likes (both flattened to 1-D first)."""
    residual = np.ravel(y) - np.ravel(yhat)
    return float(np.sqrt(np.mean(np.square(residual))))

def mae(y, yhat):
    """Mean absolute error between two array-likes (both flattened to 1-D first)."""
    residual = np.ravel(y) - np.ravel(yhat)
    return float(np.abs(residual).mean())

def mape(y, yhat, eps=1.0):
    """Mean absolute percentage error, as a fraction, with a floored denominator.

    Args:
        y: True values (array-like, flattened).
        yhat: Predicted values (array-like, flattened).
        eps: Lower bound applied to ``|y|`` so near-zero targets do not blow up the ratio.
    """
    actual = np.ravel(y)
    predicted = np.ravel(yhat)
    scale = np.maximum(np.abs(actual), eps)
    return float((np.abs(actual - predicted) / scale).mean())

def evaluate_seconds(y_sec, yhat_sec):
    """Return RMSE, MAE, R^2 and MAPE (keys ``rmse``, ``mae``, ``r2``, ``mape``) for the given pair."""
    return dict(
        rmse=rmse(y_sec, yhat_sec),
        mae=mae(y_sec, yhat_sec),
        r2=float(r2_score(y_sec, yhat_sec)),
        mape=mape(y_sec, yhat_sec, eps=1.0),
    )

def evaluate_log(y_log, yhat_log):
    """Return RMSE, MAE and R^2 (keys ``rmse_log``, ``mae_log``, ``r2_log``) for the given pair."""
    return dict(
        rmse_log=rmse(y_log, yhat_log),
        mae_log=mae(y_log, yhat_log),
        r2_log=float(r2_score(y_log, yhat_log)),
    )

def safe_expm1(yhat_log, clip_min=-2.0, clip_max=15.5):
    """Invert ``log1p`` after clipping the input to ``[clip_min, clip_max]``.

    Clipping bounds the exponent so extreme predictions cannot overflow.

    Args:
        yhat_log: Array-like of log1p-space values (flattened to 1-D).
        clip_min: Lower clip bound applied before ``expm1``.
        clip_max: Upper clip bound applied before ``expm1``.

    Returns:
        1-D array of ``expm1`` of the clipped values.
    """
    bounded = np.clip(np.ravel(yhat_log), clip_min, clip_max)
    return np.expm1(bounded)


In [12]:
# Linear baseline: ridge regression on the standardised features, fitted in log1p space.
ridge = Ridge(alpha=1.0, random_state=SEED).fit(X_train_s, y_train_log)
yhat_val_log = ridge.predict(X_val_s)
yhat_val_sec = safe_expm1(yhat_val_log)

print("baseline log metrics:", evaluate_log(y_val_log, yhat_val_log))
print("baseline seconds metrics:", evaluate_seconds(y_val, yhat_val_sec))


baseline log metrics: {'rmse_log': 0.6112620288860431, 'mae_log': 0.44563203766075854, 'r2_log': 0.4068655825887888}
baseline seconds metrics: {'rmse': 5905.979522690742, 'mae': 496.13852770589193, 'r2': 0.0006621689913735018, 'mape': 0.680442018083154}


In [13]:
# Persist the baseline's validation metrics next to the other artifacts.
baseline_results = dict(
    model="Ridge(alpha=1.0) on log1p(target)",
    val_log=evaluate_log(y_val_log, yhat_val_log),
    val_seconds=evaluate_seconds(y_val, yhat_val_sec),
)
baseline_path = os.path.join(ARTIFACTS_DIR, "phase1_baseline.json")
with open(baseline_path, "w") as f:
    f.write(json.dumps(baseline_results, indent=2))


In [14]:
class TabularDataset(Dataset):
    """In-memory dataset pairing a float32 feature tensor with a float32 column-vector target."""

    def __init__(self, X, y):
        features = np.asarray(X)
        targets = np.asarray(y)
        self.X = torch.tensor(features, dtype=torch.float32)
        # Targets are kept as shape (n, 1).
        self.y = torch.tensor(targets, dtype=torch.float32).reshape(-1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

batch_size = 4096
train_ds = TabularDataset(X_train_s, y_train_log)
val_ds = TabularDataset(X_val_s, y_val_log)
# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)


In [15]:
class MLP(nn.Module):
    """Fully connected regressor: (Linear -> ReLU -> Dropout) per hidden width, then Linear to 1 output.

    Args:
        d_in: Number of input features.
        hidden: Sequence of hidden-layer widths, in order.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(self, d_in, hidden=(256,128,64), dropout=0.1):
        super().__init__()
        widths = [d_in, *hidden]
        blocks = []
        # Consecutive width pairs define each hidden Linear layer.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)))
        blocks.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        return self.net(x)


In [19]:
def predict_torch(model, X_np, batch_size=4096):
    """Run ``model`` over ``X_np`` in mini-batches and return flat NumPy predictions.

    The model is switched to eval mode (and left there) and gradients are
    disabled. Batches are moved to the device the model's parameters live on,
    so the function does not depend on a module-level ``device`` variable.

    Args:
        model: ``nn.Module`` mapping a ``(batch, d_in)`` float32 tensor to predictions.
        X_np: Array-like feature matrix; converted to a float32 tensor.
        batch_size: Number of rows per forward pass.

    Returns:
        1-D ``np.ndarray`` with the model outputs for every row, in input
        order; an empty float32 array when ``X_np`` has no rows.
    """
    model.eval()
    features = torch.as_tensor(np.asarray(X_np), dtype=torch.float32)
    if features.shape[0] == 0:
        # np.vstack([]) would raise; an empty input simply has no predictions.
        return np.empty(0, dtype=np.float32)

    # Follow the model's own device; parameter-free modules run on the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    preds = []
    with torch.no_grad():
        # Slice the tensor directly instead of building a Dataset with dummy targets.
        for start in range(0, features.shape[0], batch_size):
            xb = features[start:start + batch_size].to(target_device)
            preds.append(model(xb).cpu().numpy())
    return np.vstack(preds).reshape(-1)

def train_mlp_fast(train_loader, X_val_np, y_val_log_np,
                   hidden=(256,128,64), dropout=0.1, lr=1e-3, weight_decay=1e-4,
                   epochs=15, patience=2):
    """Train an ``MLP`` with AdamW + MSE loss and early stopping on validation RMSE.

    After every epoch the model is scored on ``(X_val_np, y_val_log_np)``; the
    weights of the best-scoring epoch are restored before returning.

    Args:
        train_loader: DataLoader whose dataset exposes ``.X`` (used to infer the input width).
        X_val_np: Validation feature matrix passed to ``predict_torch``.
        y_val_log_np: Validation targets, on the same scale as the training targets.
        hidden: Hidden-layer widths for the MLP.
        dropout: Dropout probability for the MLP.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without a new best RMSE.

    Returns:
        ``(model, history)`` where ``history`` is a list of per-epoch dicts
        containing ``epoch`` plus the ``evaluate_log`` metrics.
    """
    model = MLP(train_loader.dataset.X.shape[1], hidden=hidden, dropout=dropout).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.MSELoss()

    best_rmse = float("inf")
    best_state = None
    bad = 0
    history = []

    for ep in range(1, epochs+1):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Score on whatever validation data the caller supplied (subset or full).
        yhat_val_log = predict_torch(model, X_val_np)
        met = evaluate_log(y_val_log_np, yhat_val_log)
        history.append({"epoch": ep, **met})

        if met["rmse_log"] < best_rmse:
            best_rmse = met["rmse_log"]
            # copy=True forces a real snapshot. A plain `.cpu()` returns the
            # same tensor when the model already lives on the CPU, so the
            # "best" weights would keep changing as training continues.
            best_state = {
                k: v.detach().to("cpu", copy=True)
                for k, v in model.state_dict().items()
            }
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, history



In [20]:
tune_train_n = 150_000
tune_val_n   = 30_000

# One generator drives both draws, so the order of the two `choice` calls is
# part of the reproducible state.
rng = np.random.default_rng(SEED)
train_sub_idx = rng.choice(X_train_s.shape[0], size=tune_train_n, replace=False)
val_sub_idx   = rng.choice(X_val_s.shape[0],   size=tune_val_n,   replace=False)

X_train_t, y_train_t = X_train_s[train_sub_idx], y_train_log[train_sub_idx]
X_val_t, y_val_t = X_val_s[val_sub_idx], y_val_log[val_sub_idx]

tune_train_ds = TabularDataset(X_train_t, y_train_t)
tune_train_loader = DataLoader(tune_train_ds, batch_size=4096, shuffle=True)


In [21]:
# Full grid: 3 architectures x 2 learning rates x 2 dropout rates.
search_space = [
    (hidden, lr, dropout)
    for hidden in [(128,64), (256,128), (256,128,64)]
    for lr in [1e-3, 5e-4]
    for dropout in [0.0, 0.1]
]

trials = search_space[:12]  # fixed tuning budget = 12 trials

all_trials = []
best_trial = None
best_params = None

t0 = time.time()
for i, (hidden, lr, dropout) in enumerate(trials, 1):
    # Re-seed before each trial so every configuration starts from the same RNG state.
    seed_everything(SEED)
    params = {"hidden": hidden, "lr": lr, "dropout": dropout, "weight_decay": 1e-4}
    model, hist = train_mlp_fast(
        tune_train_loader, X_val_t, y_val_t,
        epochs=15, patience=2, **params
    )
    # Summarise the trial by its best validation epoch.
    best_ep = min(hist, key=lambda r: r["rmse_log"])
    row = {"trial": i, **params, **best_ep}
    all_trials.append(row)
    print(row)

    improved = best_trial is None or row["rmse_log"] < best_trial["rmse_log"]
    if improved:
        best_trial, best_params = row, params

print("Tuning time (sec):", time.time() - t0)
best_trial


{'trial': 1, 'hidden': (128, 64), 'lr': 0.001, 'dropout': 0.0, 'weight_decay': 0.0001, 'epoch': 15, 'rmse_log': 0.5652984917510551, 'mae_log': 0.4051719071502529, 'r2_log': 0.5005283598237229}
{'trial': 2, 'hidden': (128, 64), 'lr': 0.001, 'dropout': 0.1, 'weight_decay': 0.0001, 'epoch': 15, 'rmse_log': 0.5975051755520981, 'mae_log': 0.4344461584442662, 'r2_log': 0.44199444342102}
{'trial': 3, 'hidden': (128, 64), 'lr': 0.0005, 'dropout': 0.0, 'weight_decay': 0.0001, 'epoch': 15, 'rmse_log': 0.6071361319515244, 'mae_log': 0.44283188586646194, 'r2_log': 0.42386091369253576}
{'trial': 4, 'hidden': (128, 64), 'lr': 0.0005, 'dropout': 0.1, 'weight_decay': 0.0001, 'epoch': 15, 'rmse_log': 0.7232411600127382, 'mae_log': 0.5459344114049631, 'r2_log': 0.1824365853630443}
{'trial': 5, 'hidden': (256, 128), 'lr': 0.001, 'dropout': 0.0, 'weight_decay': 0.0001, 'epoch': 15, 'rmse_log': 0.517238687882326, 'mae_log': 0.3601519906850233, 'r2_log': 0.5818451066610444}
{'trial': 6, 'hidden': (256, 128)

{'trial': 9,
 'hidden': (256, 128, 64),
 'lr': 0.001,
 'dropout': 0.0,
 'weight_decay': 0.0001,
 'epoch': 15,
 'rmse_log': 0.503798494198854,
 'mae_log': 0.3494053698876185,
 'r2_log': 0.6032938704089186}

In [22]:
# One row per tuning trial, written alongside the other artifacts.
tuning_csv_path = os.path.join(ARTIFACTS_DIR, "phase1_nn_tuning_fast.csv")
pd.DataFrame(all_trials).to_csv(tuning_csv_path, index=False)
print("Saved:", tuning_csv_path)


Saved: ../artifacts/phase1_nn_tuning_fast.csv


In [23]:
# Re-seed so the final fit does not depend on RNG state left over from tuning.
seed_everything(SEED)

# Retrain the selected configuration on the full training loader, early-stopping
# on the full validation split with a longer epoch budget and patience than tuning.
best_nn, final_hist = train_mlp_fast(
    train_loader, X_val_s, y_val_log,
    **best_params,
    epochs=30, patience=4
)

print("Best params:", best_params)
# The returned model carries the weights of this best-RMSE epoch.
print("Best val epoch:", min(final_hist, key=lambda r: r["rmse_log"]))


Best params: {'hidden': (256, 128, 64), 'lr': 0.001, 'dropout': 0.0, 'weight_decay': 0.0001}
Best val epoch: {'epoch': 20, 'rmse_log': 0.46738611429825283, 'mae_log': 0.3193730283431698, 'r2_log': 0.6532233079968266}


In [24]:
# Score the final network on the held-out test split, in log space and in seconds.
yhat_test_log = predict_torch(best_nn, X_test_s)
yhat_test_sec = safe_expm1(yhat_test_log)

nn_test_log = evaluate_log(y_test_log, yhat_test_log)
nn_test_sec = evaluate_seconds(y_test, yhat_test_sec)

nn_results = dict(
    best_trial_subset=best_trial,
    best_params=best_params,
    test_log=nn_test_log,
    test_seconds=nn_test_sec,
)
nn_results


{'best_trial_subset': {'trial': 9,
  'hidden': (256, 128, 64),
  'lr': 0.001,
  'dropout': 0.0,
  'weight_decay': 0.0001,
  'epoch': 15,
  'rmse_log': 0.503798494198854,
  'mae_log': 0.3494053698876185,
  'r2_log': 0.6032938704089186},
 'best_params': {'hidden': (256, 128, 64),
  'lr': 0.001,
  'dropout': 0.0,
  'weight_decay': 0.0001},
 'test_log': {'rmse_log': 0.4725317654069119,
  'mae_log': 0.32128089691306994,
  'r2_log': 0.6468673247985999},
 'test_seconds': {'rmse': 3179.9919037262616,
  'mae': 348.60575570164997,
  'r2': 0.02848620113102296,
  'mape': 0.36165966574396136}}

In [25]:
# Persist the network's tuning summary and test metrics as JSON.
nn_results_path = os.path.join(ARTIFACTS_DIR, "phase1_nn_results.json")
with open(nn_results_path, "w") as f:
    f.write(json.dumps(nn_results, indent=2))
print("Saved:", nn_results_path)


Saved: ../artifacts/phase1_nn_results.json


In [26]:
# Baseline test (one-time): score the ridge model on the held-out split in seconds.
yhat_test_log_base = ridge.predict(X_test_s)
yhat_test_sec_base = safe_expm1(yhat_test_log_base)
base_test_sec = evaluate_seconds(y_test, yhat_test_sec_base)

# NN test metrics were already computed as nn_test_sec; put both models side by side.
compare_rows = [
    dict(model="Baseline_Ridge", **base_test_sec),
    dict(model="MLP_NN", **nn_test_sec),
]
compare = pd.DataFrame(compare_rows)
compare


Unnamed: 0,model,rmse,mae,r2,mape
0,Baseline_Ridge,3238.718724,488.986874,-0.007728,0.676567
1,MLP_NN,3179.991904,348.605756,0.028486,0.36166


In [27]:
# Save the side-by-side test metrics table.
compare_csv_path = os.path.join(ARTIFACTS_DIR, "phase1_compare_table.csv")
compare.to_csv(compare_csv_path, index=False)
print("Saved:", compare_csv_path)


Saved: ../artifacts/phase1_compare_table.csv
