# Notebook: Single‐Pick Baseline Route‐Choice Heuristics

Implement four simple, single‐pick heuristics as lower‐bound benchmarks.  Each journey (`Obs_ID`) gets exactly **one** predicted route:

- **Baseline A**: Fastest travel‐time (random tie‐break)  
- **Baseline B**: Fastest, then fewest transfers (random tie‐break)  
- **Baseline C**: Fewest transfers only (random tie‐break)  
- **Baseline D**: Fewest transfers, then fastest time (random tie‐break)

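We’ll compute **choice‐set accuracy** — the share of journeys whose single predicted route is the one the traveller actually chose — for each baseline, so the results can be compared fairly against our Random Forest.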


In [None]:
import pandas as pd
import numpy as np

# Read the feature table; the cells below rely on the columns
# 'Obs_ID', 'TT_total', 'transfers' and 'choice' being present.
df = pd.read_parquet('features_final.parquet')

# Sanity check: list every column, then preview the four key ones
print("Columns:", df.columns.tolist())
print(df.loc[:, ['Obs_ID', 'TT_total', 'transfers', 'choice']].head())


# Baseline A – Fastest‐Only



In [None]:
def choice_set_accuracy(df, pred_col):
    """
    Computes the fraction of journeys where the single predicted
    alternative (pred_col == 1) matches the true choice.

    Rows are grouped by 'Obs_ID'. Within each journey the first row with
    choice == 1 is taken as the true choice, and the journey counts as a
    hit when that row's `pred_col` value is truthy.

    Parameters
    ----------
    df : DataFrame with columns 'Obs_ID', 'choice' and `pred_col`.
    pred_col : name of the 0/1 prediction column to score.

    Returns
    -------
    float
        hits / number of journeys, or NaN when `df` contains no journeys.

    Raises
    ------
    IndexError
        If a journey has no row with choice == 1.
    """
    hits = []
    for oid, group in df.groupby('Obs_ID'):
        chosen_labels = group.index[group['choice'] == 1]
        if len(chosen_labels) == 0:
            # Same exception type as plain positional indexing would raise,
            # but naming the journey so the bad data can be located.
            raise IndexError(
                f"Obs_ID {oid} has no alternative with choice == 1"
            )
        hits.append(bool(group.loc[chosen_labels[0], pred_col]))

    if not hits:
        # No journeys at all: the accuracy is undefined rather than an error.
        return float('nan')
    return sum(hits) / len(hits)


In [None]:
# Seed for reproducible random tie-breaks: the baseline cells use it to seed
# the sampling of one row among alternatives that tie on the heuristic.
RANDOM_STATE = 42


In [None]:
# One generator shared by every journey in this cell. Handing the bare integer
# seed to `sample` would build a fresh, identically seeded generator per call,
# so every tie of a given size would resolve to the same row position rather
# than to a random one. Re-running the cell re-seeds it, so results stay
# reproducible.
_fastest_tie_rng = np.random.RandomState(RANDOM_STATE)


def pick_fastest(group):
    """
    Return the index label of one row of `group` with the minimum
    'TT_total', drawn at random among the rows tied for that minimum.
    """
    fastest = group[group['TT_total'] == group['TT_total'].min()]
    return fastest.sample(n=1, random_state=_fastest_tie_rng).index[0]

# Apply per journey: one selected row label per Obs_ID
best_fastest_idx = df.groupby('Obs_ID').apply(pick_fastest)
df['pred_fastest'] = 0
df.loc[best_fastest_idx, 'pred_fastest'] = 1

# Evaluate
acc_fastest = choice_set_accuracy(df, 'pred_fastest')
print(f"Baseline A (Fastest‐only): {acc_fastest:.3%}")


In [None]:
# One generator shared by every journey in this cell. Handing the bare integer
# seed to `sample` would build a fresh, identically seeded generator per call,
# so every tie of a given size would resolve to the same row position rather
# than to a random one. Re-running the cell re-seeds it, so results stay
# reproducible.
_fastest_trans_tie_rng = np.random.RandomState(RANDOM_STATE)


def pick_fastest_then_fewest(group):
    """
    Return the index label of one row of `group`: keep the rows with the
    minimum 'TT_total', among those keep the rows with the minimum
    'transfers', then draw one at random from whatever is still tied.
    """
    fastest = group[group['TT_total'] == group['TT_total'].min()]
    fewest = fastest[fastest['transfers'] == fastest['transfers'].min()]
    return fewest.sample(n=1, random_state=_fastest_trans_tie_rng).index[0]

# Apply per journey: one selected row label per Obs_ID
best_fastest_trans_idx = df.groupby('Obs_ID').apply(pick_fastest_then_fewest)
df['pred_fastest_trans'] = 0
df.loc[best_fastest_trans_idx, 'pred_fastest_trans'] = 1

# Evaluate
acc_fastest_trans = choice_set_accuracy(df, 'pred_fastest_trans')
print(f"Baseline B (Fastest → Fewest transfers): {acc_fastest_trans:.3%}")


In [None]:
# One generator shared by every journey in this cell. Handing the bare integer
# seed to `sample` would build a fresh, identically seeded generator per call,
# so every tie of a given size would resolve to the same row position rather
# than to a random one. Re-running the cell re-seeds it, so results stay
# reproducible.
_fewest_trans_tie_rng = np.random.RandomState(RANDOM_STATE)


def pick_fewest_transfers(group):
    """
    Return the index label of one row of `group` with the minimum
    'transfers', drawn at random among the rows tied for that minimum.
    """
    fewest = group[group['transfers'] == group['transfers'].min()]
    return fewest.sample(n=1, random_state=_fewest_trans_tie_rng).index[0]

# Apply per journey: one selected row label per Obs_ID
best_fewest_idx = df.groupby('Obs_ID').apply(pick_fewest_transfers)
df['pred_fewest_trans'] = 0
df.loc[best_fewest_idx, 'pred_fewest_trans'] = 1

# Evaluate
acc_fewest_trans = choice_set_accuracy(df, 'pred_fewest_trans')
print(f"Baseline C (Fewest transfers only): {acc_fewest_trans:.3%}")


In [None]:
# One generator shared by every journey in this cell. Handing the bare integer
# seed to `sample` would build a fresh, identically seeded generator per call,
# so every tie of a given size would resolve to the same row position rather
# than to a random one. Re-running the cell re-seeds it, so results stay
# reproducible.
_fewest_then_fastest_tie_rng = np.random.RandomState(RANDOM_STATE)


def pick_fewest_then_fastest(group):
    """
    Return the index label of one row of `group`: keep the rows with the
    minimum 'transfers', among those keep the rows with the minimum
    'TT_total', then draw one at random from whatever is still tied.
    """
    fewest = group[group['transfers'] == group['transfers'].min()]
    fastest = fewest[fewest['TT_total'] == fewest['TT_total'].min()]
    return fastest.sample(
        n=1, random_state=_fewest_then_fastest_tie_rng
    ).index[0]

# Apply per journey: one selected row label per Obs_ID
best_fewest_then_fastest_idx = df.groupby('Obs_ID').apply(pick_fewest_then_fastest)
df['pred_fewest_then_fastest'] = 0
df.loc[best_fewest_then_fastest_idx, 'pred_fewest_then_fastest'] = 1

# Evaluate
acc_fewest_then_fastest = choice_set_accuracy(df, 'pred_fewest_then_fastest')
print(f"Baseline D (Fewest transfers → Shortest time): {acc_fewest_then_fastest:.3%}")


In [None]:
# Summary of Baseline Performance
# Emit the header and the four accuracy lines in a single print call; the
# text written to stdout is the same as printing each line separately.
print(
    "Baseline Choice‐Set Accuracies\n"
    f"A: Fastest‐only                     {acc_fastest:.2%}\n"
    f"B: Fastest → Fewest transfers      {acc_fastest_trans:.2%}\n"
    f"C: Fewest transfers only           {acc_fewest_trans:.2%}\n"
    f"D: Fewest transfers → Shortest TT  {acc_fewest_then_fastest:.2%}"
)
