# Imports & Load Data



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

# Read the final feature table from the parquet file in the working directory.
df = pd.read_parquet('features_final.parquet')

# Report the table's (rows, columns) and preview its first two rows as the
# cell output.
print("Loaded features:", df.shape)
df.head(n=2)


# Define Features, Labels & Groups



In [None]:
# Model inputs: every column other than the target ('choice') and the
# 'OD' / 'Obs_ID' columns, which are only used for subsetting and grouping.
feature_cols = [
    col for col in df.columns
    if col not in {'choice', 'OD', 'Obs_ID'}
]

# Full-network design matrix, target vector, and per-row group identifier.
X_full = df.loc[:, feature_cols]
y_full = df.loc[:, 'choice']
groups_full = df.loc[:, 'Obs_ID']

print(f"Using {len(feature_cols)} features:", feature_cols)


# Section A: Pilot‐OD Random Forest

First, we subset the data to our 12 “pilot” ODs and train and evaluate a Random Forest on that subset, to see how it performs on our chosen case study.


In [None]:
# The 12 pilot ODs that make up the case-study subset.
pilot_ods = [
    '8603305-8603307',
    '8603313-8603308',
    '8600741-8600642',
    '8600636-8603308',
    '8600681-8603308',
    '8600677-8603308',
    '8600657-8603308',
    '8600695-8603308',
    '8600678-8603308',
    '8603317-8603339',
    '8603334-8603331',
    '8603336-8603331',
]

# Keep only rows whose OD is one of the pilot ODs; copy so that later column
# assignments on the subset never touch `df`.
pilot_df = df.loc[df['OD'].isin(pilot_ods)].copy()

# Pilot design matrix, target vector, and per-row group identifier.
X_pilot = pilot_df.loc[:, feature_cols]
y_pilot = pilot_df.loc[:, 'choice']
groups_pilot = pilot_df.loc[:, 'Obs_ID']

print("Pilot set shape:", pilot_df.shape)


In [None]:
# Train/test split on Obs_ID for pilot set.
# The unique Obs_IDs are split (not the rows), so every row sharing an Obs_ID
# lands on the same side of the split.
pilot_ids = groups_pilot.unique()
train_ids, test_ids = train_test_split(pilot_ids, test_size=0.2, random_state=42)

train_mask = pilot_df['Obs_ID'].isin(train_ids)
test_mask  = ~train_mask

X_train_p = X_pilot[train_mask]
X_test_p  = X_pilot[test_mask]
y_train_p = y_pilot[train_mask]
y_test_p  = y_pilot[test_mask]

# Fit Random Forest on pilot
rf_p = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)
rf_p.fit(X_train_p, y_train_p)

# Persist the fitted model before evaluating it.
joblib.dump(rf_p, 'rf_pilot.joblib')
print("Saved pilot RF to rf_pilot.joblib")

# Evaluate pilot performance
y_pred_p  = rf_p.predict(X_test_p)
# Column 1 of predict_proba corresponds to the second entry of rf_p.classes_.
# NOTE(review): assumes 'choice' is encoded 0/1 so this is P(choice == 1) — confirm.
y_proba_p = rf_p.predict_proba(X_test_p)[:,1]

acc_p = accuracy_score(y_test_p, y_pred_p)
ll_p  = log_loss(y_test_p, y_proba_p)

# Choice-set accuracy: did the top-scoring alt match the actual choice?
test_p = pilot_df[test_mask].copy()
test_p['score'] = y_proba_p

# Vectorised lookup instead of groupby.apply with a per-group lambda: one
# idxmax per Obs_ID finds the highest-scoring row (ties resolve to the first
# row of the group). The lookup runs on a positionally re-indexed copy so that
# each idxmax label identifies exactly one row even if the original index has
# repeated labels.
ranked_p = test_p.reset_index(drop=True)
best_rows_p = ranked_p.groupby('Obs_ID')['score'].idxmax()
cs_p = ranked_p.loc[best_rows_p, 'choice'].astype(int).mean()

print(f"Pilot RF — Accuracy: {acc_p:.3f}, Log-loss: {ll_p:.3f}, Choice-set acc: {cs_p:.3f}")


In [None]:
# Pilot: Top 10 Feature Importances
# (numpy is already imported in the notebook's first cell, so it is not
# re-imported here.)
imp_p = rf_p.feature_importances_
# Indices of the 10 largest importances, most important first.
idx_p = np.argsort(imp_p)[::-1][:10]

# Reverse the order for plotting: barh draws the first entry at the bottom,
# so the most important feature ends up at the top of the chart.
fig, ax = plt.subplots(figsize=(6,4))
ax.barh(np.array(feature_cols)[idx_p][::-1], imp_p[idx_p][::-1])
ax.set_title("Pilot: Top 10 Feature Importances")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()


# Section B: Full‐Network Random Forest

Now we repeat the same process on the entire dataset to see overall performance.


In [None]:
# Train/test split on Obs_ID for full network.
# The unique Obs_IDs are split (not the rows), so every row sharing an Obs_ID
# lands on the same side of the split.
full_ids = groups_full.unique()
train_ids, test_ids = train_test_split(full_ids, test_size=0.2, random_state=42)

train_mask = df['Obs_ID'].isin(train_ids)
test_mask  = ~train_mask

X_train_f = X_full[train_mask]
X_test_f  = X_full[test_mask]
y_train_f = y_full[train_mask]
y_test_f  = y_full[test_mask]

# Fit Random Forest on full network
rf_f = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)
rf_f.fit(X_train_f, y_train_f)

# Persist the fitted model before evaluating it.
joblib.dump(rf_f, 'rf_full.joblib')
print("Saved full RF to rf_full.joblib")

# Evaluate full-network performance
y_pred_f  = rf_f.predict(X_test_f)
# Column 1 of predict_proba corresponds to the second entry of rf_f.classes_.
# NOTE(review): assumes 'choice' is encoded 0/1 so this is P(choice == 1) — confirm.
y_proba_f = rf_f.predict_proba(X_test_f)[:,1]

acc_f = accuracy_score(y_test_f, y_pred_f)
ll_f  = log_loss(y_test_f, y_proba_f)

# Choice-set accuracy: did the top-scoring alt match the actual choice?
test_f = df[test_mask].copy()
test_f['score'] = y_proba_f

# Vectorised lookup instead of groupby.apply with a per-group lambda: one
# idxmax per Obs_ID finds the highest-scoring row (ties resolve to the first
# row of the group). The lookup runs on a positionally re-indexed copy so that
# each idxmax label identifies exactly one row even if the original index has
# repeated labels.
ranked_f = test_f.reset_index(drop=True)
best_rows_f = ranked_f.groupby('Obs_ID')['score'].idxmax()
cs_f = ranked_f.loc[best_rows_f, 'choice'].astype(int).mean()

print(f"Full RF — Accuracy: {acc_f:.3f}, Log-loss: {ll_f:.3f}, Choice-set acc: {cs_f:.3f}")


In [None]:
# Full: Top 10 Feature Importances
imp_f = rf_f.feature_importances_
# Indices of the 10 largest importances, most important first.
idx_f = np.argsort(imp_f)[::-1][:10]

# Plot in reversed order: barh draws the first entry at the bottom, so the
# most important feature ends up at the top of the chart.
fig, ax = plt.subplots(figsize=(6,4))
ax.barh(
    [feature_cols[i] for i in idx_f[::-1]],
    imp_f[idx_f[::-1]],
)
ax.set_title("Full: Top 10 Feature Importances")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()


# Pilot vs Full Comparison



In [None]:
# Side-by-side summary of the pilot and full-network metrics, one row per
# setting, printed as plain text without the index column.
results = pd.DataFrame(
    [
        ('Pilot', acc_p, ll_p, cs_p),
        ('Full network', acc_f, ll_f, cs_f),
    ],
    columns=['Setting', 'Accuracy', 'Log-loss', 'Choice-set accuracy'],
)
print(results.to_string(index=False))

