In [None]:
# Imports & Paths
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Input files. Each path can be overridden with an environment variable
# (M3_CHOICE_SET_PATH / M3_STOPS_PATH) so the notebook can run on another machine
# without editing this cell. When a variable is not set, the original local path is used.
CHOICE_SET_PATH = os.environ.get(
    'M3_CHOICE_SET_PATH',
    r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\Final\choice_set_Final_1_15_Sep23.csv',
)
STOPS_PATH = os.environ.get(
    'M3_STOPS_PATH',
    r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\GTFS_20230925\stops.txt',
)


In [None]:
# Load the choice set & mark M3 availability
trips = pd.read_csv(CHOICE_SET_PATH)

# Flag every row whose route sequence contains an M3 leg.
# The \b word boundaries keep longer identifiers such as "M30" from matching.
# na=False makes rows with a missing RuteIdSeq count as "no M3"; without it has_M3
# would contain NaN and the boolean mask below would raise
# "Cannot mask with non-boolean array containing NA / NaN values".
trips['has_M3'] = trips['RuteIdSeq'].str.contains(r'\bM3\b', regex=True, na=False)

# ODs for which at least one alternative (row) uses M3
ods_with_m3 = trips.loc[trips['has_M3'], 'OD'].unique()


In [None]:
# Compute the M3 share per OD among the chosen alternatives
MIN_TRIPS = 80  # an OD is kept only if it has at least this many chosen trips

# Rows flagged as the chosen alternative
chosen = trips[trips['choice'] == 1]

# Per OD: number of chosen rows and how many of them use M3
agg = (
    chosen
    .groupby('OD')['has_M3']
    .agg(total_trips='size', m3_trips='sum')
)
agg['share_m3'] = agg['m3_trips'] / agg['total_trips']

# Keep only ODs that have an M3 option and at least MIN_TRIPS chosen trips.
# NOTE(review): the previous comment said "more than 80 trips", but the comparison
# is inclusive (>=), so ODs with exactly 80 trips are kept — confirm the intended cutoff.
keep = agg.index.isin(ods_with_m3) & (agg['total_trips'] >= MIN_TRIPS)
agg = agg.loc[keep]


In [None]:
# Assign every OD to an M3-share band and report the number of ODs per band
# (this cell only prints counts; nothing is plotted here).
# pd.cut builds right-closed intervals by default, so a share of exactly 0.10 falls
# into the first band. The outer edges -0.01 and 1.01 ensure shares of 0 and 1 are binned.
labels = ['<10%', '10–50%', '50–90%', '>90%']
bins   = [-0.01, 0.10, 0.50, 0.90, 1.01]
share_band = pd.cut(agg['share_m3'], bins=bins, labels=labels)
agg['band'] = share_band

# Number of ODs in each band, listed in band order
counts = agg['band'].value_counts().sort_index()
print(f"ODs per M3-share band:\n {counts}")



In [None]:
# Stratified sampling: up to N_PER_BAND ODs from every M3-share band
N_PER_BAND = 3   # ODs drawn per band; smaller bands are kept whole
SEED = 42

# DataFrame.sample() without random_state draws from NumPy's global state,
# so seeding it here makes the draw repeatable.
np.random.seed(SEED)

# The bands are iterated explicitly instead of using groupby(...).apply(...):
# apply() operating on the grouping column is deprecated since pandas 2.2, and
# NOTE(review): newer versions are expected to drop 'band' from the frames handed
# to the function, which would lose the column the final table prints — confirm
# against the pandas version in use.
# observed=True skips bands that contain no ODs.
parts = [
    grp.sample(n=N_PER_BAND) if len(grp) >= N_PER_BAND else grp
    for _, grp in agg.groupby('band', observed=True)
]

# pd.concat([]) raises, so fall back to an empty frame with the same columns.
sampled = (pd.concat(parts) if parts else agg.iloc[0:0]).reset_index()


In [None]:
# Resolve stop ids to station names and print the sampled ODs

# Lookup table stop_id -> stop_name. Ids are read as text and left-padded with
# zeros to 12 characters so they line up with the padded OD codes below.
stops_raw = pd.read_csv(STOPS_PATH, dtype={'stop_id': str})
stops_raw = stops_raw.assign(stop_id=stops_raw['stop_id'].str.zfill(12))
stops = stops_raw.set_index('stop_id')['stop_name'].to_dict()

# An OD key is "<origin>-<destination>": split it and pad both codes the same way
id_cols = ['orig_id', 'dest_id']
sampled[id_cols] = sampled['OD'].str.split('-', expand=True)
sampled[id_cols] = sampled[id_cols].apply(lambda col: col.str.zfill(12))

# Translate codes to names; a code without a match in the lookup is shown as-is
for side in ('orig', 'dest'):
    codes = sampled[f'{side}_id']
    sampled[f'{side}_name'] = codes.map(stops).fillna(codes)

final_cols = ['OD', 'orig_name', 'dest_name', 'total_trips', 'm3_trips', 'share_m3', 'band']
print("Stratified sample of viable ODs:\n")
print(sampled[final_cols].to_string(index=False))
