In [None]:
import pandas as pd
import numpy as np

# Input locations: the choice-set file and the chosen-trips file.
# NOTE(review): these are absolute, machine-specific paths — the notebook only
# runs on a machine where exactly these files exist.
CHOICE_SET_CSV = r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\Final\choice_set_Final_1_15_Sep23.csv'
CHOSEN_CSV     = r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\Final\df_Sep1_15_SEP23_MORNING.csv'

# Read both tables; 'Start_Time' is parsed to datetime while loading.
df_choices = pd.read_csv(CHOICE_SET_CSV)
df_chosen  = pd.read_csv(CHOSEN_CSV, parse_dates=['Start_Time'])

# Attach the start timestamp to every choice-set row through the shared
# 'turngl' key. The left join keeps all choice-set rows; rows without a
# matching 'turngl' end up with a missing Start_Time.
start_times = df_chosen[['turngl', 'Start_Time']]
df = pd.merge(df_choices, start_times, on='turngl', how='left')

df.head(3)


In [None]:
# In-vehicle time: row-wise total of the four per-mode travel-time columns
tt_cols = ['sum_TT_Bus','sum_TT_Metro','sum_TT_Tog','sum_TT_Stog']
df['in_vehicle_time'] = df.loc[:, tt_cols].sum(axis=1)

# Door-to-door time = in-vehicle time plus walking time
df['TT_total'] = df['in_vehicle_time'].add(df['WalkingTime'])

# Raw transfer count, exposed under a shorter column name
df['transfers'] = df['transfers_upd']

# Group the alternatives by observation; `g` is reused by later cells.
g = df.groupby('Obs_ID')

# Fastest alternative within each Obs_ID, and each row's gap to it
df['best_TT']   = g['TT_total'].transform('min')
df['excess_TT'] = df['TT_total'].sub(df['best_TT'])


In [None]:
def _rel_within_obs(col):
    """Min-max scale df[col] inside each Obs_ID group (uses the global `g`).

    Returns (value - group_min) / (group_max - group_min). When every
    alternative of an observation has the same value (for example a choice
    set with one alternative) the range is zero and plain division gives
    0/0 = NaN; such rows are set to 0.0 instead, i.e. "no excess relative to
    the best alternative". Rows whose own value is missing stay NaN.
    """
    lo = g[col].transform('min')
    span = g[col].transform('max') - lo
    rel = (df[col] - lo) / span
    return rel.mask((span == 0) & df[col].notna(), 0.0)


# Relative excess travel time.
# excess_TT is TT_total minus the group minimum, so this is the same quantity
# as excess_TT / (max - best_TT), with the zero-range case handled.
df['rel_TT']  = _rel_within_obs('TT_total')
df['rank_TT'] = g['TT_total'].rank(method='min', ascending=True)

# Relative transfers
df['rel_transfers'] = _rel_within_obs('transfers')
df['rank_trans']    = g['transfers'].rank(method='min', ascending=True)

# Relative walking time
df['walking_time'] = df['WalkingTime']
df['rel_walk']     = _rel_within_obs('WalkingTime')
df['rank_walk']    = g['WalkingTime'].rank(method='min', ascending=True)


In [None]:
# Shortest and longest alternative (by total distance) within each Obs_ID
df['min_dist'] = g['total_distance'].transform('min')
df['max_dist'] = g['total_distance'].transform('max')

# Min-max scaled distance within the observation. If all alternatives have
# the same distance the range is zero and the division would give 0/0 = NaN;
# those rows get 0.0 instead. Rows with a missing distance stay NaN.
dist_span = df['max_dist'] - df['min_dist']
rel_dist  = (df['total_distance'] - df['min_dist']) / dist_span
df['rel_dist'] = rel_dist.mask((dist_span == 0) & df['total_distance'].notna(), 0.0)

# Flag alternatives longer than the median distance of all rows sharing the same OD
od_med = df.groupby('OD')['total_distance'].transform('median')
df['long_journey'] = (df['total_distance'] > od_med).astype(int)


In [None]:
# OD_demand = number of chosen alternatives per OD pair.
# The sum has to run over all rows that share an 'OD' value. Summing over the
# per-observation grouping `g` (Obs_ID) would only add up the 'choice' flags
# inside one observation and never aggregate across observations.
df['OD_demand'] = df.groupby('OD')['choice'].transform('sum')


In [None]:
# Station ids per metro line. m1, m2 and m4 are written as zero-padded
# 12-character strings while m3 uses the un-padded form; both spellings are
# reconciled below by comparing numerically.
m1 = {
    '000008603301', '000008603302', '000008603303', '000008603304',
    '000008603305', '000008603306', '000008603307', '000008603308',
    '000008603309', '000008603310', '000008603311', '000008603312',
    '000008603313', '000008603315', '000008603317',
}

m2 = {
    '000008603301', '000008603302', '000008603303', '000008603304',
    '000008603305', '000008603306', '000008603307', '000008603308',
    '000008603309', '000008603321', '000008603322', '000008603323',
    '000008603324', '000008603326', '000008603327', '000008603328',
}

m3 = {'8603330','8603331','8603332','8603308','8603333','8603334',
      '8603335','8603336','8603337','8603338','8603339','8603340',
      '8603341','8603342','8603305','8603343','8603344'}

# The rest of M4 had not opened yet in September 2023
m4 = {'000008603345', '000008603346', '000008603334', '000008603333',
      '000008603308', '000008603332', '000008603331', '000008603330'}

# Compare station ids as numbers. A plain string comparison can match only
# one of the two spellings above ('000008603301' vs '8603301'), so depending
# on how the CSV column was parsed some lines would never be flagged.
# Values that cannot be read as a number become NaN and match no line.
metro_lines = {'M1': m1, 'M2': m2, 'M3': m3, 'M4': m4}
orig_id = pd.to_numeric(df['origin'], errors='coerce')
dest_id = pd.to_numeric(df['destination'], errors='coerce')

for line, stations in metro_lines.items():
    station_ids = {int(s) for s in stations}
    df[f'orig_on_{line}'] = orig_id.isin(station_ids).astype(int)
    df[f'dest_on_{line}'] = dest_id.isin(station_ids).astype(int)



In [None]:
# Departure time as a fractional hour of the day (seconds are ignored)
start = df['Start_Time']
df['hour'] = start.dt.hour + start.dt.minute / 60

# Rush flag: Monday-Friday (weekday 0-4) with the fractional hour in [7, 9],
# both ends included. Rows with a missing Start_Time get 0.
on_weekday   = start.dt.weekday < 5
in_peak_band = df['hour'].between(7, 9)
df['is_rush'] = (on_weekday & in_peak_band).astype(int)


In [None]:
# Departures per hour implied by the first leg's headway (60 / headway).
# NOTE(review): a headway of 0 yields inf here — confirm the raw data has none.
df['freq_per_hr'] = 60.0 / df['first_headway']

# Atomic mode flags: 1 if the mode name occurs anywhere in ModalKomb.
# The match is a case-sensitive substring test, so "Tog" does not match "Stog".
for mode in ["Bus","Metro","Tog","Stog"]:
    df[f"uses_{mode}"] = df["ModalKomb"].str.contains(mode).astype(int)

# Relative frequency within each Obs_ID (min-max scaled).
# If all alternatives of an observation share the same frequency the range is
# zero and the division would give 0/0 = NaN; those rows get 0.0 instead.
# Rows with a missing frequency stay NaN.
freq_lo   = g['freq_per_hr'].transform('min')
freq_span = g['freq_per_hr'].transform('max') - freq_lo
rel_freq  = (df['freq_per_hr'] - freq_lo) / freq_span
df['rel_freq'] = rel_freq.mask((freq_span == 0) & df['freq_per_hr'].notna(), 0.0)


In [None]:
from collections import defaultdict
import csv

# GTFS input files
TRIPS      = r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\GTFS_20230925\trips.txt'
STOP_TIMES = r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\GTFS_20230925\stop_times.txt'

# trip_id → route_id
trips = pd.read_csv(TRIPS, usecols=['route_id','trip_id'], dtype=str)
trip2route = dict(zip(trips['trip_id'], trips['route_id']))

# Stream stop_times.txt and remember, per trip, the row with the lowest
# stop_sequence. GTFS only requires stop_sequence to increase along a trip;
# it does not have to start at 0, so testing for the literal '0' can miss the
# first stop of every trip. The file is read as UTF-8 (tolerating a BOM)
# instead of the platform's locale encoding.
first_departure = {}   # trip_id → (stop_sequence, departure_time string)
with open(STOP_TIMES, newline='', encoding='utf-8-sig') as f:
    for row in csv.DictReader(f):
        trip_id = row['trip_id']
        if not trip2route.get(trip_id):
            continue                      # trip not listed in trips.txt
        seq = int(row['stop_sequence'])
        seen = first_departure.get(trip_id)
        if seen is None or seq < seen[0]:
            first_departure[trip_id] = (seq, row['departure_time'])

# route_id → first-stop departure times, in minutes after midnight
route_times = defaultdict(list)
for trip_id, (_, dep) in first_departure.items():
    h, m, s = dep.split(':')
    route_times[trip2route[trip_id]].append(int(h)*60 + int(m) + int(s)/60)

# route_id → median headway (minutes).
# np.unique sorts AND drops repeated departure times. Without the
# de-duplication, trips that leave at the same clock time produce zero gaps,
# which pull the median towards 0 and the frequency towards infinity.
# NOTE(review): departures of all trips of a route are pooled here; if the
# feed distinguishes directions or service days, the headway should probably
# be computed per group — confirm.
headway_map = {}
for rid, times in route_times.items():
    distinct = np.unique(times)
    if distinct.size > 1:
        headway_map[rid] = float(np.median(np.diff(distinct)))

# Fallback map for routes without a GTFS headway: the 'first_headway' of the
# first row whose first leg uses that route
first_leg = (
    df[['RuteIdSeq','first_headway']]
    .assign(first_route=lambda d: d.RuteIdSeq.str.split('-').str[0])
    .drop_duplicates('first_route')
)
fallback_map = dict(zip(first_leg['first_route'], first_leg['first_headway']))

# Explode alternatives → one row per leg
df['route_list'] = df['RuteIdSeq'].str.split('-')
exploded = df[['Obs_ID','Alt_ID','route_list']].explode('route_list')

# Vectorized lookup + fallback; non-positive headways are treated as unknown
# so they cannot turn into infinite or negative frequencies
leg_headway = (
    exploded['route_list'].map(headway_map)
             .fillna(exploded['route_list'].map(fallback_map))
)
exploded['headway_min'] = leg_headway.where(leg_headway > 0)

# Compute vehicles/hour
exploded['freq'] = 60.0 / exploded['headway_min']


def _harmonic_mean(freqs):
    """Harmonic mean of the known leg frequencies; NaN if none is known.

    Legs with an unknown frequency are left out of both the count and the
    sum, in line with how 'min' and 'mean' skip missing values.
    """
    known = freqs.dropna()
    if len(known) == 0:
        return np.nan
    return len(known) / (1.0 / known).sum()


# Aggregate to journey-level
agg = exploded.groupby(['Obs_ID','Alt_ID'])['freq'].agg(
    freq_min  = 'min',
    freq_avg  = 'mean',
    freq_harm = _harmonic_mean
).reset_index()

# Merge & clean up. After this, `df` is a new frame and the earlier groupby
# object `g` still refers to the previous one.
# NOTE: this cell is not re-runnable on its own, because 'first_headway' is
# dropped here and read above.
df = df.merge(agg, on=['Obs_ID','Alt_ID'], how='left')
df = df.drop(columns=['first_headway','freq_per_hr','route_list'])



In [None]:
# Legs per journey: one more than the number of '-' separators in RuteIdSeq
df['n_segments'] = df['RuteIdSeq'].str.count('-').add(1)

# Stops per journey: one more than the number of ';' separators in StopNbSequence
df['n_stops'] = df['StopNbSequence'].str.count(';').add(1)

# Total distance divided by the stop count.
# NOTE(review): this divides by the number of stops, not by the number of
# gaps between stops (n_stops - 1) — confirm which one is intended.
df['avg_stop_dist'] = df['total_distance'].div(df['n_stops'])


In [None]:
def _count_mode_changes(modes):
    """Count positions where a leg's mode differs from the previous leg's mode."""
    return sum(1 for prev, cur in zip(modes, modes[1:]) if cur != prev)


# ModalKomb lists the modes of a journey separated by '-'
modes_list = df['ModalKomb'].str.split('-')
df['mode_switches'] = modes_list.apply(_count_mode_changes)


In [None]:
# 'sum_transfer_dist' is used as it comes from the raw data, so no assignment is needed.
# Average distance per transfer; journeys with 0 transfers divide by 1 instead of 0.
df['avg_transfer_dist'] = df['sum_transfer_dist'] / df['transfers'].replace(0, 1)


In [None]:
# log(1 + demand)
df['log_OD_demand'] = np.log1p(df['OD_demand'])

# Pairwise interaction terms: new column → (left factor, right factor).
# The dict order fixes the order in which the columns are appended to df.
interactions = {
    'rel_TT_x_uses_Bus':       ('rel_TT', 'uses_Bus'),
    'rel_TT_x_uses_Metro':     ('rel_TT', 'uses_Metro'),
    'rel_trans_x_switches':    ('rel_transfers', 'mode_switches'),
    'rel_walk_x_long_journey': ('rel_walk', 'long_journey'),
}
for new_col, (left, right) in interactions.items():
    df[new_col] = df[left] * df[right]


In [None]:
# Columns treated as raw input; they are left out of the exported file.
raw_cols = [
    'turngl','origin','destination','TT_choice(min)','RuteIdSeq',
    'StopNbSequence','total_distance','transfers_upd','ModalKomb',
    'sum_transfer_dist','Alt_ID','Planned_TT','first_headway',
    'sum_TT_Bus','sum_TT_Tog','sum_TT_Stog','sum_TT_Metro',
    'WalkingTime','Start_Time'
]

# Metadata columns that are exported next to the features
keep_cols = ['Obs_ID','OD','choice']

# Every remaining column of df counts as an engineered feature. This includes
# helper columns created along the way and any input column not listed in
# raw_cols.
excluded = set(raw_cols) | set(keep_cols)
engineered_cols = [col for col in df.columns if col not in excluded]

# Sanity check: the feature list shares no name with the raw/metadata lists
assert excluded.isdisjoint(engineered_cols)

print(f"Saving {len(keep_cols)} metadata + {len(engineered_cols)} engineered features:")
print("  metadata:", keep_cols)
print("  features:", engineered_cols[:5], "…", engineered_cols[-5:])

# Save to parquet: metadata first, then the features in df's column order
out_cols = keep_cols + engineered_cols
out_df = df[out_cols]
out_df.to_parquet('features_full.parquet', index=False)
print("Saved 'features_full.parquet' with columns:")
print(out_df.columns.tolist())
