# Data Load


In [None]:
import pandas as pd

# Input files: the generated choice sets and the observed (chosen) trips.
CHOICE_SET_CSV = r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\Final\choice_set_Final_1_15_Sep23.csv'
CHOSEN_CSV     = r'C:\Users\Chris\Desktop\DTU\8. Semester\Bachelorprojekt-1\Data\Final\df_Sep1_15_SEP23_MORNING.csv'

# Load both tables; only the chosen-trips file has its Start_Time column
# parsed as datetimes.
df_choices = pd.read_csv(CHOICE_SET_CSV)
df_chosen  = pd.read_csv(CHOSEN_CSV, parse_dates=['Start_Time'])

# Attach Start_Time to every choice-set row via the trip key "turngl".
# A left join keeps all choice-set rows, including those without a match.
df = pd.merge(
    df_choices,
    df_chosen[['turngl', 'Start_Time']],
    on='turngl',
    how='left',
)


# Absolute Travel‐Time & Transfers

Total in-vehicle + walking time, plus raw transfers.
These are the “cost” measures.


In [None]:
# Per-mode in-vehicle time columns that are added up row by row.
tt_cols = ['sum_TT_Bus', 'sum_TT_Metro', 'sum_TT_Tog', 'sum_TT_Stog']

# Total in-vehicle time across the listed mode columns.
df['in_vehicle_time'] = df[tt_cols].sum(axis='columns')

# Walking time under a snake_case name.
df['walking_time'] = df['WalkingTime']

# Total travel time = in-vehicle + walking.
df['TT_total'] = df['in_vehicle_time'].add(df['walking_time'])

# Transfer count, copied from the 'transfers_upd' column.
df['transfers'] = df['transfers_upd']


# Relative Travel‐Time, Transfers & Walking

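Normalize each cost within the same journey (`Obs_ID`) to the [0, 1] range using min–max scaling, so that 0 marks the lowest-cost alternative in the choice set and 1 the highest-cost one.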


In [None]:
# Group by journey: every statistic below is computed within one Obs_ID.
g = df.groupby('Obs_ID')


def _rel_within_journey(col):
    """Min-max scale ``df[col]`` within each ``Obs_ID`` group.

    Returns ``(x - group_min) / (group_max - group_min)`` as a Series aligned
    with ``df``.

    When every alternative of a journey has the same value (this includes
    journeys with a single alternative) the range is zero and the plain ratio
    is 0/0 = NaN. Those rows get 0.0 instead, meaning "no worse than the best
    alternative". Rows whose own value is missing stay NaN so missing data is
    not silently masked.
    """
    lowest = g[col].transform('min')
    span = g[col].transform('max') - lowest
    offset = df[col] - lowest
    return (offset / span).mask(span.eq(0) & offset.notna(), 0.0)


# Excess TT relative to the fastest alternative of the same journey.
df['excess_TT'] = df['TT_total'] - g['TT_total'].transform('min')

# excess_TT scaled by the journey's TT range is the min-max scaling of TT_total.
df['rel_TT'] = _rel_within_journey('TT_total')

# Normalized transfers
df['rel_transfers'] = _rel_within_journey('transfers')

# Normalized walking
df['rel_walk'] = _rel_within_journey('walking_time')


# Service Frequency & Relative Frequency

Routes with more frequent service are more attractive.
We compute a headway-based frequency (trips per hour) and its min–max-normalized position within each journey.


In [None]:
# Absolute frequency (trips per hour).
# NOTE(review): presumably first_headway is in minutes (60 / headway);
# a headway of 0 would produce inf here. Confirm against the upstream data.
df['freq_per_hr'] = 60.0 / df['first_headway']

# Normalize within journey. The grouping is built here, after freq_per_hr
# exists, instead of relying on a groupby object created in an earlier cell.
freq_by_journey = df.groupby('Obs_ID')['freq_per_hr']
freq_min = freq_by_journey.transform('min')
freq_span = freq_by_journey.transform('max') - freq_min
freq_offset = df['freq_per_hr'] - freq_min

# If all alternatives of a journey share one frequency the range is zero and
# the ratio is 0/0 = NaN. Those rows get 0.0; rows with a missing frequency
# stay NaN.
df['rel_freq'] = (freq_offset / freq_span).mask(
    freq_span.eq(0) & freq_offset.notna(), 0.0
)


# Cityringen Usage Indicator

Instead of origin‐on‐M3 (constant), we flag *each* alternative by whether it *uses* the Cityringen.


In [None]:
# 1 if the route sequence contains "M3" as a whole word, else 0.
# na=False makes rows with a missing RuteIdSeq count as "does not use M3";
# without it str.contains yields NaN for them and astype(int) raises.
df['uses_M3'] = (
    df['RuteIdSeq']
    .str.contains(r'\bM3\b', regex=True, na=False)
    .astype(int)
)


# Modal One-Hots

Capture the most common mode‐combinations explicitly.


In [None]:
# The five most frequent mode combinations in the data.
top5 = df['ModalKomb'].value_counts().nlargest(5).index

# One 0/1 indicator column per combination. '+' and '-' in the combination
# name are turned into '_' to build the column name.
_to_underscore = str.maketrans('+-', '__')
for combo in top5:
    column = 'mode_' + combo.translate(_to_underscore)
    df[column] = df['ModalKomb'].eq(combo).astype(int)


# Interaction Features

Build a small set of within-set interactions that reflect conditional trade-offs:
- **rel_TT × rel_transfers**  
- **rel_TT × rel_freq**  
- **uses_M3 × rel_TT**  
- **rel_transfers × rel_walk**


In [None]:
# Pairwise products of already-computed features (interaction terms).
df['rel_TT_x_rel_transfers']   = df['rel_TT'] * df['rel_transfers']
# This column is selected in the final feature list further down; it was
# missing here, so that selection raised a KeyError.
df['rel_TT_x_rel_freq']        = df['rel_TT'] * df['rel_freq']
df['uses_M3_x_rel_TT']         = df['uses_M3'] * df['rel_TT']
df['rel_transfers_x_rel_walk'] = df['rel_transfers'] * df['rel_walk']



# Preview Selected Features

Show summary stats for our final feature set before exporting.


In [None]:
# Indicator column names for the top mode combinations, built with the same
# '+'/'-' -> '_' substitution used when the columns were created.
mode_cols = [f"mode_{c.replace('+','_').replace('-','_')}" for c in top5]

# Final feature list. Every name must already be a column of df, otherwise
# the selection below raises a KeyError.
feature_cols = [
    # absolute & relative travel time
    'TT_total',
    'excess_TT',
    'rel_TT',
    # transfers
    'transfers',
    'rel_transfers',
    # walking
    'walking_time',
    'rel_walk',
    # frequency
    'freq_per_hr',
    'rel_freq',
    # Cityringen usage
    'uses_M3',
    'uses_M3_x_rel_TT',
    # mode identity
    *mode_cols,
    # interactions
    'rel_TT_x_rel_transfers',
    'rel_TT_x_rel_freq',
    'rel_transfers_x_rel_walk',
]

# One row of summary statistics per feature.
print("Final feature summary:\n", df[feature_cols].describe().transpose())


# Save Features for Modeling

Export as Parquet so the Random Forest / Neural Net notebook can load it directly.


In [None]:
# Columns kept next to the features: 'choice', 'OD' and the journey id.
meta_cols = ['choice', 'OD', 'Obs_ID']

# Copy the selected columns into an independent frame and write it to Parquet
# without the index.
df_model = df.loc[:, feature_cols + meta_cols].copy()
df_model.to_parquet('features_final.parquet', index=False)
print(f"Saved {len(feature_cols)} features + metadata to 'features_final.parquet'")


# Short discussion on interaction features:
## rel_TT_x_rel_transfers
This captures how travel time and number of transfers aren't totally separate in a rider’s experience. A route that’s both slower and requires more transfers feels extra inconvenient. But sometimes, riders are willing to make an extra transfer if it saves time—this interaction helps the model pick up on those kinds of trade-offs.

## rel_TT_x_rel_freq
This tests if travel time penalties matter less when the service is very frequent. For instance, adding 2 minutes to a train that comes every 2 minutes might not bother most people. But on a bus that only shows up every 20 minutes, that same delay can seriously hurt its appeal.

## rel_transfers_x_rel_walk
Here we’re looking at the combo of transferring and walking. One transfer with a long walk might be more annoying than two transfers with short walks. This feature helps the model pick up on those subtle differences and better reflect how people actually experience these trips.