In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
def get_modern_main_points(pos):
    """Return the main-race points awarded for finishing position ``pos``.

    Positions 1 through 10 score 25, 18, 15, 12, 10, 8, 6, 4, 2 and 1 points
    respectively; every other value scores 0.
    """
    # Pair each scoring position (1..10) with its award.
    awards = (25, 18, 15, 12, 10, 8, 6, 4, 2, 1)
    scoring_table = dict(zip(range(1, len(awards) + 1), awards))
    return scoring_table.get(pos, 0)

In [3]:
def get_modern_sprint_points(pos):
    """Return the sprint-race points awarded for finishing position ``pos``.

    Positions 1 through 8 score 8 down to 1 point (``9 - pos``). Any other
    value -- 9th or lower, zero, negative, or NaN -- scores 0.
    """
    # The lower bound matters: without it a position of 0 (or a negative
    # placeholder) would be awarded more points than the winner.
    if 1 <= pos <= 8:
        return 9 - pos
    return 0

In [4]:
def time_to_ms(time_str):
    """Convert a time value to milliseconds.

    Accepted textual formats:
      * ``'m:ss.fff'``     (e.g. ``'1:23.456'``)
      * ``'h:mm:ss.fff'``  (e.g. ``'1:34:50.616'``)
      * plain seconds      (e.g. ``'83.456'``)

    Returns ``np.nan`` for missing values (anything ``pd.isna`` reports as
    missing), for the string sentinels ``''``, ``'\\N'`` and ``'0'`` (with or
    without surrounding whitespace), and for anything that cannot be parsed,
    including strings with more than three ``:``-separated fields.
    """
    if pd.isna(time_str):
        return np.nan

    if isinstance(time_str, str):
        # Strip before comparing so padded sentinels such as ' 0 ' are treated
        # the same as the unpadded ones.
        text = time_str.strip()
        if text in ('', r'\N', '0'):
            return np.nan
    else:
        text = str(time_str).strip()

    try:
        parts = text.split(':')
        if len(parts) == 3:  # h:m:s.ms
            return int(parts[0]) * 3600000 + int(parts[1]) * 60000 + float(parts[2]) * 1000
        if len(parts) == 2:  # m:s.ms
            return int(parts[0]) * 60000 + float(parts[1]) * 1000
        if len(parts) == 1:  # plain seconds
            return float(text) * 1000
        # More than three fields is not a known format; do not guess.
        return np.nan
    except (ValueError, TypeError):
        # Only parsing failures are turned into NaN; other exceptions
        # (e.g. KeyboardInterrupt) propagate.
        return np.nan

In [5]:
def preprocess_for_sprint_simulation(data_path='../data', output_dir='../data_engineered'):
    """Build the baseline training / simulation tables for the sprint model.

    Reads ``results.csv``, ``sprint_results.csv``, ``races.csv``,
    ``drivers.csv``, ``qualifying.csv`` and ``pit_stops.csv`` from
    ``data_path``, merges them into one row per result, coerces every column
    to numeric, fills missing values, and splits the result into:

    * ``modern_train``   -- rows with ``year >= 2021`` and ``sprint_target > 0``
    * ``historical_sim`` -- rows with ``year < 2021``

    Rows from 2021 onward that have no sprint result end up in neither frame.

    The feature columns are min-max scaled with a scaler fitted on
    ``modern_train`` only and then applied to ``historical_sim``.

    Parameters
    ----------
    data_path : str
        Directory containing the input CSV files.
    output_dir : str
        Directory the two output CSV files are written to. It is created if
        it does not exist.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(modern_train, historical_sim)``.

    Side effects
    ------------
    Writes ``modern_train_ready.csv`` and ``historical_sim_ready.csv`` into
    ``output_dir`` and prints progress messages.
    """
    print("1. Data loading and initialization...")
    results = pd.read_csv(os.path.join(data_path, 'results.csv'))
    sprint = pd.read_csv(os.path.join(data_path, 'sprint_results.csv'))
    races = pd.read_csv(os.path.join(data_path, 'races.csv'))
    drivers = pd.read_csv(os.path.join(data_path, 'drivers.csv'))
    qualifying = pd.read_csv(os.path.join(data_path, 'qualifying.csv'))
    pit_stops = pd.read_csv(os.path.join(data_path, 'pit_stops.csv'))

    # Re-score every race with one points table so results are comparable.
    results['unified_main_points'] = results['positionOrder'].apply(get_modern_main_points)

    print("2. Feature Engineering...")

    # Prefix columns whose names also exist in the sprint / pit-stop tables.
    results = results.rename(columns={
        'milliseconds': 'res_ms',
        'fastestLapTime': 'res_fLapTime',
        'rank': 'res_rank'
    })
    results['res_fLapTime_ms'] = results['res_fLapTime'].apply(time_to_ms)

    sprint = sprint.rename(columns={
        'milliseconds': 'spr_ms',
        'positionOrder': 'sprint_target',
        'fastestLapTime': 'spr_fLapTime'
    })
    sprint['spr_fLapTime_ms'] = sprint['spr_fLapTime'].apply(time_to_ms)
    sprint['spr_points_unified'] = sprint['sprint_target'].apply(get_modern_sprint_points)

    # One row per (race, driver): highest stop number and summed stop time.
    pit_stops = pit_stops.rename(columns={'milliseconds': 'pit_ms'})
    pit_summary = pit_stops.groupby(['raceId', 'driverId']).agg({
        'stop': 'max',
        'pit_ms': 'sum'
    }).reset_index().rename(columns={'stop': 'pit_count', 'pit_ms': 'pit_total_ms'})

    print("3. Era Normalization (q_rel_gap) calculation...")
    for col in ['q1', 'q2', 'q3']:
        qualifying[col] = qualifying[col].apply(time_to_ms)

    # The reference time is the fastest q1 time of the race; q_rel_gap is each
    # driver's q1 deficit to it, expressed as a fraction of that reference.
    qualifying['pole_time'] = qualifying.groupby('raceId')['q1'].transform('min')
    qualifying['q_rel_gap'] = (qualifying['q1'] - qualifying['pole_time']) / qualifying['pole_time']

    print("4. Merging master dataset...")

    df = pd.merge(results[['raceId', 'driverId', 'constructorId', 'grid', 'positionOrder', 'unified_main_points', 'res_fLapTime_ms']],
                  races[['raceId', 'year', 'round', 'circuitId']], on='raceId')

    df = pd.merge(df, drivers[['driverId', 'dob', 'driverRef']], on='driverId')

    # Left joins from here on: not every result has qualifying, sprint or
    # pit-stop data.
    df = pd.merge(df, qualifying[['raceId', 'driverId', 'q_rel_gap', 'position']],
                  on=['raceId', 'driverId'], how='left').rename(columns={'position': 'qual_pos'})

    df = pd.merge(df, sprint[['raceId', 'driverId', 'sprint_target', 'spr_ms', 'spr_fLapTime_ms', 'spr_points_unified']],
                  on=['raceId', 'driverId'], how='left')

    df = pd.merge(df, pit_summary, on=['raceId', 'driverId'], how='left')

    print("5. Driver age and categorical variable processing...")

    df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
    df['age_at_race'] = df['year'] - df['dob'].dt.year
    # NOTE: from here on the 'dob' column holds the driver's age (in whole
    # calendar years) rather than a date; 'age_at_race' is dropped below.
    df['dob'] = df['age_at_race']

    le = LabelEncoder()
    df['driver_encoded'] = le.fit_transform(df['driverRef'].astype(str))

    drop_cols = ['age_at_race', 'driverRef', 'number', 'positionText', 'res_fLapTime', 'statusId']
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    # Turn the '\N' missing-value marker into NaN, then force every column to
    # a numeric dtype (anything unparseable becomes NaN).
    df = df.replace(r'\N', np.nan)
    df = df.apply(pd.to_numeric, errors='coerce')

    # Imputation: mean gap, position 20 for missing qualifying, 0 elsewhere.
    df['q_rel_gap'] = df['q_rel_gap'].fillna(df['q_rel_gap'].mean())
    df['qual_pos'] = df['qual_pos'].fillna(20)
    df = df.fillna(0)

    df = df.sort_values(['driverId', 'year', 'round'])

    print("6. Splitting modern/historical datasets and scaling...")

    # sprint_target was filled with 0 where no sprint result exists, so "> 0"
    # selects exactly the rows that have one.
    modern_train = df[(df['year'] >= 2021) & (df['sprint_target'] > 0)].copy()

    historical_sim = df[df['year'] < 2021].copy()

    scaler = MinMaxScaler()

    feature_cols = ['grid', 'qual_pos', 'q_rel_gap', 'dob', 'constructorId', 'circuitId', 'pit_count']

    if not modern_train.empty:
        # Fit on the training rows only, then reuse the same scaling for the
        # historical rows.
        modern_train[feature_cols] = scaler.fit_transform(modern_train[feature_cols])
        historical_sim[feature_cols] = scaler.transform(historical_sim[feature_cols])

    # Make sure the destination exists so a fresh checkout does not fail here.
    os.makedirs(output_dir, exist_ok=True)
    modern_train.to_csv(os.path.join(output_dir, 'modern_train_ready.csv'), index=False)
    historical_sim.to_csv(os.path.join(output_dir, 'historical_sim_ready.csv'), index=False)

    print(f"\n--- Preprocessing Complete ---")
    print(f"Training Data (2021+): {len(modern_train)} rows")
    print(f"Simulation Data (Historical): {len(historical_sim)} rows")
    print(f"Key Columns: {df.columns.tolist()[:10]} ...")

    return modern_train, historical_sim


In [6]:
def preprocess_for_sprint_simulation_v2(data_path='../data', output_dir='../data_engineered'):
    """Build the v2 training / simulation tables with form-based features.

    Reads ``results.csv``, ``sprint_results.csv``, ``races.csv``,
    ``drivers.csv`` and ``qualifying.csv`` from ``data_path`` and derives,
    per result row:

    * ``driver_form``       -- mean unified points over the driver's previous
      (up to) 5 results, excluding the current one
    * ``team_form``         -- the same for the constructor's per-race totals
    * ``driver_experience`` -- number of earlier result rows for the driver
    * ``circuit_avg_pos``   -- expanding mean of the driver's earlier
      ``positionOrder`` at the same circuit (10 when there is no history)
    * ``q_rel_gap``, ``qual_pos``, ``age_at_race``, ``driver_encoded``

    The result is split into ``modern_train`` (``year >= 2021`` with a sprint
    result) and ``historical_sim`` (``year < 2021``). Feature columns are
    min-max scaled with a scaler fitted on ``modern_train`` only.

    Parameters
    ----------
    data_path : str
        Directory containing the input CSV files.
    output_dir : str
        Directory the two output CSV files are written to. It is created if
        it does not exist.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(modern_train, historical_sim)``.

    Side effects
    ------------
    Writes ``modern_train_ready_v2.csv`` and ``historical_sim_ready_v2.csv``
    into ``output_dir`` and prints progress messages.
    """
    print("1. Data loading and initialization...")
    results = pd.read_csv(os.path.join(data_path, 'results.csv'))
    sprint = pd.read_csv(os.path.join(data_path, 'sprint_results.csv'))
    races = pd.read_csv(os.path.join(data_path, 'races.csv'))
    drivers = pd.read_csv(os.path.join(data_path, 'drivers.csv'))
    qualifying = pd.read_csv(os.path.join(data_path, 'qualifying.csv'))

    # Re-score every race with one points table so results are comparable.
    results['unified_main_points'] = results['positionOrder'].apply(get_modern_main_points)

    print("2. Advanced feature engineering in progress...")
    full_data = pd.merge(results, races[['raceId', 'year', 'round', 'circuitId']], on='raceId')
    # Chronological order per driver; the rolling/expanding/cumcount features
    # below all rely on this row order.
    full_data = full_data.sort_values(['driverId', 'year', 'round'])

    # shift(1) keeps the current race out of its own form value.
    full_data['driver_form'] = full_data.groupby('driverId')['unified_main_points']\
        .transform(lambda x: x.rolling(window=5, min_periods=1).mean().shift(1))

    # Team totals per race, ordered by (year, round) within each constructor.
    # Ordering by raceId alone is not guaranteed to be chronological, which
    # would let later races leak into the rolling window.
    team_points = (
        full_data
        .groupby(['year', 'round', 'raceId', 'constructorId'])['unified_main_points']
        .sum()
        .reset_index()
        .sort_values(['constructorId', 'year', 'round'])
    )
    team_points['team_form'] = team_points.groupby('constructorId')['unified_main_points']\
        .transform(lambda x: x.rolling(window=5, min_periods=1).mean().shift(1))
    full_data = pd.merge(full_data, team_points[['raceId', 'constructorId', 'team_form']], on=['raceId', 'constructorId'])

    # 0 for a driver's first row, 1 for the second, and so on.
    full_data['driver_experience'] = full_data.groupby('driverId').cumcount()

    print("3. Merging master dataset...")

    for col in ['q1', 'q2', 'q3']:
        qualifying[col] = qualifying[col].apply(time_to_ms)

    # The reference time is the fastest q1 time of the race; q_rel_gap is each
    # driver's q1 deficit to it, expressed as a fraction of that reference.
    qualifying['pole_time'] = qualifying.groupby('raceId')['q1'].transform('min')
    qualifying['q_rel_gap'] = (qualifying['q1'] - qualifying['pole_time']) / qualifying['pole_time']

    qual_subset = qualifying[['raceId', 'driverId', 'q_rel_gap', 'position']].rename(columns={'position': 'qual_pos'})
    df = pd.merge(full_data, qual_subset, on=['raceId', 'driverId'], how='left')

    df = pd.merge(df, drivers[['driverId', 'dob', 'driverRef']], on='driverId')
    df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
    # Age in whole calendar years (race year minus birth year).
    df['age_at_race'] = df['year'] - df['dob'].dt.year

    sprint_min = sprint[['raceId', 'driverId', 'positionOrder']].rename(columns={'positionOrder': 'sprint_target'})
    df = pd.merge(df, sprint_min, on=['raceId', 'driverId'], how='left')

    # Missing qualifying position: fall back to the grid slot; a value of 0 or
    # a still-missing value becomes 20.
    df['qual_pos'] = df['qual_pos'].fillna(df['grid'])
    df['qual_pos'] = df['qual_pos'].replace(0, 20).fillna(20)

    # Missing gap: use the overall mean, or 0 when no gap is available at all.
    df['q_rel_gap'] = df['q_rel_gap'].fillna(df['q_rel_gap'].mean() if not df['q_rel_gap'].isna().all() else 0)

    # The first row of each driver / constructor has no history yet.
    df['driver_form'] = df['driver_form'].fillna(0)
    df['team_form'] = df['team_form'].fillna(0)

    df['circuit_avg_pos'] = df.groupby(['driverId', 'circuitId'])['positionOrder']\
        .transform(lambda x: x.expanding().mean().shift(1)).fillna(10)

    le = LabelEncoder()
    df['driver_encoded'] = le.fit_transform(df['driverRef'].astype(str))

    feature_cols = [
        'grid', 'qual_pos', 'q_rel_gap', 'age_at_race',
        'driver_form', 'team_form', 'driver_experience', 'circuit_avg_pos'
    ]

    modern_train = df[(df['year'] >= 2021) & (df['sprint_target'].notna())].copy()

    historical_sim = df[df['year'] < 2021].copy()

    scaler = MinMaxScaler()
    if not modern_train.empty:
        # Fit on the training rows only, then reuse the same scaling for the
        # historical rows.
        modern_train[feature_cols] = scaler.fit_transform(modern_train[feature_cols])
        if not historical_sim.empty:
            historical_sim[feature_cols] = scaler.transform(historical_sim[feature_cols])

    # Make sure the destination exists so a fresh checkout does not fail here.
    os.makedirs(output_dir, exist_ok=True)
    modern_train.to_csv(os.path.join(output_dir, 'modern_train_ready_v2.csv'), index=False)
    historical_sim.to_csv(os.path.join(output_dir, 'historical_sim_ready_v2.csv'), index=False)

    print("\n--- Preprocessing Complete ---")
    return modern_train, historical_sim

In [7]:
if __name__ == "__main__":
    # Baseline pipeline: returns (modern_train, historical_sim) and writes
    # modern_train_ready.csv / historical_sim_ready.csv.
    modern_ready, history_ready = preprocess_for_sprint_simulation()

    # v2 pipeline with the form-based features: returns the same pair of
    # frames and writes the *_v2.csv files.
    modern_ready_v2, history_ready_v2 = preprocess_for_sprint_simulation_v2()


1. Data loading and initialization...
2. Feature Engineering...
3. Era Normalization (q_rel_gap) calculation...
4. Merging master dataset...
5. Driver age and categorical variable processing...
6. Splitting modern/historical datasets and scaling...

--- Preprocessing Complete ---
Training Data (2021+): 360 rows
Simulation Data (Historical): 24960 rows
Key Columns: ['raceId', 'driverId', 'constructorId', 'grid', 'positionOrder', 'unified_main_points', 'res_fLapTime_ms', 'year', 'round', 'circuitId'] ...
1. Data loading and initialization...
2. Advanced feature engineering in progress...
3. Merging master dataset...

--- Preprocessing Complete ---
