In [None]:
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation warnings — consider narrowing.
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn grid theme and a wide default figure
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_style('whitegrid')

## 1. Load Raw Data

In [None]:
# Load the collected data
df = pd.read_csv('data/f1_tyre_data.csv')
print(f"Loaded {len(df):,} rows")
print("\nData Overview:")
print(df.head())
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

## 2. Create Track Characteristics Database

In [None]:
# Hand-maintained track table, one row per venue:
# (merge key, circuit type, tyre severity, number of corners, lap length)
# NOTE(review): the key is later used as the 'Country' merge column, yet
# 'Miami' and 'Las Vegas' are venues rather than countries — confirm these
# labels match the values found in the race data.
_track_rows = [
    ('Bahrain', 'desert', 'high', 15, 5.412),
    ('Saudi Arabia', 'street', 'high', 27, 6.174),
    ('Australia', 'street', 'medium', 14, 5.278),
    ('Azerbaijan', 'street', 'medium', 20, 6.003),
    ('Miami', 'street', 'high', 19, 5.412),
    ('Monaco', 'street', 'low', 19, 3.337),
    ('Spain', 'permanent', 'high', 16, 4.675),
    ('Canada', 'street', 'medium', 14, 4.361),
    ('Austria', 'permanent', 'medium', 10, 4.318),
    ('Great Britain', 'permanent', 'high', 18, 5.891),
    ('Hungary', 'permanent', 'high', 14, 4.381),
    ('Belgium', 'permanent', 'medium', 19, 7.004),
    ('Netherlands', 'permanent', 'high', 14, 4.259),
    ('Italy', 'permanent', 'low', 11, 5.793),
    ('Singapore', 'street', 'high', 23, 4.940),
    ('Japan', 'permanent', 'medium', 18, 5.807),
    ('Qatar', 'permanent', 'high', 16, 5.380),
    ('United States', 'permanent', 'high', 20, 5.513),
    ('Mexico', 'permanent', 'medium', 17, 4.304),
    ('Brazil', 'permanent', 'high', 15, 4.309),
    ('Las Vegas', 'street', 'low', 17, 6.120),
    ('Abu Dhabi', 'permanent', 'high', 16, 5.281),
]

# Same mapping as before: key -> {'type', 'severity', 'corners', 'length'}
track_chars = {
    venue: {'type': circuit_type, 'severity': severity, 'corners': corners, 'length': length}
    for venue, circuit_type, severity, corners, length in _track_rows
}

# Convert to DataFrame: the dict keys become the 'Country' column and the
# per-track fields are renamed to the column names used by the rest of the notebook.
track_df = (
    pd.DataFrame.from_dict(track_chars, orient='index')
    .rename_axis('Country')
    .reset_index()
    .rename(columns={
        'type': 'TrackType',
        'severity': 'TyreSeverity',
        'corners': 'TotalCorners',
        'length': 'TrackLength',
    })
)

# Persist the track table next to the raw data, then echo it for inspection
track_csv_path = 'data/track_characteristics.csv'
track_df.to_csv(track_csv_path, index=False)
print("✓ Track characteristics saved", "\nTrack Characteristics:", sep="\n")
print(track_df)

## 3. Merge Track Characteristics with Race Data

In [None]:
# Merge track data.
# Track columns left over from a previous run of this cell are dropped first, so
# re-running it does not produce suffixed duplicates (TrackType_x / TrackType_y).
track_feature_cols = [col for col in track_df.columns if col != 'Country']
df = df.drop(columns=track_feature_cols, errors='ignore').merge(
    track_df, on='Country', how='left', validate='many_to_one'
)

# Report countries that have no entry in track_df before they receive defaults,
# so a naming mismatch between the two tables does not go unnoticed.
unmatched_countries = sorted(
    df.loc[df['TrackType'].isna(), 'Country'].dropna().astype(str).unique()
)
if unmatched_countries:
    print(f"⚠ No track characteristics for {unmatched_countries}; using defaults")

# Fill missing track data with defaults
df['TrackType'] = df['TrackType'].fillna('permanent')
df['TyreSeverity'] = df['TyreSeverity'].fillna('medium')
df['TotalCorners'] = df['TotalCorners'].fillna(df['TotalCorners'].median())
df['TrackLength'] = df['TrackLength'].fillna(df['TrackLength'].median())

print("✓ Track characteristics merged")
print(f"Final dataset shape: {df.shape}")

## 4. Engineer Driver Performance Features

In [None]:
# Calculate driver statistics per (Driver, Compound) pair.
# Laps from every track and race are pooled; the grouping does not include the track.
# Named aggregation ties each output column to its source, so the column labels
# no longer depend on the order of the aggregation spec.
driver_stats = (
    df.groupby(['Driver', 'Compound'])
    .agg(
        AvgLapTime=('LapTime', 'mean'),
        StdLapTime=('LapTime', 'std'),
        LapCount=('LapTime', 'count'),
        AvgTyreLife=('TyreLife', 'mean'),
    )
    .reset_index()
)

# Driver tyre management score (lower std = better management), in (0, 1].
# StdLapTime is NaN for a pair with a single lap, so the score is NaN there and
# those rows are removed by the dropna() in the feature-selection cell.
# NOTE(review): the score is keyed on 'Compound', which the feature-selection
# cell lists as the target — confirm this does not leak the label.
driver_stats['TyreManagementScore'] = 1 / (1 + driver_stats['StdLapTime'])

# Merge back to main dataframe.
# Dropping a score column from a previous run keeps this cell re-runnable
# (otherwise pandas would emit TyreManagementScore_x / _y).
df = df.drop(columns=['TyreManagementScore'], errors='ignore').merge(
    driver_stats[['Driver', 'Compound', 'TyreManagementScore']],
    on=['Driver', 'Compound'], how='left', validate='many_to_one'
)

print("✓ Driver performance features added")

## 5. Create Time-Based Features

In [None]:
# Race progress as a fraction of the race (LapNumber / TotalLaps).
# TotalLaps is the highest lap number recorded in the race across ALL drivers.
# Taking the maximum per driver would make a retired or lapped driver reach
# progress 1.0 on their own last lap.
race_lap_counts = (
    df.groupby(['Year', 'Round'], as_index=False)['LapNumber'].max()
    .rename(columns={'LapNumber': 'TotalLaps'})
)

# Drop TotalLaps from a previous run so re-executing the cell stays clean
df = df.drop(columns=['TotalLaps'], errors='ignore').merge(
    race_lap_counts, on=['Year', 'Round'], how='left', validate='many_to_one'
)
df['RaceProgress'] = df['LapNumber'] / df['TotalLaps']

# Stint phase (beginning, middle, end) by tyre age: [0, 5], (5, 15], (15, inf).
# include_lowest keeps TyreLife == 0 in 'early', and the open upper edge keeps
# very long stints in 'late'; previously both fell outside the bins and became NaN.
df['StintPhase'] = pd.cut(df['TyreLife'], bins=[0, 5, 15, np.inf],
                          labels=['early', 'middle', 'late'],
                          include_lowest=True)

print("✓ Time-based features added")

## 6. Create Tyre Performance Metrics

In [None]:
# Calculate lap time degradation for each stint
def calculate_degradation(group):
    """Return the per-lap degradation of one stint, aligned to ``group.index``.

    Degradation of a lap is its lap-time increase over the first row of the
    group, divided by that lap's ``TyreLife``. Laps whose ``TyreLife`` is not
    greater than zero (including missing values) get 0.

    Args:
        group: DataFrame holding the laps of a single stint, with ``LapTime``
            and ``TyreLife`` columns. The first row is used as the baseline,
            so the caller is expected to pass the laps in running order.

    Returns:
        pd.Series with the same index as ``group``. Groups with fewer than two
        laps yield all zeros.
    """
    if len(group) < 2:
        # Keep the caller's row labels: a fresh 0..n-1 index would attach the
        # value to unrelated rows when the result is assigned back to the frame.
        return pd.Series(0.0, index=group.index)

    tyre_life = group['TyreLife']
    lap_time_gain = group['LapTime'] - group['LapTime'].iloc[0]

    # Vectorised division; rows without a positive tyre life are forced to 0,
    # which also discards the inf/NaN produced by dividing by zero.
    return (lap_time_gain / tyre_life).where(tyre_life > 0, 0.0)

# A stint is identified by (Year, Round, Driver, Stint); laps are put in running
# order first because the degradation baseline is the first row of each stint.
stint_keys = ['Year', 'Round', 'Driver', 'Stint']
df = df.sort_values([*stint_keys, 'LapNumber'])

stint_degradation = df.groupby(stint_keys).apply(calculate_degradation)

# Strip the leading index levels contributed by the grouping keys so the values
# line up with df's own row labels on assignment.
key_levels = list(range(len(stint_keys)))
df['TyreDegradation'] = stint_degradation.reset_index(level=key_levels, drop=True)

# Temperature effect on compound
# Track temperature around which the score pivots.
# NOTE(review): presumably °C, matching the axis labels of the plots further down — confirm.
REFERENCE_TRACK_TEMP = 30

is_soft = df['Compound'] == 'SOFT'
is_hard = df['Compound'] == 'HARD'

# Start from a float column: the scores assigned below are floats, and writing
# them into an int column relies on silent upcasting that pandas has deprecated.
# Every compound other than SOFT/HARD (MEDIUM included) keeps the neutral 0.0.
# NOTE(review): this feature is derived from 'Compound', which the
# feature-selection cell lists as the target — confirm this does not leak the label.
df['TempCompoundScore'] = 0.0
# Soft works better in cooler temps
df.loc[is_soft, 'TempCompoundScore'] = REFERENCE_TRACK_TEMP - df.loc[is_soft, 'TrackTemp']
# Hard works better in hot temps
df.loc[is_hard, 'TempCompoundScore'] = df.loc[is_hard, 'TrackTemp'] - REFERENCE_TRACK_TEMP

print("✓ Tyre performance metrics added")

## 7. Encode Categorical Variables

In [None]:
# One LabelEncoder per categorical column.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes of
# TyreSeverity and StintPhase do not follow their natural low/medium/high or
# early/middle/late ordering — confirm that is acceptable for the model.
le_track_type, le_severity, le_stint_phase = LabelEncoder(), LabelEncoder(), LabelEncoder()

encoding_plan = [
    ('TrackType_Encoded', le_track_type, df['TrackType']),
    ('TyreSeverity_Encoded', le_severity, df['TyreSeverity']),
    # StintPhase is cast to str first, so missing phases become the label 'nan'
    ('StintPhase_Encoded', le_stint_phase, df['StintPhase'].astype(str)),
]
for encoded_col, encoder, raw_values in encoding_plan:
    df[encoded_col] = encoder.fit_transform(raw_values)

# Integer copy of the rainfall flag
rainfall_as_int = df['Rainfall'].astype(int)
df['Rainfall_Binary'] = rainfall_as_int

print("✓ Categorical encoding completed")

## 8. Feature Selection and Final Dataset

In [None]:
# Model inputs, grouped by theme; the target column goes last
weather_features = ['AirTemp', 'TrackTemp', 'Humidity', 'Rainfall_Binary']
track_features = ['TrackType_Encoded', 'TyreSeverity_Encoded', 'TotalCorners', 'TrackLength']
race_context_features = ['LapNumber', 'RaceProgress', 'Stint', 'TyreLife', 'StintPhase_Encoded']
driver_performance_features = ['TyreManagementScore', 'TyreDegradation', 'TempCompoundScore']
target_column = 'Compound'

feature_columns = [
    *weather_features,
    *track_features,
    *race_context_features,
    *driver_performance_features,
    target_column,
]

# Final dataset: an independent copy of the selected columns,
# keeping only rows where every one of them is present
df_features = df[feature_columns].copy().dropna()

print(f"Final feature dataset shape: {df_features.shape}")
# The target is part of feature_columns, hence the -1
print(f"\nFeatures: {len(feature_columns)-1}")
print(f"Samples: {len(df_features):,}")
print(f"\nTarget distribution:")
print(df_features[target_column].value_counts())

## 9. Visualize Feature Distributions

In [None]:
# Side-by-side box plots of air and track temperature, split by compound
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (column to plot, panel title, y-axis label) for each panel, left to right
panels = [
    ('AirTemp', 'Air Temperature by Compound', 'Air Temperature (°C)'),
    ('TrackTemp', 'Track Temperature by Compound', 'Track Temperature (°C)'),
]
for ax, (column, title, ylabel) in zip(axes, panels):
    df_features.boxplot(column=column, by='Compound', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Compound')
    ax.set_ylabel(ylabel)

# Blank out the figure-level title
plt.suptitle('')
plt.tight_layout()
plt.savefig('data/temp_compound_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Visualization saved")

In [None]:
# Pairwise correlations between the numeric columns, with the target excluded
numeric_features = [
    col
    for col in df_features.select_dtypes(include=[np.number]).columns.tolist()
    if col != 'Compound'
]
correlation_matrix = df_features[numeric_features].corr()

# Annotated heatmap, colour scale centred on zero
plt.figure(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Feature Correlation Matrix', fontsize=16, pad=20)
plt.tight_layout()
plt.savefig('data/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Correlation matrix saved")

## 10. Save Processed Data

In [None]:
# Save processed features
df_features.to_csv('data/f1_tyre_features.csv', index=False)
print("✓ Features saved to data/f1_tyre_features.csv")

# Save feature names (every selected column except the target)
feature_names = [col for col in feature_columns if col != 'Compound']
# No earlier cell reads from or writes to 'model/', so it may not exist yet;
# create it rather than letting joblib.dump fail with FileNotFoundError.
Path('model').mkdir(parents=True, exist_ok=True)
joblib.dump(feature_names, 'model/feature_names.pkl')
print("✓ Feature names saved")

# Summary
print("\n" + "="*60)
print("FEATURE ENGINEERING COMPLETE")
print("="*60)
print(f"Total Features: {len(feature_names)}")
print(f"Total Samples: {len(df_features):,}")
print(f"\nFeature List:")
for i, feat in enumerate(feature_names, 1):
    print(f"  {i:2d}. {feat}")
print("\n✓ Ready for model training!")