In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Use seaborn's white-grid look for all figures drawn below.
sns.set_style(style='whitegrid')

## 1. Load Processed Data

In [None]:
# Load the feature table; the 'Compound' column is the prediction target and
# every other column is treated as a model input.
df = pd.read_csv('data/f1_tyre_features.csv')
print(f"Loaded {len(df):,} samples")
print(f"Features: {df.shape[1] - 1}")  # all columns except the target
print(f"\nTarget distribution:")
print(df['Compound'].value_counts())
print(f"\nData Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that emitted a stray "None" line).
df.info()

## 2. Prepare Data for Training

In [None]:
# Separate features and target
X = df.drop('Compound', axis=1)
y = df['Compound']

# Encode target variable: LabelEncoder maps each compound name to an integer
# code (its position in le_compound.classes_).
le_compound = LabelEncoder()
y_encoded = le_compound.fit_transform(y)

print(f"Feature shape: {X.shape}")
print(f"Target shape: {y_encoded.shape}")
print(f"\nCompound encoding:")
for i, compound in enumerate(le_compound.classes_):
    print(f"  {compound}: {i}")

# Save label encoder. joblib.dump does not create missing parent directories,
# so make sure 'model/' exists first (a fresh checkout may not have it).
os.makedirs('model', exist_ok=True)
joblib.dump(le_compound, 'model/label_encoder.pkl')
print("\n✓ Label encoder saved")

In [None]:
# Hold out 20% of the rows as a test set; stratifying on the encoded target
# keeps the class proportions the same in both partitions.
split_parts = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Per-class counts for each partition, ordered by encoded label.
train_counts = pd.Series(y_train).value_counts().sort_index()
test_counts = pd.Series(y_test).value_counts().sort_index()
print(f"\nTraining set distribution:")
print(train_counts)
print(f"\nTest set distribution:")
print(test_counts)

In [None]:
# Scale features: the scaler is fitted on the training partition only and the
# same transformation is then applied to the test partition.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler. joblib.dump does not create missing parent directories, so
# make sure 'model/' exists before writing.
os.makedirs('model', exist_ok=True)
joblib.dump(scaler, 'model/scaler.pkl')
print("✓ Feature scaler saved")

## 3. Train Multiple Models

In [None]:
# Candidate classifiers, keyed by the display name used in the reports below.
# The two boosting models share the same tree count, depth, learning rate and
# seed, so those settings are written once.
boosting_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'learning_rate': 0.1,
    'random_state': 42,
}

random_forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
)
gradient_boosting = GradientBoostingClassifier(**boosting_params)
xgboost_model = XGBClassifier(eval_metric='mlogloss', **boosting_params)

models = {
    'Random Forest': random_forest,
    'Gradient Boosting': gradient_boosting,
    'XGBoost': xgboost_model,
}

# Train and evaluate each model.
# results[name] holds the fitted model, its test accuracy, the mean and std of
# 5-fold CV accuracy on the training set, and the test-set predictions.
results = {}

# Every encoded class id, in the same order as le_compound.classes_. Passing
# this explicitly keeps the report aligned with target_names even when a rare
# class is missing from both y_test and the predictions (otherwise
# classification_report raises a ValueError about mismatched class counts).
class_labels = np.arange(len(le_compound.classes_))

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    print(f"{'='*60}")

    # Train
    model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = model.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)

    # Cross-validation (cross_val_score fits fresh clones, so the model fitted
    # above is left untouched).
    # NOTE(review): the folds reuse a scaler fitted on the whole training set;
    # wrap scaler + model in a Pipeline if strictly leak-free CV is required.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)

    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'predictions': y_pred
    }

    print(f"\nTest Accuracy: {accuracy:.4f}")
    print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print(f"\nClassification Report:")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=le_compound.classes_,
    ))

## 4. Select Best Model

In [None]:
# Compare models: one row per candidate with its test and CV scores.
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Test Accuracy': [r['accuracy'] for r in results.values()],
    'CV Mean': [r['cv_mean'] for r in results.values()],
    'CV Std': [r['cv_std'] for r in results.values()]
})

print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)
print(comparison_df.to_string(index=False))

# Select best model by mean cross-validated accuracy on the training set.
# Choosing on test accuracy would tune the decision to the hold-out set and
# make the test score reported for the winner optimistically biased; the test
# set is kept purely as a final, untouched estimate.
best_model_name = max(results, key=lambda x: results[x]['cv_mean'])
best_model = results[best_model_name]['model']

print(f"\n🏆 Best Model: {best_model_name}")
print(f"   CV Accuracy: {results[best_model_name]['cv_mean']:.4f}")
print(f"   Accuracy: {results[best_model_name]['accuracy']:.4f}")

## 5. Feature Importance Analysis

In [None]:
# Get feature importance from best model (only estimators that expose
# `feature_importances_` can be ranked this way).
if hasattr(best_model, 'feature_importances_'):
    # Pair each training column with its importance, most important first.
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(15)
    plt.barh(range(len(top_features)), top_features['Importance'])
    plt.yticks(range(len(top_features)), top_features['Feature'])
    plt.xlabel('Importance')
    # Title reports the number of bars actually drawn (at most 15).
    plt.title(f'Top {len(top_features)} Feature Importances - {best_model_name}')
    plt.gca().invert_yaxis()  # largest importance at the top
    plt.tight_layout()
    plt.savefig('data/feature_importance.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("\n✓ Feature importance plot saved")
else:
    # Make the skip visible instead of leaving the cell silently empty.
    print(f"\n{best_model_name} does not expose feature_importances_; skipping plot")

## 6. Confusion Matrix Visualization

In [None]:
# Confusion matrix for best model.
# `labels` pins the matrix to one row/column per encoded class, so its shape
# always matches the tick labels below even if some class never appears in
# y_test or in the predictions.
cm = confusion_matrix(
    y_test,
    results[best_model_name]['predictions'],
    labels=np.arange(len(le_compound.classes_)),
)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le_compound.classes_,
            yticklabels=le_compound.classes_)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Compound')
plt.xlabel('Predicted Compound')
plt.tight_layout()
plt.savefig('data/confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Confusion matrix saved")

## 7. Save Best Model

In [None]:
# Save the best model together with the exact training column order, which
# is needed to build inference inputs with matching columns.
# joblib.dump does not create missing parent directories, so make sure
# 'model/' exists before writing.
os.makedirs('model', exist_ok=True)
joblib.dump(best_model, 'model/tyre_recommender.pkl')
joblib.dump(X.columns.tolist(), 'model/feature_columns.pkl')

print("✓ Model saved to model/tyre_recommender.pkl")
print("✓ Feature columns saved")

## 8. Test Model with Sample Predictions

In [None]:
# Hand-written scenarios for a quick sanity check of the trained model.
# Each scenario is a dict with a display 'name' and a 'features' mapping of
# feature name -> value; values are listed row-wise against the names below.
_SCENARIO_FEATURE_NAMES = (
    'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall_Binary',
    'TrackType_Encoded', 'TyreSeverity_Encoded', 'TotalCorners', 'TrackLength',
    'LapNumber', 'RaceProgress', 'Stint', 'TyreLife', 'StintPhase_Encoded',
    'TyreManagementScore', 'TyreDegradation', 'TempCompoundScore',
)

_SCENARIO_ROWS = (
    ('Hot Weather, Early Race',
     (35, 45, 40, 0,
      0, 2, 16, 5.0,
      5, 0.1, 1, 5, 0,
      0.8, 0.02, 5)),
    ('Cool Weather, Mid Race',
     (18, 25, 60, 0,
      0, 1, 18, 5.5,
      30, 0.5, 2, 15, 1,
      0.85, 0.05, -5)),
    ('Rainy Conditions',
     (15, 18, 95, 1,
      1, 0, 20, 4.5,
      20, 0.35, 2, 8, 1,
      0.75, 0.01, 0)),
)

sample_scenarios = [
    {'name': label, 'features': dict(zip(_SCENARIO_FEATURE_NAMES, values))}
    for label, values in _SCENARIO_ROWS
]

rule = "=" * 60
print("\n" + rule)
print("SAMPLE PREDICTIONS")
print(rule)

for scenario in sample_scenarios:
    # One-row frame, columns re-ordered to match the training matrix, then
    # pushed through the scaler fitted on the training data.
    row = pd.DataFrame([scenario['features']])[X.columns]
    row_scaled = scaler.transform(row)

    # Hard prediction (encoded class id) plus the per-class probabilities.
    predicted_code = best_model.predict(row_scaled)
    class_probs = best_model.predict_proba(row_scaled)[0]

    # Map the encoded id back to the compound name.
    predicted_compound = le_compound.inverse_transform(predicted_code)[0]

    print(f"\n{scenario['name']}:")
    print(f"  Recommended: {predicted_compound}")
    print(f"  Confidence:")
    for idx in range(len(le_compound.classes_)):
        compound = le_compound.classes_[idx]
        print(f"    {compound}: {class_probs[idx]:.2%}")

## 9. Model Summary

In [None]:
# Final recap: the chosen model, its test and cross-validation scores, the
# data set sizes, and the artefacts written to the model/ directory.
print("\n" + "="*60)
print("MODEL TRAINING COMPLETE")
print("="*60)
print(f"\nBest Model: {best_model_name}")
print(f"Test Accuracy: {results[best_model_name]['accuracy']:.4f}")
print(f"CV Accuracy: {results[best_model_name]['cv_mean']:.4f} (+/- {results[best_model_name]['cv_std']:.4f})")
print(f"\nTraining Samples: {len(X_train):,}")
print(f"Test Samples: {len(X_test):,}")
print(f"Features: {X.shape[1]}")
print(f"Classes: {len(le_compound.classes_)}")
print(f"\nSaved Files:")
print("  - model/tyre_recommender.pkl")
print("  - model/scaler.pkl")
print("  - model/label_encoder.pkl")
print("  - model/feature_columns.pkl")
# The checkmark was stored as mis-decoded bytes; restored to the intended
# character.
print("\n✓ Model is ready for deployment!")