A. Dati del Circuito (circuits)
country:

Codifica one-hot per catturare pattern specifici (es. piloti bravi in circuiti stradali come Monaco).

Coordinate geografiche (lat, lng, alt):

Influenza su prestazioni (es. motori turbo a alta quota in Messico).

B. Prestazioni Recenti (standings, constructor_standings)
Punti e vittorie (points, wins):

Media mobile ultime 5 gare per valutare la "forma" attuale.

Posizione nel campionato (position in standings):

Un pilota in top 3 potrebbe essere più aggressivo.

C. Qualifiche (qualifying)
Gap qualifica-gara (position vs grid):

Se un pilota parte 5° ma in qualifica era 1°, potrebbe avere penalità (motore/gearbox).

D. Affidabilità (results.statusId)
Tasso di ritiro per pilota/team:

Se un team ha il 30% di ritiri, riduci la previsione di posizione.

In [2]:
from relbench.datasets import get_dataset
from relbench.tasks import get_task
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np

# 1. Caricamento dati completo
dataset = get_dataset(name="rel-f1", download=True)
task = get_task("rel-f1", "driver-position", download=True)

# Estrazione di TUTTE le tabelle necessarie
def safe_get_table(table_name, columns=None):
    df = dataset.get_db().table_dict[table_name].df
    return df[columns] if columns else df

drivers = safe_get_table('drivers', ['driverId', 'constructorId', 'dob', 'nationality'])
results = safe_get_table('results', ['resultId', 'raceId', 'driverId', 'constructorId', 'grid', 'position', 'statusId', 'fastestLap', 'rank'])
races = safe_get_table('races', ['raceId', 'circuitId', 'date', 'year', 'round'])
qualifying = safe_get_table('qualifying', ['raceId', 'driverId', 'position'])
standings = safe_get_table('standings', ['raceId', 'driverId', 'points', 'wins', 'position'])
circuits = safe_get_table('circuits', ['circuitId', 'country', 'lat', 'lng', 'alt'])
constructor_standings = safe_get_table('constructor_standings', ['raceId', 'constructorId', 'points', 'wins'])

# 2. Feature Engineering Avanzato
def create_enhanced_features(df):
    # Merge a cascata con tutte le tabelle
    merged = (
        df.merge(results, on=['raceId', 'driverId'], how='left')
          .merge(qualifying, on=['raceId', 'driverId'], how='left', suffixes=('', '_qual'))
          .merge(standings, on=['raceId', 'driverId'], how='left', suffixes=('', '_stand'))
          .merge(races, on='raceId', how='left')
          .merge(circuits, on='circuitId', how='left')
          .merge(drivers, on='driverId', how='left')
          .merge(constructor_standings, on=['raceId', 'constructorId'], how='left', suffixes=('', '_const'))
    )
    
    # Calcolo feature esistenti
    merged['driver_avg'] = merged.groupby('driverId')['position'].transform('mean').fillna(20)
    merged['circuit_avg'] = merged.groupby(['driverId', 'circuitId'])['position'].transform('mean').fillna(20)
    merged['constructor_avg'] = merged.groupby('constructorId')['position'].transform('mean').fillna(20)
    
    # Nuove feature avanzate
    merged['recent_points'] = merged.groupby('driverId')['points'].transform(lambda x: x.rolling(3, min_periods=1).mean())
    merged['qualifying_gap'] = (merged['grid'] - merged['position_qual']).fillna(0)
    merged['reliability'] = merged.groupby('constructorId')['statusId'].transform(lambda x: (x == 1).mean()).fillna(0.9)  # 1 = Finished?
    merged['age_at_race'] = (pd.to_datetime(merged['date']) - pd.to_datetime(merged['dob'])).dt.days / 365
    merged['season_progress'] = merged['round'] / merged.groupby('year')['round'].transform('max')
    
    # Feature tecniche
    merged['fast_lap_ability'] = (merged['rank'] == 1).astype(int).fillna(0)
    merged['constructor_momentum'] = merged.groupby('constructorId')['points_const'].transform(lambda x: x.diff().rolling(3).mean()).fillna(0)
    
    # Selezione feature finali
    features = merged[[
        'driverId', 'constructorId', 'circuitId', 'grid',
        'driver_avg', 'circuit_avg', 'constructor_avg',
        'recent_points', 'qualifying_gap', 'reliability',
        'age_at_race', 'season_progress', 'fast_lap_ability',
        'constructor_momentum', 'country'
    ]]
    
    return features, merged['position']

# 3. Preparazione dati
train_table = task.get_table("train").df
val_table = task.get_table("val").df
test_table = task.get_table("test").df

# Aggiungi date per calcoli temporali
for table in [train_table, val_table, test_table]:
    table['date'] = pd.to_datetime(table['date'])

# Creazione dataset
X_train, y_train = create_enhanced_features(train_table)
X_val, y_val = create_enhanced_features(val_table)
X_test, y_test = create_enhanced_features(test_table)

# 4. Pipeline avanzata
numeric_features = [
    'grid', 'driver_avg', 'circuit_avg', 'constructor_avg',
    'recent_points', 'qualifying_gap', 'reliability',
    'age_at_race', 'season_progress', 'fast_lap_ability',
    'constructor_momentum'
]

categorical_features = [
    'driverId', 'constructorId', 'circuitId', 'country'
]

preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
])

model = make_pipeline(
    preprocessor,
    RandomForestRegressor(
        n_estimators=500,
        max_depth=15,
        min_samples_leaf=3,
        max_features=0.5,
        random_state=42,
        n_jobs=-1
    )
)

# 5. Addestramento e valutazione
print("\nAddestramento modello avanzato...")
print(f"Training samples: {len(X_train)}, Features: {len(numeric_features + categorical_features)}")
model.fit(X_train, y_train)

val_pred = model.predict(X_val)
test_pred = model.predict(X_test)

print("\nRisultati avanzati:")
print(f"Validation MAE: {mean_absolute_error(y_val, val_pred):.2f}")
print(f"Test MAE: {mean_absolute_error(y_test, test_pred):.2f}")

# 6. Analisi feature importance (opzionale)
if hasattr(model.steps[1][1], 'feature_importances_'):
    feature_names = (
        numeric_features + 
        list(model.named_steps['columntransformer'].named_transformers_['cat'].get_feature_names_out())
    )
    print("\nTop 10 feature importanti:")
    for name, score in sorted(zip(feature_names, model.steps[1][1].feature_importances_), 
                             key=lambda x: x[1], reverse=True)[:10]:
        print(f"{name}: {score:.3f}")

KeyError: "['constructorId'] not in index"