In [None]:
# # Install required packages.

# #!pip install torch-geometric==2.6.0 torch-sparse torch-scatter torch-cluster torch-spline-conv pyg-lib -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# #!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric==2.6.0 pyg-lib -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# !pip install torch-scatter -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# !pip install torch-cluster -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# !pip install torch-spline-conv -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# !pip install torch-geometric==2.6.0 -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# !pip install pyg-lib -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
# !pip install pytorch_frame[full]==1.2.2
# !pip install relbench[full]==1.0.0
# #!pip install --upgrade torch torchvision transformers
# #!pip install --upgrade torchvision transformers
# #!pip install torch==2.5.0 -f https://download.pytorch.org/whl/cpu/torch_stable.html
# #!pip install pyg_lib -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
# !pip uninstall -y pyg_lib torch  # Uninstall current versions
# !pip install torch==2.6.0  # Reinstall your desired PyTorch version
# !pip install --no-cache-dir git+https://github.com/pyg-team/pyg-lib.git # Install pyg-lib; --no-cache-dir ensures a fresh install

Looking in links: https://data.pyg.org/whl/torch-2.4.0+cpu.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.4.0%2Bcpu/torch_scatter-2.1.2%2Bpt24cpu-cp311-cp311-linux_x86_64.whl (543 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m543.5/543.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt24cpu
Looking in links: https://data.pyg.org/whl/torch-2.4.0+cpu.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.4.0%2Bcpu/torch_sparse-0.6.18%2Bpt24cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt24cpu


In [2]:
from relbench.datasets import get_dataset
from relbench.tasks import get_task
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# 1. Caricamento dati con controllo colonne
dataset = get_dataset(name="rel-f1", download=True)
task = get_task("rel-f1", "driver-position", download=True)

# Verifica contenuto tabelle
print("Colonne drivers:", dataset.get_db().table_dict['drivers'].df.columns.tolist())
print("Colonne results:", dataset.get_db().table_dict['results'].df.columns.tolist())

# 2. Estrazione dati con gestione sicura delle colonne
def safe_get_columns(df, columns):
    return df[[col for col in columns if col in df.columns]]

drivers = safe_get_columns(dataset.get_db().table_dict['drivers'].df, ['driverId', 'constructorId'])
results = safe_get_columns(dataset.get_db().table_dict['results'].df, ['driverId', 'raceId', 'grid', 'position', 'constructorId'])
races = safe_get_columns(dataset.get_db().table_dict['races'].df, ['raceId', 'circuitId', 'date'])

# 3. Feature engineering robusto
def create_features(df):
    # Merge a cascata con gestione errori
    try:
        merged = df.merge(results, on='driverId', how='left', suffixes=('', '_result')) \
                   .merge(races, on='raceId', how='left') \
                   .merge(drivers, on='driverId', how='left', suffixes=('', '_driver'))

        # Costruttore: prima da results, poi da drivers (se mancante)
        if 'constructorId' not in merged:
            merged['constructorId'] = merged.get('constructorId_driver', merged.get('constructorId_result', None))

        # Calcolo features con fallback
        merged['driver_avg'] = merged.groupby('driverId')['position'].transform('mean').fillna(20)
        merged['circuit_avg'] = merged.groupby(['driverId', 'circuitId'])['position'].transform('mean').fillna(20)
        merged['constructor_avg'] = merged.groupby('constructorId')['position'].transform('mean').fillna(20)

        features = merged[['driverId', 'circuitId', 'constructorId',
                         'driver_avg', 'circuit_avg', 'constructor_avg',
                         'grid']].copy()
        features.fillna({'grid': 20}, inplace=True)

        return features, merged['position']
    except Exception as e:
        print(f"Errore durante il merge: {str(e)}")
        raise

# 4. Caricamento e preparazione tabelle
train_table = task.get_table("train").df
val_table = task.get_table("val").df
test_table = task.get_table("test").df

# Aggiungi date alle tabelle principali per il merge
train_table['date'] = pd.to_datetime(train_table['date'])
val_table['date'] = pd.to_datetime(val_table['date'])
test_table['date'] = pd.to_datetime(test_table['date'])

# 5. Creazione dataset
X_train, y_train = create_features(train_table)
X_val, y_val = create_features(val_table)
X_test, y_test = create_features(test_table)

# 6. Pipeline con gestione valori mancanti
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'),
        ['driverId', 'circuitId', 'constructorId'])
    ],
    remainder='passthrough'
)

model = make_pipeline(
    preprocessor,
    RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1
    )
)

# 7. Addestramento con logging
print("\nInizio addestramento...")
print(f"Dimensione training set: {len(X_train)} esempi")
model.fit(X_train, y_train)

# 8. Valutazione
val_pred = model.predict(X_val)
test_pred = model.predict(X_test)

print("\nRisultati:")
print(f"Validation MAE: {mean_absolute_error(y_val, val_pred):.2f}")
#print(f"Test MAE: {mean_absolute_error(y_test, test_pred):.2f}")

Colonne drivers: ['driverId', 'driverRef', 'code', 'forename', 'surname', 'dob', 'nationality']
Colonne results: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionOrder', 'points', 'laps', 'milliseconds', 'fastestLap', 'rank', 'statusId', 'date']

Inizio addestramento...
Dimensione training set: 652621 esempi

Risultati:
Validation MAE: 2.92
