# Model Selection for SLP Prediction

This notebook performs model selection to predict the `slp` column using various machine learning algorithms with time series cross-validation.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import clone
from lightgbm import LGBMRegressor
from joblib import Parallel, delayed
import joblib

import itertools
import warnings
warnings.filterwarnings('ignore')


## 1. Load and Prepare Data


In [2]:
# Load the dataset
dataset = 'dataset/data_v3_stat.csv'
df = pd.read_csv(dataset, sep=';', decimal=',')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Dataset shape: (1460, 15)

Columns: ['date', 'slp', 'temperature_2m_mean', 'sunrise', 'et0_fao_evapotranspiration', 'sunshine_duration', 'snowfall_sum', 'day_of_year', 'precipitation_hours', 'weathercode', 'windspeed_10m_max', 'rain_sum', 'holiday', 'day_of_year_sin', 'day_of_year_cos']


Unnamed: 0,date,slp,temperature_2m_mean,sunrise,et0_fao_evapotranspiration,sunshine_duration,snowfall_sum,day_of_year,precipitation_hours,weathercode,windspeed_10m_max,rain_sum,holiday,day_of_year_sin,day_of_year_cos
0,2021-10-01,638555.753,0.271326,0.132183,0.290285,0.515279,-0.160809,274,-0.763233,3,-0.068171,-0.475894,0,-0.999963,-0.008583
1,2021-10-02,556131.836,0.47233,0.153844,0.055099,0.393595,-0.160809,275,-0.763233,3,0.28256,-0.475894,0,-0.999963,0.008583
2,2021-10-03,487753.707,1.115543,0.164674,0.612119,-0.783915,-0.160809,276,-0.141327,51,1.063735,-0.385563,1,-0.999668,0.025748
3,2021-10-04,615408.403,0.338327,0.186334,-0.910402,-1.596531,-0.160809,277,0.065976,51,-0.052229,-0.174791,0,-0.999079,0.042905
4,2021-10-05,757789.344,0.097122,0.207995,-1.071319,-1.595248,-0.160809,278,1.931695,63,-0.450788,3.016891,0,-0.998195,0.060049


In [3]:
# Parse date and sort by date (important for time series)
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

In [4]:
# Separate features and target

if dataset == 'dataset/data_v3_stat.csv':
    X = df.drop(columns=['date', 'Residential (SLP)'])
    y = df['Residential (SLP)']
else:
    X = df.drop(columns=['date', 'slp'])
    y = df['slp']

# Define feature types for proper preprocessing
boolean_cols = ['holiday']
categorical_cols = ['weathercode']  # Leave as-is for tree-based models
cyclical_cols = ['day_of_week', 'day_of_year', 'winddirection_10m_dominant']

# All other columns are continuous and should be scaled
continuous_cols = [col for col in X.columns 
                   if col not in boolean_cols + categorical_cols + cyclical_cols]


KeyError: "['Residential (SLP)'] not found in axis"

## 3. Time Series Split

In [None]:
n_split = 10
tscv = TimeSeriesSplit(n_splits = n_split)

In [None]:
def evaluate_model_params(estimator, X, y, tscv):
    """Valuta un singolo estimator (clonato) con TimeSeriesSplit.
    Ritorna dizionario di metriche medie.
    """
    rmse_scores = []
    mae_scores = []
    r2_scores = []
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(estimator)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    return {
        'RMSE_mean': np.mean(rmse_scores),
        'RMSE_std': np.std(rmse_scores),
        'MAE_mean': np.mean(mae_scores),
        'MAE_std': np.std(mae_scores),
        'R2_mean': np.mean(r2_scores),
        'R2_std': np.std(r2_scores),
    }

In [None]:
def param_grid_iter(grid_dict):
    """Genera tutte le combinazioni dalla dict di liste come sklearn.model_selection.ParameterGrid.
    grid_dict: {'param': [v1,v2,...], ...}
    """
    keys = list(grid_dict.keys())
    for values in itertools.product(*(grid_dict[k] for k in keys)):
        yield dict(zip(keys, values))

In [None]:
def top_k_results(results_list, k=5, metric='R2_mean'):
    """Ordina la lista di dict (ognuno con 'params' e metriche) e ritorna top k.
    metric: campo su cui ordinare (default R2_mean decrescente).
    """
    return sorted(results_list, key=lambda r: r.get(metric, -np.inf), reverse=True)[:k]


In [None]:
def make_fine_grid_around(best_params, param_specs, factor=0.5, n_points=5):
    """Crea una lista di param dict per la fine search intorno ai best_params.
    param_specs for each param: {'type':'int'/'float'/'cat', 'bounds':(min,max)}
    factor: estensione percentuale (es 0.5 = +/-50%)
    n_points: quanti punti generare per ogni parametro
    """
    fine_specs = {}
    for p, spec in param_specs.items():
        best = best_params.get(p, None)
        if best is None:
            # se non presente, usa bounds
            lo, hi = spec.get('bounds', (None, None))
            if spec['type'] == 'cat':
                fine_specs[p] = spec['values']
            elif spec['type'] == 'int':
                fine_specs[p] = list(range(
                    max(1, int(lo)),
                    int(hi) + 1,
                    max(1, int((int(hi)-int(lo))//(n_points-1) if n_points>1 else 1))
                ))
            else:
                fine_specs[p] = list(np.linspace(lo, hi, n_points))
            continue

        if spec['type'] == 'cat':
            fine_specs[p] = spec['values']

        elif spec['type'] == 'int':
            lo = max(spec['bounds'][0], int(best - max(1, factor * best)))
            hi = min(spec['bounds'][1], int(best + max(1, factor * best)))
            if lo >= hi:
                fine_specs[p] = [int(best)]
            else:
                fine_specs[p] = sorted(list(set([int(x) for x in np.linspace(lo, hi, n_points)])))

        else:  # float
            lo = max(spec['bounds'][0], best * (1 - factor))
            hi = min(spec['bounds'][1], best * (1 + factor))
            fine_specs[p] = list(np.linspace(lo, hi, n_points))

    combos = list(itertools.islice(param_grid_iter(fine_specs), 10000))
    return combos


# Define Models

In [None]:
models_space = {
    'RandomForest': {
        'estimator': RandomForestRegressor(random_state=42, n_jobs=-1),
        'coarse': {
            'n_estimators': [50, 100, 300],
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'max_features': ['sqrt', 'log2', 0.5]
        },
        'specs': {
            'n_estimators': {'type':'int', 'bounds':(10,1000)},
            'max_depth': {'type':'int', 'bounds':(3,50)},
            'min_samples_split': {'type':'int', 'bounds':(2,50)},
            'max_features': {'type':'cat', 'values':['sqrt','log2',0.2,0.3,0.4,0.5,None]}
        }
    },
    'GradientBoosting': {
        'estimator': GradientBoostingRegressor(random_state=42),
        'coarse': {
            'n_estimators': [100, 300, 800],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 8],
            'subsample': [0.6, 0.8, 1.0]
        },
        'specs': {
            'n_estimators': {'type':'int', 'bounds':(50,2000)},
            'learning_rate': {'type':'float', 'bounds':(1e-4,1.0)},
            'max_depth': {'type':'int', 'bounds':(1,20)},
            'subsample': {'type':'float', 'bounds':(0.3,1.0)}
        }
    },
    'LightGBM': {
        'estimator': LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
        'coarse': {
            'n_estimators': [100, 300, 1000],
            'learning_rate': [0.01, 0.05, 0.1],
            'num_leaves': [15, 31, 63],
            'max_depth': [-1, 5, 10]
        },
        'specs': {
            'n_estimators': {'type':'int', 'bounds':(50,3000)},
            'learning_rate': {'type':'float', 'bounds':(1e-4,1.0)},
            'num_leaves': {'type':'int', 'bounds':(6,2048)},
            'max_depth': {'type':'int', 'bounds':(-1,50)},
            'min_child_samples': {'type':'int', 'bounds':(1,100)}
        }
    }
}

In [None]:
def evaluate_param_set(estimator, params, X, y, tscv):
    est = clone(estimator).set_params(**params)
    metrics = evaluate_model_params(est, X, y, tscv)
    return {'params': params, **metrics}

In [None]:
def coarse_to_fine_search(name, model_info, X, y, tscv, top_k=5):
    print('\n' + '='*60)
    print(f'Inizio ricerca per: {name}')
    estimator = model_info['estimator']
    coarse_grid = model_info['coarse']
    specs = model_info['specs']

    # ------- COARSE SEARCH PARALLEL -------
    param_list = list(param_grid_iter(coarse_grid))
    print(f'Coarse grid size: {len(param_list)} combinazioni')

    coarse_results = Parallel(n_jobs=-1, verbose=10)(
        delayed(evaluate_param_set)(estimator, params, X, y, tscv)
        for params in param_list
    )

    # top-k
    top_coarse = top_k_results(coarse_results, k=top_k, metric='R2_mean')
    print('\nTop risultati (coarse):')
    for r in top_coarse:
        print(f"  R2={r['R2_mean']:.4f} — params={r['params']}")

    # ------- FINE SEARCH PARALLEL -------
    best_coarse = top_coarse[0]
    best_params = best_coarse['params']

    fine_param_list = make_fine_grid_around(best_params, specs, factor=0.5, n_points=7)
    print(f"\nFine grid size (limitata): {len(fine_param_list)} combinazioni\n")

    fine_results = Parallel(n_jobs=-1, verbose=10)(
        delayed(evaluate_param_set)(estimator, params, X, y, tscv)
        for params in fine_param_list
    )

    top_fine = top_k_results(fine_results, k=3, metric='R2_mean')
    print('\nTop risultati (fine):')
    for r in top_fine:
        print(f"  R2={r['R2_mean']:.4f} — params={r['params']}")

    best_final = top_fine[0]
    return {
        'coarse_results': coarse_results,
        'top_coarse': top_coarse,
        'fine_results': fine_results,
        'top_fine': top_fine,
        'best': best_final
    }


In [None]:
all_best = {}
for name, info in models_space.items():
    res = coarse_to_fine_search(name, info, X, y, tscv, top_k=3)
    best_entry = res['best']
    all_best[name] = best_entry
    # salva i risultati intermedi su disco per controllo laterale
    joblib.dump(res, f'results_{name}_coarse_to_fine.pkl')
    print(f"Risultati salvati in results_{name}_coarse_to_fine.pkl")



Inizio ricerca per: RandomForest
Coarse grid size: 108 combinazioni


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 112 concurrent workers.


[Parallel(n_jobs=-1)]: Done   6 out of 108 | elapsed:    6.0s remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  17 out of 108 | elapsed:    6.5s remaining:   34.6s
[Parallel(n_jobs=-1)]: Done  28 out of 108 | elapsed:    7.0s remaining:   19.9s
[Parallel(n_jobs=-1)]: Done  39 out of 108 | elapsed:    8.1s remaining:   14.2s
[Parallel(n_jobs=-1)]: Done  50 out of 108 | elapsed:    8.4s remaining:    9.8s
[Parallel(n_jobs=-1)]: Done  61 out of 108 | elapsed:    9.3s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done  72 out of 108 | elapsed:    9.8s remaining:    4.9s


KeyboardInterrupt: 

In [None]:
summary = []
for name, best in all_best.items():
    summary.append({
        'model': name,
        'R2_mean': best['R2_mean'],
        'RMSE_mean': best['RMSE_mean'],
        'MAE_mean': best['MAE_mean'],
        'best_params': best['params']
    })
summary_df = pd.DataFrame(summary).sort_values('R2_mean', ascending=False).reset_index(drop=True)
print('\n' + '='*80)
print('CONFRONTO FINALE: modelli ottimizzati')
print(summary_df)

In [None]:
print('\nFit e salvataggio dei modelli finali (su tutto il dataset):')
for idx, row in summary_df.iterrows():
    name = row['model']
    best_params = row['best_params']
    estimator = models_space[name]['estimator'].set_params(**best_params)
    print(f"  Fit model: {name} con params: {best_params}")
    estimator.fit(X, y)
    joblib.dump(estimator, f'best_model_{name}.pkl')
    print(f"  Salvato: best_model_{name}.pkl")

print('\nDONE')

Training and evaluating models...
   holiday  weathercode  temperature_2m_max  temperature_2m_min  \
0        0            3            0.389530            0.248483   
1        0            3            0.520484            0.218630   
2        1           51            0.972872            1.412763   
3        0           51            0.318100            0.382823   
4        0           63           -0.086669            0.532090   

   temperature_2m_mean  apparent_temperature_max  apparent_temperature_min  \
0             0.271326                  0.228208                  0.105172   
1             0.472330                  0.519480                  0.251962   
2             1.115543                  0.911192                  1.022609   
3             0.338327                  0.368822                  0.472147   
4             0.097122                  0.127769                  0.594472   

   apparent_temperature_mean   sunrise    sunset  ...  windspeed_10m_max  \
0                 