# Model Hyperparameter tunning
In this notebook the hyperparameter tunning for each model is performed and visualized

In [2]:
import os
import time
import polars as pl
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from scipy.stats import randint as sp_randint, uniform as sp_uniform
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,KFold
from sklearn.metrics import mean_squared_error

dataset_path = os.path.join('..','Data','Final_Dataset','final_dataset.parquet')
dataset = pl.read_parquet(dataset_path)
pollutants_cols = [col for col in dataset.columns if col.startswith('MONTHLY')]

## Feature Selection

In [None]:
feature_cols = [
        "EURO_1", "EURO_2", "EURO_3", "EURO_4", "EURO_5", "EURO_6", "EURO_CLEAN",
        "Previous","TotalFleet"]

context_cols = ["CITY_AREA","POPULATION","Population_density"]    
    
cars_per_surface = [
        "CARS_PER_KM2", "EURO_1_PER_KM2", "EURO_2_PER_KM2","EURO_3_PER_KM2","EURO_4_PER_KM2",
        "EURO_5_PER_KM2","EURO_6_PER_KM2","EURO_CLEAN_PER_KM2","Previous_PER_KM2"]
    
cars_per_capita = [
        "CARS_PER_CAPITA", "EURO_1_PER_CAPITA", "EURO_2_PER_CAPITA",
        "EURO_3_PER_CAPITA","EURO_4_PER_CAPITA","EURO_5_PER_CAPITA","EURO_6_PER_CAPITA",
        "EURO_CLEAN_PER_CAPITA","Previous_PER_CAPITA"]

blocks = {'basic' : feature_cols, 
         'context' : feature_cols+context_cols, 
         'per_capita': feature_cols+context_cols+cars_per_capita,
         'per_capita_non_context': feature_cols+cars_per_capita,
         'per_surface': feature_cols+context_cols+cars_per_surface,
         'per_surface_non_context': feature_cols+cars_per_surface,
         'complete':feature_cols+context_cols+cars_per_surface+cars_per_capita,
         'complete_non_context':feature_cols+cars_per_surface+cars_per_capita}


In [None]:
feature_cols = [
        "EURO_1", "EURO_2", "EURO_3", "EURO_4", "EURO_5", "EURO_6", "EURO_CLEAN",
        "Previous","TotalFleet"]

context_cols = ["CITY_AREA","POPULATION","Population_density"]    
    
cars_per_surface = [
        "CARS_PER_KM2", "EURO_1_PER_KM2", "EURO_2_PER_KM2","EURO_3_PER_KM2","EURO_4_PER_KM2",
        "EURO_5_PER_KM2","EURO_6_PER_KM2","EURO_CLEAN_PER_KM2","Previous_PER_KM2"]
    
cars_per_capita = [
        "CARS_PER_CAPITA", "EURO_1_PER_CAPITA", "EURO_2_PER_CAPITA",
        "EURO_3_PER_CAPITA","EURO_4_PER_CAPITA","EURO_5_PER_CAPITA","EURO_6_PER_CAPITA",
        "EURO_CLEAN_PER_CAPITA","Previous_PER_CAPITA"]

blocks = {'basic' : feature_cols, 
         'context' : feature_cols+context_cols, 
         'per_capita': feature_cols+context_cols+cars_per_capita,
         'per_capita_non_context': feature_cols+cars_per_capita,
         'per_surface': feature_cols+context_cols+cars_per_surface,
         'per_surface_non_context': feature_cols+cars_per_surface,
         'complete':feature_cols+context_cols+cars_per_surface+cars_per_capita,
         'complete_non_context':feature_cols+cars_per_surface+cars_per_capita}

outer_cv = KFold(5, shuffle=True, random_state=42)

dataset_path = os.path.join('..','Data','Final_Dataset','final_dataset.parquet')
results_list = []

dataset = pl.read_parquet(dataset_path)
pollutants_cols = [col for col in dataset.columns if col.startswith('MONTHLY')]
raw = dataset.to_pandas()

for pollutant_col in pollutants_cols:
    pollutant = pollutant_col.split('_')[0][7:]
    raw_aux = raw[raw[pollutant_col].notna()]
    for name, cols in blocks.items():
        print(f'Tunning the pollutant {pollutant} with the {name} block')
        X = raw_aux[cols]
        y = raw_aux[pollutant_col]

        est = LGBMRegressor(
            objective='regression',
            metric='rmse',
            random_state=42,
            n_jobs=1,
            verbosity=-1
        )
    
        param_dist = {
            'num_leaves':      sp_randint(2, 66),
            'max_depth':       sp_randint(3, 51),
            'learning_rate':   sp_uniform(0.01, 0.14),
            'n_estimators':    sp_randint(100, 601),
            'min_child_samples': sp_randint(1, 51),
            'boosting_type':   ['gbdt']
            }

        search = RandomizedSearchCV(
            est, param_dist,
            n_iter=200,
            cv=outer_cv,
            scoring='neg_root_mean_squared_error',
            random_state=1,
            n_jobs=-1,
            verbose=1
        )

        start = time.time()
        search.fit(X, y)
        
        elapsed = time.time() - start
        best_rmse = -search.best_score_
        best_params = search.best_params_
        
        results_list.append({
            'pollutant': pollutant,
            'block': name,
            'best_rmse': best_rmse,
            'best_params': best_params,
            'duration_s': elapsed
        })
        
        print(f"[{pollutant} - {name}] RMSE = {best_rmse:.5f}, elapsed time = {elapsed:.0f}s")

df_results = pd.DataFrame(results_list)
output_path = os.path.join('..','Models','Results','1st_stage.csv')
df_results.to_csv(output_path, index=False)

Tunning the pollutant CO with the per_capita_non_context block
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CO - per_capita_non_context] RMSE = 0.08705, elapsed time = 447s
Tunning the pollutant CO with the per_surface_non_context block
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CO - per_surface_non_context] RMSE = 0.08932, elapsed time = 389s
Tunning the pollutant CO with the complete_non_context block
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CO - complete_non_context] RMSE = 0.08774, elapsed time = 566s
Tunning the pollutant NO2 with the per_capita_non_context block
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[NO2 - per_capita_non_context] RMSE = 4.49534, elapsed time = 412s
Tunning the pollutant NO2 with the per_surface_non_context block
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[NO2 - per_surface_non_context] RMSE = 4.58809, elapsed time = 388s
Tunning the pollutant NO2 with 

In [51]:
output_path = os.path.join('..','Models','Results','1st_stage2.csv')
feature_selections = pl.read_csv(output_path)
output_path = os.path.join('..','Models','Results','1st_stagee.csv')
feature_selections2 = pl.concat([feature_selections,pl.read_csv(output_path)])

In [1]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

output_path = os.path.join('..','Models','Results','1st_stage.csv')
feature_selections = pl.read_csv(output_path)

scored = (
    feature_selections.with_columns(
        (100 * (1 - (pl.col("best_rmse") - pl.col("best_rmse").min().over("pollutant"))
                         / pl.col("best_rmse").min().over("pollutant")))
        .alias("score")
    )
)

heat = (
    scored.pivot(values="score", index="pollutant", columns="block").sort("pollutant")
)

matrix = heat.drop("pollutant").to_numpy()

vmin, vmax = 90, 100
cmap = plt.get_cmap("Blues").copy()
cmap.set_under("lightgrey")
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)

fig, ax = plt.subplots(figsize=(matrix.shape[1]*1.4,
                                matrix.shape[0]*0.6 + 2))
im = ax.imshow(matrix, cmap=cmap, norm=norm)

ax.set_xticks(np.arange(matrix.shape[1]),
              labels=heat.columns[1:], rotation=45, ha="right")
ax.set_yticks(np.arange(matrix.shape[0]),
              labels=heat["pollutant"])
ax.set_xlabel("Feature block")
ax.set_ylabel("Pollutant")
ax.set_title("Percent improvement over best feature-block combiation \n(colour scale 80–100)")

for i, pollutant in enumerate(heat["pollutant"]):
    for j, block in enumerate(heat.columns[1:]):
        rmse = (
            feature_selections
            .filter((pl.col("pollutant") == pollutant) &
                    (pl.col("block") == block))
            .select("best_rmse")
            .item()
        )
        ax.text(j, i, f"{rmse:.3g}", ha="center", va="center",
                color="black", fontsize=8)
fig.colorbar(im, ax=ax, shrink=0.8, label="% of best (90–100)")
plt.tight_layout()
output_path = os.path.join('..','Figures','Model_feature_selec2.png')
plt.savefig(output_path)
plt.show()

NameError: name 'os' is not defined

## Hyperparameter tunning

In [40]:
feature_selections.filter(pl.col('block') == 'per_capita')['best_params'].to_list()
pollutants = feature_selections['pollutant'].unique().to_list()
for pollutant in pollutants:
    params = feature_selections.filter((pl.col('pollutant') == pollutant) &
                                 (pl.col('block') == 'per_capita'))['best_params'].item()
    print(f'For the pollutant {pollutant} the best parameters where:')
    print(f'{params}')

For the pollutant CO the best parameters where:
{'boosting_type': 'gbdt', 'learning_rate': np.float64(0.09908068544826885), 'max_depth': 32, 'min_child_samples': 12, 'n_estimators': 545, 'num_leaves': 61}
For the pollutant O3 the best parameters where:
{'boosting_type': 'gbdt', 'learning_rate': np.float64(0.1148499725344171), 'max_depth': 42, 'min_child_samples': 4, 'n_estimators': 541, 'num_leaves': 45}
For the pollutant NO2 the best parameters where:
{'boosting_type': 'gbdt', 'learning_rate': np.float64(0.09912461600026562), 'max_depth': 42, 'min_child_samples': 7, 'n_estimators': 576, 'num_leaves': 54}
For the pollutant PM10 the best parameters where:
{'boosting_type': 'gbdt', 'learning_rate': np.float64(0.03607642959287393), 'max_depth': 48, 'min_child_samples': 7, 'n_estimators': 381, 'num_leaves': 52}
For the pollutant PM25 the best parameters where:
{'boosting_type': 'gbdt', 'learning_rate': np.float64(0.09952452332319162), 'max_depth': 19, 'min_child_samples': 20, 'n_estimators

In [None]:
outer_cv = KFold(5, shuffle=True, random_state=42)

dataset_path = os.path.join('..','Data','Final_Dataset','final_dataset.parquet')
results_list = []

dataset = pl.read_parquet(dataset_path)
pollutants_cols = [col for col in dataset.columns if col.startswith('MONTHLY')]
raw = dataset.to_pandas()

param_dist = {
    'num_leaves':      sp_randint(42, 76),
    'max_depth':       sp_randint(19, 56),
    'learning_rate':   sp_uniform(0.03, 0.15),
    'n_estimators':    sp_randint(350, 601),
    'min_child_samples': sp_randint(1, 26),
    'boosting_type':   ['gbdt']
    }

cols = [
        "EURO_1", "EURO_2", "EURO_3", "EURO_4", "EURO_5", "EURO_6", "EURO_CLEAN",
        "Previous","TotalFleet","CARS_PER_CAPITA", "EURO_1_PER_CAPITA", "EURO_2_PER_CAPITA",
        "EURO_3_PER_CAPITA","EURO_4_PER_CAPITA","EURO_5_PER_CAPITA","EURO_6_PER_CAPITA",
        "EURO_CLEAN_PER_CAPITA","Previous_PER_CAPITA"]

for pollutant_col in pollutants_cols:
    pollutant = pollutant_col.split('_')[0][7:]
    raw_aux = raw[raw[pollutant_col].notna()]
    print(f'Tunning the pollutant {pollutant} with the {name} block')
    X = raw_aux[cols]
    y = raw_aux[pollutant_col]

    est = LGBMRegressor(
        objective='regression',
        metric='rmse',
        random_state=42,
        n_jobs=1,
        verbosity=-1
    )

    param_dist = param_dist

    search = RandomizedSearchCV(
        est, param_dist,
        n_iter=500,
        cv=outer_cv,
        scoring='neg_root_mean_squared_error',
        random_state=1,
        n_jobs=-1,
        verbose=1
        )

    start = time.time()
    search.fit(X, y)

    elapsed = time.time() - start
    best_rmse = -search.best_score_
    best_params = search.best_params_

    results_list.append({
        'pollutant': pollutant,
        'best_rmse': best_rmse,
        'best_params': best_params,
        'duration_s': elapsed
    })

    print(f"[{pollutant}] RMSE = {best_rmse:.5f}, elapsed time = {elapsed:.0f}s")

df_results = pd.DataFrame(results_list)
output_path = os.path.join('..','Models','Results','2nd_stage.csv')
df_results.to_csv(output_path, index=False)

Tunning the pollutant CO with the complete_non_context block
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CO] RMSE = 0.08699, elapsed time = 2413s
Tunning the pollutant NO2 with the complete_non_context block
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[NO2] RMSE = 4.47082, elapsed time = 2385s
Tunning the pollutant O3 with the complete_non_context block
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[O3] RMSE = 8.29524, elapsed time = 1826s
Tunning the pollutant PM10 with the complete_non_context block
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[PM10] RMSE = 5.27250, elapsed time = 2233s
Tunning the pollutant PM25 with the complete_non_context block
Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[PM25] RMSE = 2.74320, elapsed time = 1840s


## Training of the models

In [2]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error

dataset = pd.read_parquet(os.path.join('..','Data','Final_Dataset','final_dataset.parquet'))

def parse_params(s:str)->dict:
    return eval(s,{"__builtins__":None},{"np":np})

hyper_params = pd.read_csv(
    os.path.join('..','Models','Results','2nd_stage.csv'),
    converters={"best_params": parse_params}
)

pollutants      = hyper_params['pollutant'].tolist()
pollutant_cols  = [c for c in dataset.columns if c.startswith('MONTHLY')]
feature_cols    = [
    "EURO_1","EURO_2","EURO_3","EURO_4","EURO_5","EURO_6","EURO_CLEAN",
    "Previous","TotalFleet","CARS_PER_CAPITA","EURO_1_PER_CAPITA",
    "EURO_2_PER_CAPITA","EURO_3_PER_CAPITA","EURO_4_PER_CAPITA",
    "EURO_5_PER_CAPITA","EURO_6_PER_CAPITA","EURO_CLEAN_PER_CAPITA",
    "Previous_PER_CAPITA"
]

for pollutant in pollutants:
    pollutant_col = next((c for c in pollutant_cols if pollutant in c), None)
    if pollutant_col is None:
        continue

    aux = dataset[dataset[pollutant_col].notna()]
    X, y = aux[feature_cols], aux[pollutant_col]

    best_params = hyper_params.loc[
        hyper_params['pollutant']==pollutant,'best_params'
    ].iloc[0]
    best_params['verbosity'] = -1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val   = lgb.Dataset(X_test,  label=y_test, reference=lgb_train)

    model = lgb.train(
        best_params,
        lgb_train,
        valid_sets=[lgb_val],
        num_boost_round=10_000,
    )

    model_path = os.path.join('..', 'Models',f'{pollutant}_model.txt')
    model.save_model(model_path)

    y_pred = model.predict(X_test)
    rmse   = np.sqrt(mean_squared_error(y_test, y_pred))
    rel_rmse = rmse / y.mean() * 100
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    print(f"LightGBM RMSE for {pollutant}: {rmse:.4f} and a MAPE of {mape:.2f}%")

LightGBM RMSE for CO: 0.0846 and a MAPE of 19.67%
LightGBM RMSE for NO2: 4.2901 and a MAPE of 17.47%
LightGBM RMSE for O3: 8.6618 and a MAPE of 17.22%
LightGBM RMSE for PM10: 4.9781 and a MAPE of 20.52%
LightGBM RMSE for PM25: 2.5077 and a MAPE of 20.68%


In [16]:
import os
import ast
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# ── 0. paths ────────────────────────────────────────────────────────────────
DATA_PATH   = os.path.join('..', 'Data', 'Final_Dataset', 'final_dataset.parquet')
PARAMS_CSV  = os.path.join('..', 'Models', 'Results', '2nd_stage.csv')
MODELS_DIR  = os.path.join('..', 'Models')
PLOTS_DIR   = os.path.join('..', 'Figures')
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

# ── 1. load data & hyper-parameters ─────────────────────────────────────────
dataset = pd.read_parquet(DATA_PATH)

def parse_params(s:str)->dict:
    return eval(s,{"__builtins__":None},{"np":np})

hyper_params = pd.read_csv(
    PARAMS_CSV,
    converters={'best_params': parse_params}
)

pollutants     = hyper_params['pollutant'].tolist()
pollutant_cols = [c for c in dataset.columns if c.startswith('MONTHLY')]

feature_cols = [
    "EURO_1","EURO_2","EURO_3","EURO_4","EURO_5","EURO_6","EURO_CLEAN",
    "Previous","TotalFleet","CARS_PER_CAPITA","EURO_1_PER_CAPITA",
    "EURO_2_PER_CAPITA","EURO_3_PER_CAPITA","EURO_4_PER_CAPITA",
    "EURO_5_PER_CAPITA","EURO_6_PER_CAPITA","EURO_CLEAN_PER_CAPITA",
    "Previous_PER_CAPITA"
]

# ensure datetime
dataset['date'] = pd.to_datetime(dataset['date'])

# ── 2. time-based splits ───────────────────────────────────────────────────
mask_dev  = dataset['date'] <  '2022-01-01'   # up to end-2021 for dev
mask_test = dataset['date'] >= '2022-01-01'   # 2022+ for hold-out

dev_df  = dataset.loc[mask_dev].copy()
test_df = dataset.loc[mask_test].copy()

split_idx = int(len(dev_df) * 0.9)
train_df  = dev_df.iloc[:split_idx]
val_df    = dev_df.iloc[split_idx:]

# ── 3. training loop ───────────────────────────────────────────────────────
for pollutant in pollutants:
    pollutant_col = next((c for c in pollutant_cols if pollutant in c), None)
    if pollutant_col is None:
        continue

    # filter out NA target rows
    train_rows = train_df[train_df[pollutant_col].notna()]
    val_rows   = val_df  [val_df  [pollutant_col].notna()]
    test_rows  = test_df [test_df [pollutant_col].notna()]

    X_train, y_train = train_rows[feature_cols], train_rows[pollutant_col]
    X_val,   y_val   = val_rows  [feature_cols], val_rows  [pollutant_col]
    X_test,  y_test  = test_rows [feature_cols], test_rows [pollutant_col]

    lgb_train = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
    lgb_val   = lgb.Dataset(X_val,   label=y_val,   free_raw_data=False)

    best_params = (
        hyper_params.loc[hyper_params['pollutant'] == pollutant, 'best_params']
        .iloc[0]
    )
    best_params.update(dict(verbosity=-1, metric='l2'))

    evals_result = {}

    model = lgb.train(
        best_params,
        lgb_train,
        num_boost_round=10_000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train', 'val'],
        callbacks=[
            lgb.early_stopping(200),
            lgb.log_evaluation(period=0),
            lgb.record_evaluation(evals_result)
        ]
    )

    # ── 4. final test-set evaluation ────────────────────────────────────────
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    rmse   = np.sqrt(mean_squared_error(y_test, y_pred))
    rel_rmse = rmse / y_test.mean() * 100
    mape   = mean_absolute_percentage_error(y_test, y_pred) * 100

    print(f"{pollutant:5s} | iter={model.best_iteration:4d} | "
          f"RMSE={rmse:.3f} | relRMSE={rel_rmse:.2f}% | MAPE={mape:.2f}%")

    # ── 5. plot learning curves ────────────────────────────────────────────
    train_loss = np.sqrt(evals_result['train']['l2'])
    val_loss   = np.sqrt(evals_result['val'  ]['l2'])

    plt.figure(figsize=(8, 4))
    plt.plot(train_loss, label='Train')
    plt.plot(val_loss,   label='Validation')
    plt.axvline(model.best_iteration, ls='--', c='k', label='Early-stop')
    plt.title(f"{pollutant.upper()} – Learning curves")
    plt.xlabel("Iteration")
    plt.ylabel("RMSE")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, f'{pollutant}_learning_curve.png'), dpi=200)
    plt.close()

    # ── 6. save model ──────────────────────────────────────────────────────
    model.save_model(os.path.join(MODELS_DIR, f'{pollutant}_model.txt'),
                     num_iteration=model.best_iteration)

print("\n✓ All pollutant models trained, evaluated, and saved.")


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	train's l2: 0.0289474	val's l2: 0.0338697
CO    | iter=   1 | RMSE=0.215 | relRMSE=66.37% | MAPE=79.89%
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[11]	train's l2: 48.3064	val's l2: 70.0482
NO2   | iter=  11 | RMSE=7.137 | relRMSE=39.62% | MAPE=51.22%
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[10]	train's l2: 99.1312	val's l2: 199.692
O3    | iter=  10 | RMSE=15.705 | relRMSE=31.57% | MAPE=28.09%
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	train's l2: 52.9622	val's l2: 48.4243
PM10  | iter=   1 | RMSE=9.292 | relRMSE=42.61% | MAPE=35.05%
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[5]	train's l2: 11.6318	val's l2: 15.4444
PM25  | iter=   5 | RMSE=3.734 | relRMSE=35.71% | 