# **Baseline Simple MLP with just MSE**


## **Running the models using the 'modelling' package**

A notebook through which different modelling configurations can be run, using the ``modelling`` package. It follows the steps of:
- preparing packages;
- setting "global" variables;
- getting the data;
- defining hyperparameters;
- running an Optuna hyperparameter optimisation and/or training a model; and
- evaluation.

In the modelling package, variations can be made to the models and training functions to experiment. Don't forget to restart the notebook kernel after making changes there.

## **IMPORTANT NOTE**: 
- do preprocessing from ``preprocess.ipynb`` to obtain data in ``data/data_combined``, before starting this notebook
- make sure the notebook is under the ``src`` directory before running!
- change the global variables defined below for the desired years of data, loss function and NN type



In [1]:
print("Starting script...")


from modelling.MLP import BasicMLP
from modelling import *


import optuna
import threading
import os
from pathlib import Path
import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset

Starting script...

Running __init__.py for data pipeline...
Modelling package initialized



Use GPU when available

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print("Device: ", device)

Device:  cpu


### **Set "global" variables**

In [3]:
# Display the current working directory as a sanity check: the notes at the
# top of this notebook ask for it to be run from the ``src`` directory.
Path.cwd()

PosixPath('/home/rachel/forecasting_smog_PEML/src')

In [4]:
import importlib
import config
# Re-execute config.py so that edits made to it since it was first imported
# are picked up without restarting the kernel.
importlib.reload(config)

<module 'config' from '/home/rachel/forecasting_smog_PEML/src/config.py'>

In [5]:
from config import *

In [6]:
# Set to True when using HABROK: all stdout is then printed to a .txt file
# to log progress.
HABROK = False



## MODIFY THESE GLOBAL VARIABLES FOR YOUR MODEL SCENARIO
all other variables are defined in config.py

LOSS_FUNC: choose from 
- MSE
- LinearShift_MSE
- PDE_nmer_const
- PDE_nmer_piece
- PINN

CITY: choose from
- Utrecht
- Amsterdam (for testing transferability)
- Multi (extended area around Utrecht)

In [7]:
# Years of data per split -- adjust these to the data you want to use.
YEARS = [2017, 2018, 2020, 2021, 2022, 2023]
TRAIN_YEARS = [2017, 2018, 2020, 2021, 2022]
VAL_YEARS = [2021, 2022, 2023]
TEST_YEARS = [2021, 2022, 2023]

# Scenario labels: used by loss.py and in the generated file names.
LOSS_FUNC = "MSE"  # pick one of the LOSS_FUNC options listed above
NN_TYPE = "MLP"
CITY = "Utrecht"

## Automated Generation of paths and filenames according to data years, loss func, NN type
- will be used throughout the whole notebook
- check ``config.py`` for global variables defined outside the notebook

In [8]:
# Derive every path and file name used below from the scenario settings.
(
    years,
    idx_dict,
    station_names,
    main_station,
    RESULTS_PATH,
    MODEL_PATH,
    DATASET_PATH,
    MINMAX_PATH,
    Y_PHY_FILENAME,
    MODEL_PATH_NAME,
    RESULTS_METRICS_FILENAME,
    BESTPARAMS_FILENAME,
    PLOT_FILENAME,
) = init_paths(CITY, YEARS, LOSS_FUNC, NN_TYPE)

# Echo each derived value so the configuration can be checked at a glance.
for _label, _value in (
    ("years", years),
    ("idx_dict", idx_dict),
    ("station_names", station_names),
    ("main_station", main_station),
    ("RESULTS_PATH", RESULTS_PATH),
    ("MODEL_PATH", MODEL_PATH),
    ("MINMAX_PATH", MINMAX_PATH),
    ("DATASET_PATH", DATASET_PATH),
    ("Y_PHY_FILENAME", Y_PHY_FILENAME),
    ("MODEL_PATH_NAME", MODEL_PATH_NAME),
    ("RESULTS_METRICS_FILENAME", RESULTS_METRICS_FILENAME),
    ("BESTPARAMS_FILENAME", BESTPARAMS_FILENAME),
    ("PLOT_FILENAME", PLOT_FILENAME),
):
    print(f"{_label}: ", _value)

years:  allyears
idx_dict:  {'NO2_TUINDORP_IDX': 5, 'NO2_BREUKELEN_IDX': 4, 'WIND_DIR_IDX': 0, 'WIND_SPEED_IDX': 2}
station_names:  ['tuindorp', 'breukelen']
main_station:  breukelen
RESULTS_PATH:  /home/rachel/forecasting_smog_PEML/src/results/Utrecht
MODEL_PATH:  /home/rachel/forecasting_smog_PEML/src/results/Utrecht/models
MINMAX_PATH:  /home/rachel/forecasting_smog_PEML/data/data_combined/Utrecht/all_years/pollutants_minmax_allyears.csv
DATASET_PATH:  /home/rachel/forecasting_smog_PEML/data/data_combined/Utrecht/all_years
Y_PHY_FILENAME:  y_phy_batchsize16_MSE_allyears_Utrecht
MODEL_PATH_NAME:  best_MLP_no2_MSE_allyears_Utrecht.pth
RESULTS_METRICS_FILENAME:  results_MLP_no2_MSE_allyears_Utrecht.csv
BESTPARAMS_FILENAME:  best_params_MLP_no2_MSE_allyears_Utrecht.txt
PLOT_FILENAME:  plot_MLP_no2_MSE_allyears_Utrecht.png


### **Load in data and create PyTorch *Datasets***

In [9]:
# Read the input ('u') and output ('y') frames of every split from
# DATASET_PATH. Which exact .csv files get extracted is decided by the lists
# in the get_dataframes() definition -- change them there to tune the selection.

train_input_frames, train_output_frames = (
    get_dataframes('train', kind, YEARS, DATASET_PATH) for kind in ('u', 'y')
)

val_input_frames, val_output_frames = (
    get_dataframes('val', kind, YEARS, DATASET_PATH) for kind in ('u', 'y')
)

test_input_frames, test_output_frames = (
    get_dataframes('test', kind, YEARS, DATASET_PATH) for kind in ('u', 'y')
)

print("Successfully loaded data")

Imported train_2017_combined_u.csv


Imported train_2018_combined_u.csv
Imported train_2020_combined_u.csv
Imported train_2021_combined_u.csv
Imported train_2022_combined_u.csv
Imported train_2017_combined_y.csv
Imported train_2018_combined_y.csv
Imported train_2020_combined_y.csv
Imported train_2021_combined_y.csv
Imported train_2022_combined_y.csv
Imported val_2021_combined_u.csv
Imported val_2022_combined_u.csv
Imported val_2023_combined_u.csv
Imported val_2021_combined_y.csv
Imported val_2022_combined_y.csv
Imported val_2023_combined_y.csv
Imported test_2021_combined_u.csv
Imported test_2022_combined_u.csv
Imported test_2023_combined_u.csv
Imported test_2021_combined_y.csv
Imported test_2022_combined_y.csv
Imported test_2023_combined_y.csv
Successfully loaded data


In [10]:
def _build_dataset(input_frames, output_frames, n_frames):
    """Wrap paired input/output frames in a TimeSeriesDataset.

    All splits share the same window settings (N_HOURS_U, N_HOURS_Y,
    N_HOURS_STEP); only the frames and their count differ per split.
    """
    return TimeSeriesDataset(
        input_frames,   # list of input dataframes
        output_frames,  # list of output dataframes
        n_frames,       # number of dataframes put in for both lists
                        # (the two lists must have this same length)
        N_HOURS_U,      # number of hours of input data
        N_HOURS_Y,      # number of hours of output data
        N_HOURS_STEP,   # number of hours between each input/output pair
    )


train_dataset = _build_dataset(train_input_frames, train_output_frames, len(TRAIN_YEARS))
val_dataset = _build_dataset(val_input_frames, val_output_frames, len(VAL_YEARS))
test_dataset = _build_dataset(test_input_frames, test_output_frames, len(TEST_YEARS))

# The frame lists are not used again below; drop the notebook-level names.
del train_input_frames, train_output_frames
del val_input_frames, val_output_frames
del test_input_frames, test_output_frames

## Confirmation that the dataset has column indexes the same as those in ``config.py``
Indexes are used mainly for the physics calculations, in order to accurately extract the information needed

In [11]:
# Labels of train_dataset.u[0] as a plain list (presumably the column names
# of the first training input frame -- TODO confirm), checked against idx_dict.
column_names = [*train_dataset.u[0]]
check_station_indexes(column_names, idx_dict)

NO2_TUINDORP_IDX index matches in index: 5
NO2_BREUKELEN_IDX index matches in index: 4
WIND_DIR_IDX index matches in index: 0
WIND_SPEED_IDX index matches in index: 2
All station indexes match.


True

In [12]:
import random


def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode.

    Args:
        seed: integer seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [13]:
import json
# Full path of the file holding the tuned hyperparameters for this scenario
# (it is read as JSON in the next cell).
file_path = f"{RESULTS_PATH}/best_params/{BESTPARAMS_FILENAME}"

## Read params from file

In [14]:

# Announce the source file *before* reading it, so that a failed read still
# shows which file was attempted.
print(f"Loading best params from {file_path}")

# The file holds a JSON object; json.load turns it into a dictionary.
with open(file_path, "r", encoding="utf-8") as f:
    best_params = json.load(f)

print("Loaded Best Parameters:", best_params)

Loading best parms from /home/rachel/forecasting_smog_PEML/src/results/Utrecht/best_params/best_params_MLP_no2_MSE_allyears_Utrecht.txt
Loaded Best Parameters: {'n_hidden_layers': 2, 'n_hidden_units': 256, 'lr': 8.394595694372765e-05, 'weight_decay': 3.300707449214965e-07, 'batch_size': 16}


## Training and Saving Model
Model saved in ``src/results/Utrecht/models/best_MLP_no2_MSE_allyears_Utrecht.pth`` (i.e. ``MODEL_PATH/MODEL_PATH_NAME`` for the settings above)

In [16]:
# Seed every RNG first so the weight initialisation below is repeatable.
set_seed(42)

# Build the baseline MLP from the tuned architecture hyperparameters.
_mlp_kwargs = dict(
    N_INPUT_UNITS=train_dataset.__n_features_in__(),
    N_HIDDEN_LAYERS=best_params["n_hidden_layers"],
    N_HIDDEN_UNITS=best_params["n_hidden_units"],
    N_OUTPUT_UNITS=train_dataset.__n_features_out__(),
    loss_function="MSE",
)
best_model_baseline = BasicMLP(**_mlp_kwargs)

# Train and validation loaders share the tuned batch size; only the training
# loader shuffles its samples.
_batch_size = best_params["batch_size"]
train_loader = DataLoader(train_dataset, batch_size=_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=_batch_size, shuffle=False)


## Plot Train-Val
Plot saved in ``src/results/trainval_plots/trainval_plot_MLP_no2_MSE_allyears.png``


## Test and Save Results
Results saved in ``src/results/Utrecht/metrics/results_MLP_no2_MSE_allyears_Utrecht.csv`` (i.e. ``RESULTS_PATH/metrics/RESULTS_METRICS_FILENAME`` for the settings above)

In [17]:
# Restore the saved weights into the model, then switch it to evaluation mode.
_state_dict = torch.load(f"{MODEL_PATH}/{MODEL_PATH_NAME}", map_location = device)
best_model_baseline.load_state_dict(_state_dict)
print(f"Loading best model of {NN_TYPE} {LOSS_FUNC} {years} from {MODEL_PATH}/{MODEL_PATH_NAME}")
best_model_baseline.eval()

# Test data is served in order, using the tuned batch size.
test_loader = DataLoader(
    test_dataset,
    batch_size=best_params["batch_size"],
    shuffle=False,
)

# The "min"/"max" columns read from MINMAX_PATH are handed to test_model
# (presumably to undo the min-max scaling of the targets -- TODO confirm).
df_minmax = pd.read_csv(MINMAX_PATH, sep=';')
min_value, max_value = df_minmax["min"].values, df_minmax["max"].values

# NOTE(review): the evaluation device is fixed to "cpu" here, independent of
# the `device` variable used for map_location above.
(
    mse,
    rmse,
    smape,
    inference_time_mean,
    inference_time_std,
) = best_model_baseline.test_model(
    test_loader,
    min_value=min_value,
    max_value=max_value,
    device="cpu",
)



Loading best model of MLP MSE allyears from /home/rachel/forecasting_smog_PEML/src/results/Utrecht/models/best_MLP_no2_MSE_allyears_Utrecht.pth
Test MSE Loss: 49.193053
Test RMSE Loss: 7.013776
Test SMAPE Loss: 28.573230%
Mean Inference Time per Forward Pass: 0.017123 s ± 0.009056 s


In [18]:
import csv

# Metrics file for this scenario; it is expected to hold one header line and
# one row of values.
results_csv_path = f"{RESULTS_PATH}/metrics/{RESULTS_METRICS_FILENAME}"

# Read the existing header and row (newline="" is what the csv module expects).
with open(results_csv_path, mode="r", newline="") as f:
    reader = csv.reader(f)
    original_header = next(reader)
    original_row = next(reader)

# Convert to dict for easy manipulation, then store both timing values.
data = dict(zip(original_header, original_row))
data["Inference Time"] = str(inference_time_mean)
data["Inference Time Std"] = str(inference_time_std)

# Rebuild the header with "Inference Time Std" placed right after
# "Inference Time". Any "Inference Time Std" column already in the file is
# skipped and re-inserted exactly once, so re-running this cell updates the
# value instead of appending another duplicate column.
new_header = []
for col in original_header:
    if col == "Inference Time Std":
        continue
    new_header.append(col)
    if col == "Inference Time":
        new_header.append("Inference Time Std")

# If the file had no "Inference Time" column at all, append the timing columns
# at the end rather than silently dropping the measured values.
for col in ("Inference Time", "Inference Time Std"):
    if col not in new_header:
        new_header.append(col)

new_row = [data[col] for col in new_header]

# Write the updated CSV back in place.
with open(results_csv_path, mode="w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(new_header)
    writer.writerow(new_row)


print(f"Results saved as {RESULTS_METRICS_FILENAME} in Results/metrics folder")

Results saved as results_MLP_no2_MSE_allyears_Utrecht.csv in Results/metrics folder
