# Fine-Tuning on Cluster TSD Season
Notebook to perform fine-tuning on the best architectur for the TSD Season Cluster: trhougth time-series-decomposition, the season inside the cluster is extracted and used to make prediction.<br>
Data read are from table SLIDING_WINDOWS_DATASET that contains a sliding windows of:
- feats about last 7-days meteo values
- pollen value for the next day

Cluster associations are read from a local file: we have different cluster annotations made by different techniques.<br>
We explore different model & hyper-parameters throught Comet ML.

<h3>Import</h3>

In [2]:
from tqdm.auto import tqdm
import json
import math
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
from google.cloud import bigquery

import seaborn as sns
from collections import Counter
import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
from matplotlib.pyplot import get_cmap
from matplotlib import cm
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import ipywidgets as widgets
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error
from plotly.offline import init_notebook_mode, iplot

from comet_ml import Experiment, Optimizer

import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers.experimental import Adam, AdamW, Adadelta
from statsmodels.tsa.seasonal import seasonal_decompose

import warnings
warnings.filterwarnings('ignore')

my_cmap = plt.get_cmap("Paired")
init_notebook_mode(connected=True)  
tqdm.pandas()

2023-04-07 13:03:39.636243: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<h3>Config</h3>

In [3]:
# Config

PROJECT_ID = 'arpae-prod-ml'

# BigQuery
BQ_DATASET = 'SAMPLE_DATA'
JOINED_BQ_DATASET = 'JOINED_DATA'

# Const
COMMON_PERIOD_INIT = '2011-01-01'
COMMON_PERIOD_END = '2021-12-31' 

TRAIN_END = '2016-12-31 00:00:00+00:00'
VAL_END = '2019-12-31 00:00:00+00:00'
TEST_END = '2022-12-31 00:00:00+00:00'

# Cols
DATE_COL = 'date'

# Feats
METEO_FEATS = ['week_amax', 
               'station_lat_amax', 'station_lon_amax', 'station_H_piano_strada_amax', 'station_H_mslm_amax', 
               'B13011_min_amin', 'B13011_max_amax', 'B13011_mean_mean', 'B13011_std_mean', 'B13011_sum_sum', 
               'B14198_min_amin', 'B14198_max_amax', 'B14198_mean_mean', 'B14198_std_mean', 'B14198_sum_sum',
               'TEMP_min_amin', 'TEMP_max_amax', 'TEMP_mean_mean', 'TEMP_std_mean', 'TEMP_sum_sum',                                               
               'PREC_amin', 'PREC_mean', 'PREC_std', 'PREC_median', 'PREC_amax', 'PREC_skew', 'PREC_kurtosis']
POLLEN_FEATS = ['seasonal_mean', 'seasonal_prev_1', # seasonal, trend, residual
                'pol_value_amin', 'pol_value_mean', 'pol_value_std', 'pol_value_median', 'pol_value_amax', 
                'pol_value_skew', 'pol_value_kurtosis',
                'pol_value_prev_1', 'pol_value_prev_2', 'pol_value_prev_3',
                'pol_value_prev_4', 'pol_value_prev_5', 'pol_value_prev_6',
                'pol_value_prev_7']
ORIGINAL_FEATS = METEO_FEATS + POLLEN_FEATS
LABEL_COL = 'season_label' # season, trend, residual

# Params
EPOCHS = 50

# Comet Params
COMET_API_KEY = 'B4Tttbbx4JrwXD9x2HBNjCdXX'
COMET_WORKSPACE = 'pveronesi' 
COMET_PROJECT_NAME = 'arpae-tsd-season-finetuning' # season, trend, residual

# Layout
COLOR_PALETTE = px.colors.qualitative.Prism

OUTPUT_CLUSTER_FILENAME = "../../data/clustering_season_intervals.csv" # season, trend, residual


<h3>Methods</h3>

In [4]:
# Read Methods

def _run_query(client, query): 
    df = client.query(query).to_dataframe()
    return df

def _read_table(client, project_id, dataset, table):
    query = "SELECT * FROM `{}.{}.{}` ".format(project_id, dataset, table)
    df = _run_query(client, query)
    return df

def _read_table_delta(client, project_id, dataset, table, date_col, init, end):
    query = "SELECT * FROM `{}.{}.{}` WHERE {} > '{}' AND {} < '{}' ".format(project_id, dataset, table, date_col, init, date_col, end)
    df = _run_query(client, query)
    if 'reftime' in df.columns:
        df.sort_values(by='reftime', inplace=True)
    elif date_col in df.columns:
        df.sort_values(by=date_col, inplace=True)
    else:
        return None
    return df


In [5]:
# Comet methods

def _create_experiment(api_key, workspace, project_name):
    experiment = Experiment(
        # user config
        api_key=api_key,
        workspace=workspace,  
        # project config
        project_name=project_name,
        # logging config
        log_code=True,
        log_graph=True,
        auto_param_logging=True,
        auto_metric_logging=True,    
        auto_histogram_weight_logging=True,
        auto_histogram_gradient_logging=True,
        auto_histogram_activation_logging=True
    )
    return experiment

In [6]:
# Process Methods

def _create_cluster_widget(clusters):
    cluster_wdgt = widgets.Dropdown(options=clusters, description='Cluster id:', layout={"width":"50%"})
    return cluster_wdgt

def _normalize(x, range_dict, index_col, label_col):
    pol_min = range_dict[x[index_col]]['min']
    pol_max = range_dict[x[index_col]]['max']
    return (x[label_col] - pol_min) / (pol_max - pol_min)

In [7]:
# Data Methods

def _get_data(data_df, clusters_df, cluster_id, feats_cols, date_col, label_col):
    # filter data
    filt_clusters_df = clusters_df[clusters_df['cluster']==cluster_id][['station_id', 'pol_var_id']]
    dataset_df = pd.merge(data_df, filt_clusters_df, how='right', on=['station_id', 'pol_var_id'])
    
    # Create dataset
    dataset_df.sort_values(['station_id', 'pol_var_id', date_col], inplace=True)
    dataset_df = dataset_df[['station_id', 'pol_var_id', date_col]  + feats_cols + [label_col]]
    
    # Set index and drop nan
    dataset_df.set_index(date_col, inplace=True)
    dataset_df.dropna(inplace=True)
    
    print("Rows: {}".format(dataset_df.shape[0]))
    return dataset_df

def _prepare_data(dataset_df, original_feats, meteo_feats, pollen_feats, label_col):
    # Add 1-hot encoding cols
    stations_one_hot = pd.get_dummies(dataset_df['station_id'], prefix='station_id')
    pollen_one_hot = pd.get_dummies(dataset_df['pol_var_id'], prefix='pol_var_id')
    dataset_df = pd.concat([dataset_df, stations_one_hot], axis=1)
    dataset_df = pd.concat([dataset_df, pollen_one_hot], axis=1)
    
    # Update Features
    feats = original_feats + stations_one_hot.columns.values.tolist() + pollen_one_hot.columns.values.tolist()
    n_feats = len(feats)
    
    # Normalize all cols except for pollen ones; Save max cols to restore original values
    cols_max = []
    for col in meteo_feats:
        scaler = MinMaxScaler()
        dataset_df[col] = pd.DataFrame(scaler.fit_transform(dataset_df[[col]])).values
        cols_max.append(int(scaler.data_max_))

    # Normalize with Min-Max scaling each pollen feats 
    for col in tqdm(pollen_feats):
        range_df = dataset_df[['pol_var_id', col]].groupby('pol_var_id').agg(['min', 'max'])
        ranges = {}
        for index, values in zip(range_df.index, range_df.values):
            ranges[index] = {'min': values[0], 'max': values[1]}        
        dataset_df[col] = dataset_df.apply(lambda x: _normalize(x, ranges, 'pol_var_id', col), axis=1)    
    
    # Normalize with Min-Max scaling the label col
    range_df = dataset_df[['pol_var_id', LABEL_COL]].groupby('pol_var_id').agg(['min', 'max'])
    ranges = {}
    for index, values in zip(range_df.index, range_df.values):
        ranges[index] = {'min': values[0], 'max': values[1]}        
    dataset_df[label_col] = dataset_df.apply(lambda x: _normalize(x, ranges, 'pol_var_id', label_col), axis=1)    
    
    # Sort data
    dataset_df.index = pd.to_datetime(dataset_df.index)
    dataset_df.sort_values(by=['station_id', 'pol_var_id', 'date'], inplace=True)

    return dataset_df, feats, n_feats, cols_max

def _create_datasets(experiment, dataset_df, train_end, val_end, test_end, feats, label_col):
    # Get params
    sequence_len = experiment.get_parameter("sequence_len")
    batch_size = experiment.get_parameter("batch_size")
    
    # Split df into train and test sets
    train_df = dataset_df[dataset_df.index < pd.to_datetime(train_end)]
    val_df = dataset_df[(dataset_df.index > pd.to_datetime(train_end)) & 
                        (dataset_df.index < pd.to_datetime(val_end))]
    test_df = dataset_df[(dataset_df.index > pd.to_datetime(val_end)) & 
                         (dataset_df.index < pd.to_datetime(test_end))]
    print("Train dataset: {}, Val dataset: {}, Test dataset: {}".format(train_df.shape[0], 
                                                                        val_df.shape[0], 
                                                                        test_df.shape[0]))
    
    # Split into feats and labels
    train_X, train_y = train_df[feats].values, train_df[label_col]
    val_X, val_y = val_df[feats].values, val_df[label_col]
    test_X, test_y = test_df[feats].values, test_df[label_col]

    # Create Sliding-Windows Dataset
    train_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(train_X,
                                                                         train_y,
                                                                         sequence_length=sequence_len,
                                                                         batch_size=batch_size)
    val_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(val_X,
                                                                       val_y,
                                                                       sequence_length=sequence_len,
                                                                       batch_size=batch_size)
    test_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(test_X,
                                                                        test_y,
                                                                        sequence_length=sequence_len,
                                                                        batch_size=batch_size)
    
    return train_dataset, val_dataset, test_dataset, test_df


In [8]:
# Model Methods

def _lstm_base_model(experiment, n_feats):
    # Get params
    sequence_len = experiment.get_parameter("sequence_len")
    layer_lstm_1_units = experiment.get_parameter("layer_lstm_1_units")
    layer_lstm_2_units = experiment.get_parameter("layer_lstm_2_units")
    layer_dense_units = experiment.get_parameter("layer_dense_units")
    dropout = experiment.get_parameter("dropout")
    
    # Create Model
    input_layer = tf.keras.layers.Input(shape=(sequence_len, n_feats))
    x = tf.keras.layers.LSTM(units=layer_lstm_1_units, 
                                  dropout=dropout, 
                                  return_sequences=True)(input_layer)
    x = tf.keras.layers.LSTM(units=layer_lstm_2_units, 
                                  dropout=dropout, 
                                  return_sequences=True)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.Dense(units=layer_dense_units)(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    output_layer = tf.keras.layers.Dense(units=1)(x)
    model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer) 
    return model

# Losses Methods

def _compile_mse_model(experiment, model):
    # Get params
    learning_rate = experiment.get_parameter("learning_rate")
    # Compile
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='mse', optimizer=optimizer)
    early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)
    model.summary()
    return model, early_stop

# Training Methods

def _fit_model(model, epochs, train_dataset, val_dataset, callbacks):    
    history = model.fit(train_dataset, 
                        epochs=epochs,
                        validation_data=val_dataset, 
                        verbose=1, 
                        callbacks=[early_stop],
                        shuffle=True)
    return history
    
# Evaluation Methods

def _get_error(test_dataset):
    preds = model.predict(test_dataset).squeeze()
    truth = []
    for x, y in test_dataset:
        truth.extend(y.numpy())
    error = np.mean(np.abs(preds-np.array(truth)))
    return preds, error

def _feats_importances(test_dataset, feats, sequence_len):
    # Get baseline error
    feats_imp = []
    ff_preds = model.predict(test_dataset, verbose=0).squeeze()
    ff_x, ff_y = [], []
    for x, y in test_dataset:
        ff_x.extend(x.numpy())
        ff_y.extend(y.numpy())
    ff_x, ff_y = np.array(ff_x), np.array(ff_y)
    baseline_error = np.mean(np.abs(ff_preds-np.array(ff_y)))
    feats_imp.append({'feature':'BASELINE','mae': baseline_error})
    
    # Get features gain on reducing error: each value 
    for k in tqdm(range(len(feats))):
        # Change values for current feat
        save_col = ff_x[:,:,k].copy()
        ff_x[:,:,k] = [-100 for x in range(sequence_len)]
        # Compute error 
        oof_preds = model.predict(ff_x, verbose=0).squeeze() 
        mae = np.mean(np.abs(oof_preds-ff_y))
        feats_imp.append({'feature': feats[k],'mae': mae})
        ff_x[:,:,k] = save_col
    
    # Plot 
    df = pd.DataFrame(feats_imp)
    df = df.sort_values('mae')
    plt.figure(figsize=(10, 13))
    plt.barh(np.arange(len(feats)+1), df.mae)
    plt.yticks(np.arange(len(feats)+1), df.feature.values)
    plt.title('LSTM Feature Importance', size=16)
    plt.ylim((-1, len(feats)+1))
    plt.plot([baseline_error, baseline_error], [-1,len(feats)+1], '--', color='orange',
             label=f'Baseline OOF\nMAE={baseline_error:.3f}')
    plt.xlabel('MAE with feature permuted', size=14)
    plt.ylabel('Feature', size=14)
    plt.legend()
    plt.show()
    
def _predict(preds, test_df, label_col):
    try:
        # Predict
        for i in range(len(preds), test_df.shape[0]):
            preds = np.append(preds, 0.0)
        test_df['preds'] = preds
        # Plot some station & bcode
        n_sample = min(10, test_df[['station_id', 'pol_var_id']].drop_duplicates().shape[0])
        for station_id, pol_var_id in test_df[['station_id', 'pol_var_id']].drop_duplicates().sample(n_sample).values:
            curr_test_df = test_df[(test_df['station_id']==station_id) & (test_df['pol_var_id']==pol_var_id)]
            curr_test_df.sort_index(inplace=True)
            plt.figure(figsize=(15, 5))
            plt.title("{} - {}".format(station_id, pol_var_id))
            plt.plot(curr_test_df.index, curr_test_df['preds'], label='pred')
            plt.plot(curr_test_df.index, curr_test_df[label_col], label='truth')
            plt.ylim(0, 1)
            plt.legend()
            plt.show()
    except:
        pass


<h3>1. Config</h3>

<h4>1.1 Config BigQuery</h4>

In [9]:
# Setup Client

bq_client = bigquery.Client(project=PROJECT_ID)
bq_client

<google.cloud.bigquery.client.Client at 0x11069a820>

<h3>2. Read Data</h3>

<h4>2.1 Read Cluster file</h4>

In [9]:
clusters_df = pd.read_csv(OUTPUT_CLUSTER_FILENAME)
print(clusters_df.shape)
clusters_df.head(3)

(363, 4)


Unnamed: 0.1,Unnamed: 0,station_id,pol_var_id,cluster
0,0,1,B48001,4
1,1,1,B48002,3
2,2,1,B48003,0


<h4>2.2 Read Tables</h4>

<b>SLIDING_WINDOWS_DATASET</b> joins meteo features of last 7-days with the next-day pollen value.

In [10]:
# Read SLIDING_WINDOWS_DATASET

sliding_windows_dataset_df = _read_table_delta(bq_client, PROJECT_ID, JOINED_BQ_DATASET, 
                                               "SLIDING_WINDOWS_DATASET", "date",
                                               COMMON_PERIOD_INIT, COMMON_PERIOD_END)
sliding_windows_dataset_df['date'] = sliding_windows_dataset_df['date'].astype("str")
print(sliding_windows_dataset_df.shape)
sliding_windows_dataset_df.head(3)

(783015, 55)


Unnamed: 0,date,station_id,pol_var_id,date_diff,B13011_min_amin,B13011_max_amax,B13011_mean_mean,B13011_std_mean,B13011_sum_sum,B14198_min_amin,...,pol_value_prev_2,pol_value_prev_3,pol_value_prev_4,pol_value_prev_5,pol_value_prev_6,pol_value_prev_7,pol_value_label,season_label,trend_label,residual_label
463994,2011-01-02 00:00:00+00:00,8,B48029,9,0.0,11.4,0.01563,0.144917,18.6,-13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.338131,0.82937,0.508761
513242,2011-01-02 00:00:00+00:00,10,B48016,7,0.0,12.8,0.014118,0.146686,16.8,-12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.770304,3.846548,-1.076244
440463,2011-01-02 00:00:00+00:00,7,B48034,8,0.0,13.0,0.014454,0.153349,17.2,-15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.641494,6.968164,-2.32667


<h3>3. Run Experiments</h3>

We run a fine-tuning for each cluster.<br>

<h4>3.1 Config Experiment</h4>

In [48]:
# Set data

data_df = sliding_windows_dataset_df.copy()

In [49]:
# Get Clusters

clusters = sorted(clusters_df.cluster.unique())
print("Found {} clusters in data: {}".format(len(clusters), clusters))

Found 10 clusters in data: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [50]:
# Set fixed params

MODEL_NAME = 'LSTM_base'  # LSTM_base
LOSS = 'ADAM_MSE'                 # ADAM_MSE

In [51]:
# Set Hyper-parameters

max_combo = 10 
hyperparams = {
    "algorithm": "bayes",
    "parameters": {
        "sequence_len": {"type": "discrete", "values": [7, 14, 21]},
        "layer_lstm_1_units": {"type": "discrete", "values": [64, 128, 256, 512]},
        "layer_lstm_2_units": {"type": "discrete", "values": [64, 128, 256, 512]},
        "layer_dense_units": {"type": "discrete", "values": [64, 128, 256, 512]},        
        "dropout": {"type": "float", "min": 0.0, "max": 1.0},
        "batch_size": {"type": "discrete", "values": [256, 512, 1024]},
        "learning_rate": {"type": "float", "min": 0.0001, "max": 0.1},
    },
    "spec": {"maxCombo": max_combo, "objective": "minimize", "metric": "val_loss"},
    "trials": 1,
}

<h4>3.2 Run Experiment</h4>

In [53]:
# Run Experiment

errors = []
for cluster_id in tqdm(clusters):
    
    # Setup CometML Optimizer
    optimizer = Optimizer(
        hyperparams,
        api_key=COMET_API_KEY, 
        project_name=COMET_PROJECT_NAME,
        workspace=COMET_WORKSPACE,
        log_env_details=False,
        display_summary_level=0,
        verbose=0    
    )
    
    # Loop experiments
    for ith, experiment in tqdm(enumerate(optimizer.get_experiments(), start=1), total=max_combo):        
        model_id = "{}-Loss{}-Cluster{}-Exp{}".format(MODEL_NAME, LOSS, cluster_id, ith)    
        experiment.set_name(model_id)
        experiment.add_tag("cluster_id-{}".format(cluster_id))
        print("Running Experiment {}:".format(model_id))

        # Get Data
        print("\nGetting data..")
        dataset_df = _get_data(data_df, clusters_df, cluster_id, ORIGINAL_FEATS, DATE_COL, LABEL_COL)    

        # Prepare Data
        print("\nPreparing data..")
        dataset_df, FEATS, N_FEATS, cols_max = _prepare_data(dataset_df, ORIGINAL_FEATS, METEO_FEATS, 
                                                             POLLEN_FEATS, LABEL_COL)

        # Create Dataset
        print("\nCreating dataset..")
        train_dataset, val_dataset, test_dataset, test_df = _create_datasets(experiment, dataset_df, TRAIN_END, 
                                                                             VAL_END, TEST_END, FEATS, LABEL_COL)

        # Define Model
        print("\nDefining & Training model..")
        if MODEL_NAME == 'LSTM_base':
            model = _lstm_base_model(experiment, N_FEATS)
        else:
            pass

        # Compile Model
        if LOSS == 'ADAM_MSE':
            model, early_stop = _compile_mse_model(experiment, model)
        else:
            pass

        # Train Model
        history = _fit_model(model, EPOCHS, train_dataset, val_dataset, [early_stop])

        # Get Error on test-set
        preds, error = _get_error(test_dataset)
        experiment.log_other("{} test-error: {}".format(cluster_id, error))
        print("Final Error: {}".format(error))

        # Plot Features importances
        _feats_importances(test_dataset, FEATS, sequence_len=experiment.get_parameter("sequence_len"))

        # Show Prediction on test-set
        _predict(preds, test_df, LABEL_COL)

        # End experiment
        experiment.end()
    

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

COMET INFO: Optimizer metrics is 'val_loss' but no logged values found. Experiment ignored in sweep.
COMET INFO: Experiment is live on comet.com https://www.comet.com/alessioanghileri-noovle/arpae-tsd-trend-finetuning/fb51be11c22544a4921ed991b6c7f9ea



Running Experiment LSTM_Attention_v1-LossADAM_MSE-Cluster0-Exp1:

Getting data..
Rows: 93488

Preparing data..


  0%|          | 0/16 [00:00<?, ?it/s]


Creating dataset..
Train dataset: 78007, Val dataset: 7906, Test dataset: 7562

Defining & Training model..
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 14, 58)]     0           []                               
                                                                                                  
 lstm_2 (LSTM)                  (None, 14, 128)      95744       ['input_3[0][0]']                
                                                                                                  
 lstm_3 (LSTM)                  (None, 14, 256)      394240      ['lstm_2[0][0]']                 
                                                                                                  
 multi_head_attention_1 (MultiH  (None, 14, 256)     526080      ['lstm_3[0][0]', 

COMET INFO: Ignoring automatic log_parameter('verbose') because 'keras:verbose' is in COMET_LOGGING_PARAMETERS_IGNORE


Epoch 1/50
Epoch 2/50
Epoch 3/50

KeyboardInterrupt: 