## Long Short Term Memory on Weather Station Data

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import sqlalchemy as sq
import sys
import os
import pickle
from imblearn.combine import SMOTEENN
from imblearn.ensemble import (  # type: ignore
    RUSBoostClassifier,
)

from sklearn.metrics import (  # type: ignore
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

sys.path.append("../../")
os.chdir("../../")
from ModelBuilderMethods import getConn, extractYears

2023-08-07 15:08:29.068053: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Display settings: never truncate cell text and show up to 500 rows per frame.
pd.set_option("display.max_colwidth", None, "display.max_rows", 500)

### <u>**Step 1**</u>: Data Selection

In this step, we choose the data/tables to use and pick the attributes we need from them. Further aggregation or feature engineering can also be done here to support the goal of the research.

In particular, for this notebook, we load the following data and merge them (on year and district) into a single table:
- Monthly weather station
- ergot data (downgrade)

In [3]:
# Set the query text
# Weather-station features: every column of dataset_cross_monthly_station.
weatherStationQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_station
"""
)

# Ergot target: the downgrade label plus the year and district columns,
# which are the keys the two tables are merged on further down.
ergotTargetQuery = sq.text(
    """
    SELECT year, district, downgrade from ergot_sample_feat_eng
"""
)

In [4]:
# Load both tables over a single connection. The try/finally guarantees the
# connection is closed (and the name released) even when one of the reads fails.
conn = getConn()
try:
    stationDf = pd.read_sql(weatherStationQuery, conn)
    ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)
finally:
    conn.close()
    del conn

In [5]:
# Left-join the weather-station columns onto the ergot samples by (year, district).
# how="left" keeps every ergot row, including those without a matching station row.
datasetDf = ergotTargetDf.merge(stationDf, how="left", on=["year", "district"])
del ergotTargetDf

In [6]:
# One-hot encode district (drop_first removes the first category's column),
# then replace the original column with the indicator columns.
districtDummies = pd.get_dummies(
    datasetDf["district"].astype("category"), prefix="district", drop_first=True
)
datasetDf = pd.concat([datasetDf.drop(columns=["district"]), districtDummies], axis=1)

del districtDummies

### <u>**Step 2**</u>: Splitting dataset

- We split the whole dataset into train and test sets. Because this is time-series data, we split by year (1995 - 2015 for training, 2016 - 2020 for testing).

In [7]:
# Chronological split: 1995 - 2015 for training, 2016 - 2020 for testing.
trainDf, testDf = (
    extractYears(datasetDf, firstYear, lastYear)
    for firstYear, lastYear in ((1995, 2015), (2016, 2020))
)
del datasetDf

In [8]:
# The split is done, so remove the year column from both partitions.
trainDf, testDf = (frame.drop(columns=["year"]) for frame in (trainDf, testDf))

### <u>**Step 3**</u>: [Balancing the dataset](https://imbalanced-learn.org/stable/)

- Our dataset is unbalanced, which can lead to bias when training/testing. The balancing step helps reduce this bias and thus provides more reliable results.

In [9]:
# Pre-balancing check: class counts of downgrade, training set first, then test set.
print(
    trainDf["downgrade"].value_counts(),
    testDf["downgrade"].value_counts(),
    sep="\n",
)

downgrade
False    122202
True       2082
Name: count, dtype: int64
downgrade
False    26307
True      1016
Name: count, dtype: int64


In [10]:
# Report the number of missing values per column ...
print(trainDf.isnull().sum())
# ... then replace every missing value in the training set with 0.
trainDf = trainDf.fillna(value=0)

downgrade                    0
1:min_temp_x              1246
1:max_temp_x              1246
1:mean_temp_x             1246
1:min_dew_point_temp      1246
1:max_dew_point_temp      1246
1:mean_dew_point_temp     1246
1:min_humidex             1246
1:max_humidex             1246
1:mean_humidex            1246
1:min_precip              1246
1:max_precip              1246
1:mean_precip             1246
1:min_rel_humid           1246
1:max_rel_humid           1246
1:mean_rel_humid          1246
1:min_stn_press           1246
1:max_stn_press           1246
1:mean_stn_press          1246
1:min_visibility          1246
1:max_visibility          1246
1:mean_visibility         1246
1:max_temp_y              1246
1:min_temp_y              1246
1:mean_temp_y             1246
1:min_total_rain          1246
1:max_total_rain          1246
1:mean_total_rain         1246
1:min_total_snow          1246
1:max_total_snow          1246
1:mean_total_snow         1246
1:min_total_precip        1246
1:max_to

In [11]:
# Resample the training rows with imblearn's SMOTEENN, seeded for repeatability.
# Features are every column except downgrade; downgrade itself is the target.
balancer = SMOTEENN(sampling_strategy=1, random_state=42)
balancedTrainDfX, balancedTrainDfY = balancer.fit_resample(
    trainDf.drop(columns="downgrade"), trainDf["downgrade"]
)

In [12]:
# Post-balancing check: class counts of the resampled downgrade target.
print(balancedTrainDfY.value_counts())

downgrade
False    115179
True      23757
Name: count, dtype: int64


### <u>**Step 4**</u>: Regularization / Normalization
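Candidate scikit-learn scalers/transformers for putting the features on a comparable scale (TODO: choose one and describe the trade-offs):  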

1. [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)             
2. [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html)  
3. [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)  
4. [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)  
5. [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html)  
6. [PowerTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html)  
7. [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html)  

### <u>**Step 5**</u>: Long Short Term Memory Model

##### <u>**Step 5.0**</u>: Create input-output pair

In [13]:
def create_io_pair(
    X_train: np.ndarray, Y_train: np.ndarray, k=1
) -> "tuple[np.ndarray, np.ndarray]":
    """
    Slice every sequence in X_train into overlapping windows of k consecutive steps.

    A sequence of length n yields n - k + 1 windows (stride 1); a sequence shorter
    than k yields none. Each window is paired with the label of the sequence it
    was cut from.

    X_train: sequences, indexed in parallel with Y_train
    Y_train: one label per sequence in X_train
    k: time step (window length), must be >= 1
    return: (input, output) pairs from given data, as two arrays of equal length
    raises: ValueError when k is smaller than 1
    """
    # A non-positive k would silently produce empty or inconsistently sized windows.
    if k < 1:
        raise ValueError(f"k must be a positive window length, got {k}")

    windows = []
    windows_y = []

    for i, sequence in enumerate(X_train):
        label = Y_train[i]
        # Number of stride-1 windows that fit; range() treats a negative count as empty.
        n_windows = len(sequence) - k + 1
        for window_start in range(n_windows):
            windows.append(sequence[window_start : window_start + k])
            windows_y.append(label)
    return (np.array(windows), np.array(windows_y))

In [14]:
# Build the model inputs from the SMOTEENN-resampled training set, so the
# balancing step actually feeds the model.
# The explicit float32 cast keeps non-float columns (e.g. the district indicator
# columns) from turning the whole array into an object-dtype array.
X_train = np.array(balancedTrainDfX, dtype="float32")
Y_train = np.array(balancedTrainDfY, dtype="float32")

In [None]:
# Append a trailing axis of size 1: (rows, columns) -> (rows, columns, 1).
X_train = X_train.reshape(*X_train.shape[:2], 1)

In [15]:
# Cut every training sample into stride-1 windows of 100 steps; each window
# carries the label of the sample it came from.
# NOTE(review): each sample yields (its length - 99) windows, all held in memory
# at once — confirm this fits for the full training set.
x_train, y_train = create_io_pair(X_train, Y_train, k=100)

##### <u>**Step 5.1**</u>: Initialize the model

In [None]:
def LSTM_model(
    n_input,
    n_output,
    units=50,
    dropout_rate=0.2,
    optimizer="adam",
    loss="mean_absolute_error",
    output_activation=None,
    n_features=1,
):
    """
    Build and compile a stack of four LSTM layers, each followed by Dropout,
    ending in a Dense output layer.

    n_input: number of time steps in one input window
    n_output: number of units in the Dense output layer
    units: number of units in every LSTM layer
    dropout_rate: rate of the Dropout layer placed after each LSTM layer
    optimizer: optimizer handed to compile
    loss: loss handed to compile; defaults to mean absolute error.
        NOTE(review): the target in this notebook is the boolean downgrade
        column, so loss="binary_crossentropy" together with
        output_activation="sigmoid" is probably the intended setup — confirm.
    output_activation: activation of the Dense output layer (None applies no activation)
    n_features: number of features per time step
    return: the compiled Sequential model
    """
    # using sequential to build LSTM model
    model = Sequential()

    # First LSTM layer declares the input shape; it returns the full sequence
    # so the next LSTM layer receives one vector per time step.
    model.add(
        LSTM(units=units, return_sequences=True, input_shape=(n_input, n_features))
    )
    model.add(Dropout(dropout_rate))

    # Second and third LSTM layers, again returning full sequences.
    for _ in range(2):
        model.add(LSTM(units=units, return_sequences=True))
        model.add(Dropout(dropout_rate))

    # Fourth LSTM layer returns only its last output, which feeds the Dense layer.
    model.add(LSTM(units=units))
    model.add(Dropout(dropout_rate))

    # Adding the output layer
    model.add(Dense(units=n_output, activation=output_activation))

    # Compiling the RNN
    model.compile(optimizer=optimizer, loss=loss)

    return model

##### <u>**Step 5.2**</u>: Fit the training data to the model

In [None]:
# Build the network: one input step per window position and a single output unit,
# because y_train holds one label per window.
# The dropout keyword must be dropout_rate to match LSTM_model's signature.
model = LSTM_model(x_train.shape[1], 1, units=50, dropout_rate=0.2, optimizer="adam")

##### <u>**Step 5.3**</u>: Test the model on the testing dataset