# Import and load

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import missingno as msno
import plotly.express as px

from tqdm import tqdm

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

from keras.callbacks import EarlyStopping

from keras.layers import Dense, LSTM, Dropout
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler





In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Feature engineering + Lag

In [3]:
def create_lag(df, num_lags):
    df['year'] = pd.to_datetime(df['Date']).dt.year
    df['month'] = pd.to_datetime(df['Date']).dt.month
    df['day'] = pd.to_datetime(df['Date']).dt.day
    df['Date'] = pd.to_datetime(df['Date'])    
    df.set_index('Date', inplace=True)
    df.sort_values(by=['ID', 'Date'], inplace=True)
    for i in range(1, num_lags+1):
        df[f'lag{i}'] = df.groupby('ID')['GT_NO2'].shift(i)

    return df

def full_df(df):
    df = df.copy().reset_index()
    df = df.drop(['ID', 'ID_Zindi', 'Date', 'NO2_trop'],axis=1)
    return df

In [4]:
trained = create_lag(train, 15)
trained = full_df(trained)

  df['year'] = pd.to_datetime(df['Date']).dt.year
  df['month'] = pd.to_datetime(df['Date']).dt.month
  df['day'] = pd.to_datetime(df['Date']).dt.day
  df['Date'] = pd.to_datetime(df['Date'])


# Model training + validation

* Min RMSE

In [5]:
rts = TimeSeriesSplit(max_train_size=5000)

sub_set = trained.dropna()

x = sub_set.drop('GT_NO2', axis=1)
y = sub_set['GT_NO2']

In [6]:
seq_length = 15

def create_sequences(data, labels, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x_seq = data[i:i+seq_length]
        y_seq = labels.iloc[i+seq_length]
        xs.append(x_seq)
        ys.append(y_seq)
    return np.array(xs), np.array(ys)

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(x)
X, y = create_sequences(scaled_features, y, seq_length)

In [7]:
def my_model(input_shape):
    model = Sequential()
    model.add(LSTM(128, activation='relu', input_shape=input_shape, return_sequences=True))
    model.add(Dropout(0.2))  # Dropout layer added
    model.add(LSTM(64, activation='relu', return_sequences=False))
    model.add(Dropout(0.2))  # Dropout layer added
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model

input_shape = (seq_length, X.shape[2])
model = my_model(input_shape)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 15, 128)           79872     
                                                                 
 dropout (Dropout)           (None, 15, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 1)                

In [8]:
# Initialize TimeSeriesSplit with max_train_size
tscv = TimeSeriesSplit(max_train_size=5000)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training and evaluation loop
for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = my_model(input_shape)
    
    # Train the model with early stopping and validation data
    history = model.fit(
        X_train, y_train,
        epochs=200,
        batch_size=32,
        verbose=1,
        validation_split=0.1,  # Use 10% of training data for validation
        callbacks=[early_stopping]
    )
    
    # Print training loss and validation loss from the last epoch
    train_loss = history.history['loss'][-1]
    val_loss = history.history['val_loss'][-1]
    print(f'Fold {fold} - Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    # Evaluate the model on the test set
    y_pred = model.predict(X_test)
    test_loss = mean_squared_error(y_test, y_pred)
    print(f'Fold {fold} - Test Loss (MSE): {test_loss:.4f}')

Epoch 1/200

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Fold 1 - Training Loss: 75.4237, Validation Loss: 95.4392
Fold 1 - Test Loss (MSE): 110.9634
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Fold 2 - Train

In [16]:
len(history.history['loss'])


21