importing libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, Add
from kerastuner.tuners import RandomSearch
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

  from kerastuner.tuners import RandomSearch


load my preprocessed dataset

In [2]:
file_path = "preprocessed_dataset_china.csv"
data = pd.read_csv(file_path)


defining features and target

In [5]:
features = ['PM2.5', 'O3', 'TEMPERATURE', 'PRESSURE', 'DEWPOINT', 'RAIN']
target = 'PM2.5'


In [6]:
X = data[features].values
y = data[target].values

train test split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

reshaping input for my model

In [8]:
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))  # (samples, timesteps, features)
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

transformer hyper model

In [14]:
from tensorflow.keras.layers import Input

# Define the input layer
input_shape = (X_train.shape[1], X_train.shape[2])  # (timesteps, features)
input_layer = Input(shape=input_shape)


In [15]:
# Define Transformer Hyperparameters
head_size = 64   # Size of each attention head
num_heads = 4    # Number of attention heads
ff_dim = 128     # Hidden layer size in feed-forward network
num_layers = 2   # Number of Transformer encoder blocks
dropout = 0.2    # Dropout rate


In [16]:
# def build_transformer_model(hp):
#     input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))

#     head_size = hp.Choice('head_size', [32, 64, 128])
#     num_heads = hp.Int('num_heads', 2, 8, step=2)
#     ff_dim = hp.Choice('ff_dim', [64, 128, 256])
#     dropout = hp.Float('dropout', 0.1, 0.5, step=0.1)

multihead attention

In [17]:
    # attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(input_layer, input_layer)
    # attn_output = Dropout(dropout)(attn_output)
    # attn_output = LayerNormalization(epsilon=1e-6)(Add()([input_layer, attn_output]))

feed forward network

In [None]:
# ff_output = Dense(ff_dim, activation="relu")(attn_output)  
# ff_output = Dropout(dropout)(ff_output)  

# ff_output = Dense(X_train.shape[2], activation="relu")(ff_output)
# ff_output = LayerNormalization(epsilon=1e-6)(Add()([attn_output, ff_output]))

output layer

In [None]:
# output_layer = Dense(1, activation="linear")(ff_output)

# model = Model(inputs=input_layer, outputs=output_layer)
# model.compile(optimizer='adam', loss='mse', metrics=['mae'])


In [23]:
# Define Transformer HyperModel for Tuning
def build_transformer_model(hp):
    input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))

    head_size = hp.Choice('head_size', [32, 64, 128])
    num_heads = hp.Int('num_heads', 2, 8, step=2)
    ff_dim = hp.Choice('ff_dim', [64, 128, 256])
    dropout = hp.Float('dropout', 0.1, 0.5, step=0.1)

    # Multi-head attention
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(input_layer, input_layer)
    attn_output = Dropout(dropout)(attn_output)
    attn_output = LayerNormalization(epsilon=1e-6)(Add()([input_layer, attn_output]))

    # Feedforward network
    ff_output = Dense(ff_dim, activation="relu")(attn_output)
    ff_output = Dropout(dropout)(ff_output)
    ff_output = Dense(X_train.shape[2], activation="relu")(ff_output)
    ff_output = LayerNormalization(epsilon=1e-6)(Add()([attn_output, ff_output]))

    # Output layer
    output_layer = Dense(1, activation="linear")(ff_output)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    return model

Keras tuner

In [24]:
tuner = RandomSearch(
    build_transformer_model,
    objective="val_loss",
    max_trials=5,
    executions_per_trial=1,
    directory="tuner_results",
    project_name="transformer_tuning"
)

run hyper parameter tuning

In [25]:
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, batch_size=8)

Trial 5 Complete [00h 05m 11s]
val_loss: 0.0073427497409284115

Best val_loss So Far: 0.007340245880186558
Total elapsed time: 00h 27m 09s


getting best hyper parmeters

In [26]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters: head_size={best_hps.get('head_size')}, num_heads={best_hps.get('num_heads')}, ff_dim={best_hps.get('ff_dim')}, dropout={best_hps.get('dropout')}")

Best Hyperparameters: head_size=128, num_heads=6, ff_dim=256, dropout=0.4


train best model

In [27]:
best_model = tuner.hypermodel.build(best_hps)
best_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.3301 - mae: 0.2893 - val_loss: 0.0076 - val_mae: 0.0653
Epoch 2/50
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0109 - mae: 0.0774 - val_loss: 0.0077 - val_mae: 0.0595
Epoch 3/50
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0088 - mae: 0.0684 - val_loss: 0.0075 - val_mae: 0.0606
Epoch 4/50
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0081 - mae: 0.0652 - val_loss: 0.0076 - val_mae: 0.0669
Epoch 5/50
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0080 - mae: 0.0650 - val_loss: 0.0075 - val_mae: 0.0622
Epoch 6/50
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0079 - mae: 0.0647 - val_loss: 0.0076 - val_mae: 0.0659
Epoch 7/50
[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - 

<keras.src.callbacks.history.History at 0x1c886558080>

Evaluate the model

In [28]:
y_pred = best_model.predict(X_test).flatten()

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R² Score: {r2}")

[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Mean Squared Error (MSE): 0.007321222229794123
Root Mean Squared Error (RMSE): 0.08556414102761813
Mean Absolute Error (MAE): 0.06283463243641549
R² Score: 0.03229414319312118


implementing K-fold cross validation

In [29]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = []

for train_idx, val_idx in kf.split(X):
    X_train_cv, X_val_cv = X[train_idx], X[val_idx]
    y_train_cv, y_val_cv = y[train_idx], y[val_idx]

    X_train_cv = X_train_cv.reshape((X_train_cv.shape[0], 1, X_train_cv.shape[1]))
    X_val_cv = X_val_cv.reshape((X_val_cv.shape[0], 1, X_val_cv.shape[1]))

    model_cv = tuner.hypermodel.build(best_hps)
    model_cv.fit(X_train_cv, y_train_cv, epochs=20, batch_size=32, validation_data=(X_val_cv, y_val_cv))

    y_val_pred = model_cv.predict(X_val_cv).flatten()
    fold_rmse = np.sqrt(mean_squared_error(y_val_cv, y_val_pred))
    cross_val_scores.append(fold_rmse)

print(f"Cross-Validation RMSE Scores: {cross_val_scores}")
print(f"Mean CV RMSE: {np.mean(cross_val_scores)}")

Epoch 1/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0762 - mae: 0.1559 - val_loss: 0.0089 - val_mae: 0.0782
Epoch 2/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0084 - mae: 0.0664 - val_loss: 0.0076 - val_mae: 0.0641
Epoch 3/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0078 - mae: 0.0642 - val_loss: 0.0078 - val_mae: 0.0681
Epoch 4/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.0079 - mae: 0.0646 - val_loss: 0.0076 - val_mae: 0.0645
Epoch 5/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0077 - mae: 0.0637 - val_loss: 0.0079 - val_mae: 0.0694
Epoch 6/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.0078 - mae: 0.0642 - val_loss: 0.0078 - val_mae: 0.0685
Epoch 7/20
[1m842/842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - 