In [7]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): the captured output of this cell reports that pip found no
# tensorflow distribution ("from versions: none"), so the import cell fails.
# Presumably this interpreter's Python version/architecture has no published
# TensorFlow wheel — confirm and switch to a supported interpreter.
%pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras keras-tuner


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for tensorflow


In [6]:
# -----------------------------------------------------------
# 2. Imports
# -----------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score, roc_curve, 
                             precision_recall_fscore_support, accuracy_score)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from keras.utils import to_categorical

import keras_tuner as kt
import os

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# -----------------------------------------------------------
# 3. Data Loading & EDA
# -----------------------------------------------------------
# Read the dataset from the working directory and preview the first rows.
DATA_PATH = 'balanced_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Count missing values per column, then count rows per target class ('City').
missing_per_column = df.isna().sum()
print(missing_per_column)
city_counts = df['City'].value_counts()
print(city_counts)

## Data Preprocessing

- Encode target labels
- Scale features
- Reshape for LSTM (samples, timesteps, features)


In [None]:
# Map the 'City' strings to integer class ids and keep them as an extra column.
le = LabelEncoder()
df['City_enc'] = le.fit_transform(df['City'])
num_classes = df['City_enc'].nunique()

# Every column except the raw and encoded target is used as a feature.
X = df.drop(columns=['City', 'City_enc']).values
y = df['City_enc'].values

# Standardise the features (fit and transform on the same matrix).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# The LSTM layer takes 3D input (samples, timesteps, features); insert a
# timestep axis of length 1 so each row becomes a one-step sequence.
X_lstm = np.expand_dims(X_scaled, axis=1)

# Keras' categorical cross-entropy expects one-hot targets.
y_cat = to_categorical(y, num_classes=num_classes)

In [None]:
# -----------------------------------------------------------
# 4. Model Building Function
# -----------------------------------------------------------
def build_lstm_model(hp):
    """Build and compile a single-layer LSTM classifier for KerasTuner.

    The input shape is read from the notebook-level ``X_lstm`` array and the
    output width from ``num_classes``; both must exist before this is called.

    Hyperparameters registered on ``hp``:
        units   -- LSTM width, 32..128 in steps of 32
        l2      -- L2 kernel regularisation factor (0.0, 1e-4 or 1e-3)
        dropout -- dropout rate after the LSTM, 0.2..0.5 in steps of 0.1
        lr      -- Adam learning rate (1e-2, 1e-3 or 1e-4)

    Args:
        hp: a ``keras_tuner.HyperParameters`` instance (either the tuner's
            live search space or a fixed set of best values).

    Returns:
        A compiled ``keras.Sequential`` model with a softmax output, using
        categorical cross-entropy loss and an accuracy metric.
    """
    model = keras.Sequential()
    # Declare the input explicitly instead of passing `input_shape` to the
    # first layer; the layer-argument form is deprecated in recent Keras.
    model.add(keras.Input(shape=(X_lstm.shape[1], X_lstm.shape[2])))
    # LSTM layer (returns only the last step's output)
    model.add(layers.LSTM(
        units=hp.Int('units', min_value=32, max_value=128, step=32),
        return_sequences=False,
        kernel_regularizer=regularizers.l2(hp.Choice('l2', [0.0, 1e-4, 1e-3]))
    ))
    # Dropout
    model.add(layers.Dropout(hp.Float('dropout', 0.2, 0.5, step=0.1)))
    # Dense output: one probability per class
    model.add(layers.Dense(num_classes, activation='softmax'))
    # Compile
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=hp.Choice('lr', [1e-2, 1e-3, 1e-4])),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

## Hyperparameter Tuning with Keras Tuner

We use Keras Tuner to find the best LSTM units, dropout, L2 regularization, and learning rate.

In [None]:
# -----------------------------------------------------------
# 5. Hyperparameter Tuning (on a single fold for speed)
# -----------------------------------------------------------
# Random search over the space declared in build_lstm_model, ranked by
# validation accuracy. `seed` makes the sampled trials reproducible.
tuner = kt.RandomSearch(
    build_lstm_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
    seed=42,
    directory='kt_dir',
    project_name='lstm_aqi'
)

# Tune on a single stratified 80/20 fold. Keras' `validation_split` takes the
# LAST fraction of the rows without shuffling, so if the CSV is ordered by
# City the validation set would contain only some classes and 'val_accuracy'
# would be meaningless. A stratified fold keeps every class in both parts.
tune_train_idx, tune_val_idx = next(
    StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X_lstm, y)
)

tuner.search(
    X_lstm[tune_train_idx], y_cat[tune_train_idx],
    epochs=20,
    validation_data=(X_lstm[tune_val_idx], y_cat[tune_val_idx]),
    verbose=1,
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
)

best_hp = tuner.get_best_hyperparameters(1)[0]
print("Best hyperparameters:", best_hp.values)

## Stratified K-Fold Cross-Validation

We use the best hyperparameters found above for all folds.

In [None]:
# -----------------------------------------------------------
# 6. K-Fold Cross-Validation
# -----------------------------------------------------------
# Stratified 5-fold CV: each fold trains a fresh model built from the tuned
# hyperparameters and is scored on its held-out split.
# NOTE(review): X_lstm was standardised with a scaler fitted on all rows, so
# the validation folds share scaling statistics with the training folds.
# Consider fitting the scaler per fold for a strictly leakage-free estimate.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

histories = []
fold_metrics = []
best_val_acc = 0
best_model = None

# HyperParameters.get() takes only a name (no default) and raises for names
# that were never registered; build_lstm_model registers no 'batch_size'.
# Read the plain values dict instead so the fallback of 32 actually applies.
batch_size = best_hp.values.get('batch_size', 32)

# Fix the label set so per-class metrics and the confusion matrix always have
# one entry per class, even if a class is absent from a fold's predictions.
class_labels = np.arange(num_classes)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_lstm, y)):
    print(f"\n--- Fold {fold+1} ---")
    X_train, X_val = X_lstm[train_idx], X_lstm[val_idx]
    y_train, y_val = y_cat[train_idx], y_cat[val_idx]
    y_val_labels = y[val_idx]

    # Build model with best hyperparameters
    model = build_lstm_model(best_hp)

    # Early stopping on validation loss, keeping the best epoch's weights
    es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=batch_size,
        callbacks=[es],
        verbose=1
    )
    histories.append(history)

    # Predict class probabilities and the arg-max class
    y_pred_prob = model.predict(X_val)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Metrics
    acc = accuracy_score(y_val_labels, y_pred)
    pr, rc, f1, _ = precision_recall_fscore_support(
        y_val_labels, y_pred, labels=class_labels, average=None, zero_division=0
    )
    cm = confusion_matrix(y_val_labels, y_pred, labels=class_labels)
    try:
        auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
    except ValueError as exc:
        # Best-effort: AUC is undefined for some label configurations. Report
        # why instead of silently swallowing every exception.
        print(f"AUC unavailable for fold {fold+1}: {exc}")
        auc = None

    fold_metrics.append({
        'accuracy': acc,
        'precision': pr,
        'recall': rc,
        'f1': f1,
        'auc': auc,
        'cm': cm,
        'y_val': y_val_labels,
        'y_pred': y_pred,
        'y_pred_prob': y_pred_prob
    })

    # Save best model. Rank folds by the accuracy of the model as it will be
    # saved; the maximum 'val_accuracy' over epochs may belong to an epoch
    # whose weights are not the ones the model ends up with.
    val_acc = acc
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = model
        model.save('best_lstm_model.h5')

## Results Visualization

For each fold we plot the training/validation loss and accuracy curves, the confusion matrix, and the per-class (one-vs-rest) ROC curves.

In [None]:
# -----------------------------------------------------------
# 7. Visualization & Aggregation
# -----------------------------------------------------------
# One set of figures per fold: learning curves, confusion matrix, and
# per-class ROC curves. Each plot gets its own explicit figure/axes so that
# nothing is drawn onto a previous figure on backends where plt.show() does
# not close it.
for i, (history, metrics) in enumerate(zip(histories, fold_metrics)):
    curves = history.history

    # Loss & Accuracy
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))
    ax_loss.plot(curves['loss'], label='Train Loss')
    ax_loss.plot(curves['val_loss'], label='Val Loss')
    ax_loss.set_title(f'Fold {i+1} - Loss')
    ax_loss.set_xlabel('Epoch')
    ax_loss.legend()
    ax_acc.plot(curves['accuracy'], label='Train Acc')
    ax_acc.plot(curves['val_accuracy'], label='Val Acc')
    ax_acc.set_title(f'Fold {i+1} - Accuracy')
    ax_acc.set_xlabel('Epoch')
    ax_acc.legend()
    plt.show()

    # Confusion Matrix
    fig, ax_cm = plt.subplots(figsize=(6, 5))
    sns.heatmap(metrics['cm'], annot=True, fmt='d', cmap='Blues',
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax_cm)
    ax_cm.set_title(f'Fold {i+1} - Confusion Matrix')
    ax_cm.set_xlabel('Predicted')
    ax_cm.set_ylabel('True')
    plt.show()

    # ROC curves, one per class (one-vs-rest); skipped when AUC was unavailable
    if metrics['auc'] is not None:
        fig, ax_roc = plt.subplots(figsize=(6, 5))
        for j in range(num_classes):
            fpr, tpr, _ = roc_curve(metrics['y_val'] == j, metrics['y_pred_prob'][:, j])
            ax_roc.plot(fpr, tpr, label=f'Class {le.classes_[j]}')
        # Chance diagonal for reference
        ax_roc.plot([0, 1], [0, 1], 'k--')
        ax_roc.set_title(f'Fold {i+1} - ROC Curve')
        ax_roc.set_xlabel('FPR')
        ax_roc.set_ylabel('TPR')
        ax_roc.legend()
        plt.show()

In [None]:
# Summarise cross-validation: collect each fold's accuracy, the unweighted mean
# of its per-class F1 scores, and its AUC (folds without an AUC are left out).
accs, f1s, aucs = [], [], []
for fold_result in fold_metrics:
    accs.append(fold_result['accuracy'])
    f1s.append(np.mean(fold_result['f1']))
    if fold_result['auc'] is not None:
        aucs.append(fold_result['auc'])

# Report mean ± standard deviation across folds.
acc_mean, acc_std = np.mean(accs), np.std(accs)
print(f"Average Accuracy: {acc_mean:.4f} ± {acc_std:.4f}")
f1_mean, f1_std = np.mean(f1s), np.std(f1s)
print(f"Average F1-score: {f1_mean:.4f} ± {f1_std:.4f}")
if len(aucs) > 0:
    auc_mean, auc_std = np.mean(aucs), np.std(aucs)
    print(f"Average AUC: {auc_mean:.4f} ± {auc_std:.4f}")