In [2]:
# pip install pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: C:\Users\thhnp\Thesis PM25 Prediction\tfvenv\Scripts\python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import r2_score, mean_squared_error

In [6]:
# --- 1. CONFIGURATION ---
FILE_PATH = './datasets/ratnapark_pm25_after_imputation.csv'
TARGET_COL = 'PM2.5'
PRED_WINDOW = 24  # Predict the peak value in the next 24 hours
LOOKBACK_WINDOW = 48 # Number of previous hours used as input sequence
METEO_FEATURES = ['PS', 'WS2M', 'RH2M', 'T2M'] 
# Features to be included in the GRU sequence input
GRU_SEQUENCE_FEATURES = [TARGET_COL] + METEO_FEATURES 

# --- 2. DATA LOADING AND TARGET ENGINEERING ---
print("1. Loading Data and Engineering Target...")
df = pd.read_csv(FILE_PATH, index_col=0)
# df['Datetime'] = pd.to_datetime(df[['YEAR', 'MO', 'DY', 'HR']])
df['date'] = pd.to_datetime(df.rename(columns={'YEAR': 'year', 'MO': 'month', 'DY': 'day'})[['year', 'month', 'day']])
df = df.set_index('date').sort_index()
df = df[GRU_SEQUENCE_FEATURES].copy()

1. Loading Data and Engineering Target...


In [7]:
# Create the Regression Target: Max PM2.5 in the next 24 hours
df['Peak_Value_Target'] = np.nan
for i in range(len(df) - PRED_WINDOW):
    # Target is the max value in the window [t+1, t+24]
    df.loc[df.index[i], 'Peak_Value_Target'] = df[TARGET_COL].iloc[i+1 : i+1+PRED_WINDOW].max()

df = df.dropna()

In [8]:
# --- 3. SCALING AND SEQUENCE CREATION ---

# Input Features (X) and Target (Y)
X_data = df[GRU_SEQUENCE_FEATURES].values
Y_data = df['Peak_Value_Target'].values

# Scale the input features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_data)

# Function to create time series sequences for GRU
def create_sequences(X, y, time_steps):
    X_seq, Y_seq = [], []
    for i in range(len(X) - time_steps):
        # Sequence input: t to t + time_steps - 1
        X_seq.append(X[i:i + time_steps])
        # Target: value at time t + time_steps (which corresponds to the peak prediction)
        Y_seq.append(y[i + time_steps - 1]) 
    return np.array(X_seq), np.array(Y_seq)

X_seq, Y_seq = create_sequences(X_scaled, Y_data, LOOKBACK_WINDOW)


In [9]:

# --- 4. TRAIN/TEST SPLIT (Time-Series Split) ---
split_point = int(len(X_seq) * 0.8)

X_train = X_seq[:split_point]
Y_train = Y_seq[:split_point]
X_test = X_seq[split_point:]
Y_test = Y_seq[split_point:]

print(f"2. Data Shapes:")
print(f"   Input Sequence Shape: {X_train.shape} (Samples, Timesteps, Features)")

2. Data Shapes:
   Input Sequence Shape: (20985, 48, 5) (Samples, Timesteps, Features)


In [16]:
# --- 5. GRU MODEL DEFINITION AND TRAINING ---
print("3. Defining and Training GRU Model...")

# Input shape: (LOOKBACK_WINDOW, num_features)
input_shape = (X_train.shape[1], X_train.shape[2]) 

model = Sequential([
    # GRU Layer 1: Returns sequences to feed into the next GRU layer
    GRU(units=128, return_sequences=True, input_shape=input_shape),
    Dropout(0.3),
    # GRU Layer 2: Returns only the final output (sequence context vector)
    GRU(units=64, return_sequences=False),
    Dropout(0.3),
    # Dense output layer for single value regression
    Dense(units=1, activation='linear')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Setup early stopping to prevent overfitting
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train, Y_train,
    epochs=20, 
    batch_size=32, 
    validation_split=0.1, # 10% of training data for validation
    callbacks=[es], 
    verbose=1
)

3. Defining and Training GRU Model...
Epoch 1/20


  super().__init__(**kwargs)


[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 50ms/step - loss: 3762.1760 - val_loss: 5105.8428
Epoch 2/20
[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 58ms/step - loss: 2261.3784 - val_loss: 3299.4121
Epoch 3/20
[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 49ms/step - loss: 2011.8490 - val_loss: 2785.3586
Epoch 4/20
[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 49ms/step - loss: 1994.0665 - val_loss: 2719.0215
Epoch 5/20
[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 50ms/step - loss: 1947.6743 - val_loss: 2441.3892
Epoch 6/20
[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 49ms/step - loss: 1144.9845 - val_loss: 1117.8857
Epoch 7/20
[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 51ms/step - loss: 680.0908 - val_loss: 629.4241
Epoch 8/20
[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 54ms/step - loss: 521.1373 - val_loss: 

In [17]:
# --- 6. PERFORMANCE EVALUATION ---
print("\n4. Evaluating Model Performance...")

# Generate predictions
Y_pred = model.predict(X_test).flatten()


4. Evaluating Model Performance...
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step


In [22]:
Y_pred

array([ 28.049723,  27.9947  ,  27.995798, ..., 117.659546, 125.06389 ,
       130.63939 ], shape=(5247,), dtype=float32)

In [24]:
# Calculate Metrics
r2 = r2_score(Y_test, Y_pred)
# rmse = mean_squared_error(Y_test, Y_pred, squared=False)
mae = np.mean(np.abs(Y_test - Y_pred)) # Mean Absolute Error

print("\n--- GRU Regression Metrics (Peak PM2.5) ---")
print(f"R-squared (R2) Score: {r2:.4f}")
# print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

# Example: Display the first 10 predictions vs actual
comparison_df = pd.DataFrame({
    'Actual Peak PM2.5': Y_test[:10],
    'Predicted Peak PM2.5': Y_pred[:10]
})
print("\nFirst 10 Actual vs. Predicted Values:")
print(comparison_df.round(2))


--- GRU Regression Metrics (Peak PM2.5) ---
R-squared (R2) Score: 0.9016
Mean Absolute Error (MAE): 9.12

First 10 Actual vs. Predicted Values:
   Actual Peak PM2.5  Predicted Peak PM2.5
0              20.29             28.049999
1              20.29             27.990000
2              20.29             28.000000
3              20.29             28.700001
4              20.29             28.900000
5              20.29             28.490000
6              20.29             28.180000
7              20.29             28.180000
8              20.29             28.430000
9              20.29             28.940001


In [25]:
Y_test

array([ 20.29166667,  20.29166667,  20.29166667, ..., 126.0400003 ,
       126.0400003 , 126.0400003 ], shape=(5247,))

In [26]:
Y_pred

array([ 28.049723,  27.9947  ,  27.995798, ..., 117.659546, 125.06389 ,
       130.63939 ], shape=(5247,), dtype=float32)