# Model Training - Air Quality Prediction
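This notebook trains a classical machine-learning model (Random Forest) and a deep-learning model (LSTM) to predict PM2.5 air-quality levels, and evaluates each with RMSE, MAE and R².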

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.utils import set_random_seed


In [None]:

# Read the air-quality CSV; the path is relative, so the dataset/ folder must
# sit next to the notebook's working directory
df = pd.read_csv(filepath_or_buffer="dataset/air_quality.csv")

# Report (rows, columns), then preview the first five rows as the cell output
print("Shape of dataset:", (len(df.index), len(df.columns)))
df.head(n=5)


In [None]:

# The column to predict is PM2.5; every other column of df is used as a predictor
target_col = "PM2.5"
target = df[target_col]
features = df.drop(target_col, axis="columns")
# NOTE(review): all non-target columns are passed through untouched — confirm
# they are numeric and free of missing values before they reach the model

# Random 70/30 hold-out split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)


In [None]:

# Fit a seeded 100-tree Random Forest regressor on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Test-set metrics; RMSE is the square root of the mean squared error
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("Random Forest Results:")
for label, value in (("RMSE:", rmse_rf), ("MAE:", mae_rf), ("R²:", r2_rf)):
    print(label, value)

# Scatter of actual vs predicted values for the test rows
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred_rf, alpha=0.5)
ax.set_xlabel("Actual PM2.5")
ax.set_ylabel("Predicted PM2.5")
ax.set_title("Random Forest - Actual vs Predicted")
plt.show()


In [None]:

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation is repeatable under Restart & Run All, matching the seeded
# Random Forest (random_state=42). Note this also resets the global NumPy RNG.
# Some GPU kernels may still introduce small run-to-run differences.
set_random_seed(42)

# Prepare data for LSTM: a univariate series built from the target column only.
# The double-bracket selection keeps the array 2-D with a single column.
series = df[[target_col]].values
# NOTE(review): assumes rows are in chronological order and contain no missing
# values — confirm against the dataset

# Train-test split for LSTM: first 70% of the rows for training, the rest for
# testing (positional split, no shuffling)
train_size = int(len(series) * 0.7)
train, test = series[:train_size], series[train_size:]

# Timeseries generator: each sample is a window of `win_size` consecutive
# values and its label is the value immediately following that window
win_size = 10
train_gen = TimeseriesGenerator(train, train, length=win_size, batch_size=32)
test_gen = TimeseriesGenerator(test, test, length=win_size, batch_size=32)

# Build LSTM model: one 50-unit LSTM layer feeding a single-output dense layer
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(win_size, 1)),
    Dense(1)
])

lstm_model.compile(optimizer='adam', loss='mse')
# NOTE(review): the test windows double as validation data here, so val_loss is
# not an independent estimate; it is used for monitoring only
history = lstm_model.fit(train_gen, epochs=10, validation_data=test_gen, verbose=1)

# Predictions for every window in the test generator
y_pred_lstm = lstm_model.predict(test_gen)

# Align test values: the first `win_size` points have no full window before
# them, so they have no corresponding prediction
y_test_lstm = test[win_size:]

# Metrics
rmse_lstm = np.sqrt(mean_squared_error(y_test_lstm, y_pred_lstm))
mae_lstm = mean_absolute_error(y_test_lstm, y_pred_lstm)
r2_lstm = r2_score(y_test_lstm, y_pred_lstm)

print("LSTM Results:")
print("RMSE:", rmse_lstm)
print("MAE:", mae_lstm)
print("R²:", r2_lstm)

# Plot Actual vs Predicted for LSTM over the test period
plt.figure(figsize=(8,5))
plt.plot(y_test_lstm, label="Actual")
plt.plot(y_pred_lstm, label="Predicted")
plt.legend()
plt.title("LSTM - Actual vs Predicted")
plt.show()
