In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Seed both random number generators used by this notebook (NumPy and
# TensorFlow) with the same fixed value so repeated runs start from the
# same RNG state.
np.random.seed(3)
tf.random.set_seed(3)

In [None]:

# Input workbooks (one per calendar year) and the output path for the trained model.
# NOTE(review): the 2024 path was missing the "/" between the "dataset" directory
# and the file name (compare the 2025 path), so the existence check in the loader
# would fail and 2024 would be silently loaded as empty. Confirm the directory
# name against the actual data location.
file_path_2024 = r"/dataset/AQI_daily_city_level_delhi_2024_delhi_2024.xlsx"
file_path_2025 = r"/dataset/AQI_daily_city_level_delhi_2025_delhi_2025.xlsx"
model_path = "/kaggle/working/best_aqi_model.h5"

In [None]:
# Month column names expected in the workbook; list position + 1 is the month number.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']


def _empty_aqi_series():
    """Return an empty frame with the same columns/dtypes as a successful parse."""
    return pd.DataFrame({
        'Date': pd.Series(dtype='datetime64[ns]'),
        'AQI': pd.Series(dtype='float64'),
    })


def process_aqi_excel(file_path, year_data):
    """Load a day-by-month AQI sheet and flatten it into a daily time series.

    The workbook's 'AQI' sheet must contain a 'Day' column plus columns named
    after the entries of the module-level ``months`` list. Rows whose 'Day'
    value is not numeric are skipped. Day/month pairs that do not form a real
    date in ``year_data`` (e.g. 30 February) are dropped.

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook to read.
    year_data : int
        Calendar year assigned to every value in the sheet.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' and 'AQI', ordered month by month in sheet row order.
        Non-numeric AQI cells become NaN. If the file is missing, cannot be
        read, or lacks the required columns, an error is printed and an empty
        frame with the same two columns is returned, so callers can still
        sort or concatenate on 'Date'.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'. Please ensure the path is correct.")
        return _empty_aqi_series()

    try:
        df = pd.read_excel(file_path, sheet_name='AQI')
    except Exception as e:
        print(f"Error reading Excel file '{file_path}': {e}")
        return _empty_aqi_series()

    # Tolerate sheets that only carry some of the month columns instead of
    # letting melt() raise a KeyError on the missing ones.
    month_columns = [month for month in months if month in df.columns]
    if 'Day' not in df.columns or not month_columns:
        print(f"Error: Sheet 'AQI' in '{file_path}' must contain a 'Day' column "
              f"and at least one month column.")
        return _empty_aqi_series()

    # Keep only rows whose 'Day' parses as a number.
    day_numbers = pd.to_numeric(df['Day'], errors='coerce')
    has_day = day_numbers.notna()
    df_aqi = df[has_day].copy()
    df_aqi['Day'] = day_numbers[has_day].astype(int)

    # Wide (one column per month) -> long (one row per day/month pair).
    df_melted = df_aqi.melt(id_vars=['Day'], value_vars=month_columns,
                            var_name='Month', value_name='AQI')

    month_to_num = {month: i + 1 for i, month in enumerate(months)}
    df_melted['Month_Num'] = df_melted['Month'].map(month_to_num).astype(int)
    df_melted['Year'] = year_data

    # errors='coerce' turns impossible dates into NaT; they are dropped below.
    df_melted['Date'] = pd.to_datetime(
        dict(year=df_melted['Year'], month=df_melted['Month_Num'], day=df_melted['Day']),
        errors='coerce'
    )
    df_melted['AQI'] = pd.to_numeric(df_melted['AQI'], errors='coerce')

    time_series_data = (
        df_melted.dropna(subset=['Date'])[['Date', 'AQI']].reset_index(drop=True)
    )
    return time_series_data

# Load each yearly workbook into a flat (Date, AQI) frame.
print("Processing 2024 historical data...")
time_series_2024 = process_aqi_excel(file_path_2024, 2024)

print("Processing 2025 historical data...")
time_series_2025 = process_aqi_excel(file_path_2025, 2025)

# Stack both years, order chronologically and keep one row per date.
all_data = (
    pd.concat([time_series_2024, time_series_2025], ignore_index=True)
    .sort_values(by='Date')
    .drop_duplicates(subset=['Date'])
)

# AQI_Prev_Year = AQI observed on the same calendar date one year earlier
# (NaN when that date is not in the data).
# A fixed 365-day shift is not used here: 2024 is a leap year, so it would pair
# January-February 2025 with the *following* calendar day of 2024, which
# disagrees with the forecast loop's `pred_date - pd.DateOffset(years=1)` lookup.
all_data_indexed = all_data.set_index('Date')
prev_year_dates = all_data_indexed.index - pd.DateOffset(years=1)
all_data_indexed['AQI_Prev_Year'] = (
    all_data_indexed['AQI'].reindex(prev_year_dates).to_numpy()
)
all_data = all_data_indexed.reset_index()

# Rows up to and including this date are used for training; later dates are forecast.
training_end_date = pd.Timestamp('2025-03-31')
full_training_data = all_data[all_data['Date'] <= training_end_date].copy()

print(f"\nTotal data points for training (Jan 2024 - Mar 2025): {len(full_training_data)}")

# Impute gaps: carry the last known value forward, then back-fill any leading
# gap. The previous-year feature additionally falls back to 0 if the whole
# column is empty.
# The result is assigned back to the column. The former
# `df[col].fillna(method=..., inplace=True)` form mutates a temporary under
# pandas Copy-on-Write (leaving NaNs in the frame) and relies on the
# deprecated `method=` argument.
full_training_data['AQI'] = full_training_data['AQI'].ffill().bfill()
full_training_data['AQI_Prev_Year'] = (
    full_training_data['AQI_Prev_Year'].ffill().bfill().fillna(0)
)

# Scale both features to [0, 1]; column 0 is AQI, column 1 is AQI_Prev_Year.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_training_data = scaler.fit_transform(full_training_data[['AQI', 'AQI_Prev_Year']].values)

# Number of consecutive days fed to the model for each prediction.
seq_len = 60
def create_sequences(data, seq_len):
    """Cut a 2-D array into overlapping windows with next-row targets.

    For each start offset ``i`` in ``range(len(data) - seq_len)`` the window
    is rows ``i .. i + seq_len - 1`` (all columns) and the target is column 0
    of row ``i + seq_len``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array indexed as ``data[row, column]``.
    seq_len : int
        Number of rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, targets)``. Both are empty arrays when ``data`` has no
        more than ``seq_len`` rows.
    """
    window_starts = range(len(data) - seq_len)
    windows = [data[start:start + seq_len, :] for start in window_starts]
    targets = [data[start + seq_len, 0] for start in window_starts]
    return np.array(windows), np.array(targets)

# Turn the scaled series into (window, next-day AQI) training pairs.
X_train, y_train = create_sequences(scaled_training_data, seq_len)

if X_train.size == 0:
    # Raise rather than call exit(): exit() is the interactive-shell helper
    # from the `site` module and is not a dependable way to stop a notebook
    # cell or a script. An exception halts execution here with a clear message.
    raise ValueError(
        "Error: Not enough data to create sequences for the model. "
        f"Need more than {seq_len} rows, got {len(scaled_training_data)}."
    )

num_features = scaled_training_data.shape[1]
# Make the (samples, seq_len, num_features) layout passed to the LSTM explicit.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], num_features)

print("\nBuilding and Training RNN Model on the training dataset...")

# One LSTM layer (50 units) over windows of shape (seq_len, num_features),
# followed by dropout and a single linear output unit.
model = Sequential()
model.add(LSTM(50, activation='tanh', input_shape=(seq_len, num_features)))
model.add(Dropout(0.2))
model.add(Dense(1))

# Mean-squared-error regression trained with Adam.
model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))

model.fit(
    X_train,
    y_train,
    epochs=500,
    batch_size=32,
    verbose=1,
)
print("RNN Model Training Complete.")

# Persist the trained model to the configured path.
model.save(model_path)
print(f"Model saved to {model_path}")

print("\nAnalyzing Model Fit on Historical Data...")
train_predictions_scaled = model.predict(X_train, verbose=0)

# The scaler was fitted on two columns, so each single-column array is padded
# with a zero column before inverse_transform; only column 0 (AQI) is kept.
padded_predictions = np.hstack(
    (train_predictions_scaled, np.zeros_like(train_predictions_scaled))
)
train_predictions_original_scale = scaler.inverse_transform(padded_predictions)[:, 0]

y_train_column = y_train.reshape(-1, 1)
padded_targets = np.hstack((y_train_column, np.zeros_like(y_train_column)))
y_original_scale = scaler.inverse_transform(padded_targets)[:, 0]

# In-sample error, measured in original AQI units.
train_mse = mean_squared_error(y_original_scale, train_predictions_original_scale)
train_mae = mean_absolute_error(y_original_scale, train_predictions_original_scale)
print(f"Training MSE: {train_mse:.2f}")
print(f"Training MAE: {train_mae:.2f}")

print("\nGenerating Future AQI Predictions (April 2025 - Dec 2025)...")

# Forecast horizon: the day after training ends through 31 December 2025.
start_prediction_date = training_end_date + pd.Timedelta(days=1)
end_prediction_date = pd.Timestamp('2025-12-31')
prediction_dates = pd.date_range(start=start_prediction_date, end=end_prediction_date, freq='D')

all_data_indexed = all_data.set_index('Date')

# Start the rolling window from the last seq_len scaled training rows.
last_sequence_features = scaled_training_data[-seq_len:]
current_input = last_sequence_features.reshape((1, seq_len, num_features))

predicted_aqi_scaled = []
for pred_date in prediction_dates:
    # One-step-ahead prediction from the current window.
    next_day_pred_scaled = model.predict(current_input, verbose=0)[0, 0]
    predicted_aqi_scaled.append(next_day_pred_scaled)

    # Previous-year feature: AQI on the same calendar date one year earlier,
    # or 0 when that date is absent or its AQI is NaN.
    prev_year_date = pred_date - pd.DateOffset(years=1)
    prev_year_aqi_actual = 0
    if prev_year_date in all_data_indexed.index:
        looked_up_aqi = all_data_indexed.loc[prev_year_date, 'AQI']
        if not pd.isna(looked_up_aqi):
            prev_year_aqi_actual = looked_up_aqi
    prev_year_aqi_actual = float(prev_year_aqi_actual)

    # Only column 1 of the transformed row is used; the 0 in column 0 is a placeholder.
    scaled_prev_year_aqi = scaler.transform([[0, prev_year_aqi_actual]])[0, 1]

    # Slide the window: drop the oldest step and append the new
    # (prediction, previous-year feature) step.
    new_feature_vector = np.array([[next_day_pred_scaled, scaled_prev_year_aqi]])
    next_step = new_feature_vector.reshape(1, 1, num_features)
    current_input = np.concatenate((current_input[:, 1:, :], next_step), axis=1)

# Back to AQI units: pad with a zero column to match the scaler's two features.
scaled_prediction_column = np.array(predicted_aqi_scaled).reshape(-1, 1)
padded_future = np.hstack(
    (scaled_prediction_column, np.zeros_like(scaled_prediction_column))
)
predicted_aqi = scaler.inverse_transform(padded_future)[:, 0]

# Negative predictions are replaced with 0.
negative_mask = predicted_aqi < 0
predicted_aqi[negative_mask] = 0

# One row per forecast date with its predicted AQI.
predicted_df = pd.DataFrame(
    data={
        'Date': prediction_dates,
        'AQI_Predicted_Future': predicted_aqi.flatten(),
    }
)

print("Future AQI Predictions Complete. Sample predictions:")
# Show the first and last rows of the forecast.
for preview in (predicted_df.head(), predicted_df.tail()):
    print(preview)

# Combine historical values and forecasts on a shared date axis.
plot_df = (
    all_data[['Date', 'AQI']]
    .merge(predicted_df, on='Date', how='outer')
    .rename(columns={'AQI': 'AQI_Actual_Historical'})
    .sort_values('Date')
)

# In-sample fit: predictions exist only from the (seq_len + 1)-th training row onward.
train_fit_dates = full_training_data['Date'].iloc[seq_len:]
train_fit_df = pd.DataFrame({
    'Date': train_fit_dates,
    'AQI_Fit_Train': train_predictions_original_scale.flatten()
})
plot_df = plot_df.merge(train_fit_df, on='Date', how='left')

fig, ax = plt.subplots(figsize=(18, 8))

# (column, line style) for each series, drawn in this order.
series_styles = [
    ('AQI_Actual_Historical',
     dict(linestyle='-', markersize=3, label='Actual Historical AQI', color='blue', zorder=2)),
    ('AQI_Fit_Train',
     dict(linestyle='--', markersize=2, label='Model Fit on Training Data', color='green', zorder=3)),
    ('AQI_Predicted_Future',
     dict(linestyle='--', markersize=2, label='Predicted Future AQI (Apr 2025 - Dec 2025)', color='red', zorder=3)),
]
for column_name, line_style in series_styles:
    ax.plot(plot_df['Date'], plot_df[column_name], **line_style)

ax.set_title('Daily AQI in Delhi: Historical Data, Model Fit, and Future Predictions', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('AQI', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.6)
# Vertical marker at the boundary between training data and forecast.
ax.axvline(x=training_end_date, color='black', linestyle='--', label='Training End / Prediction Start')
ax.legend(fontsize=10)
fig.tight_layout()
plt.show()

print("\nAll processes and plots are complete.")
