In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np
import os

# Folder containing cleaned city CSV files
data_folder = r"E:\Codes\Projects\ML\air_quality\data\cleaned_city_data"
os.makedirs("forecasts", exist_ok=True)
os.makedirs("plots", exist_ok=True)

# City list
cities = [
    'Ahmedabad', 'Aizawl', 'Amaravati', 'Amritsar', 'Bengaluru', 'Bhopal',
    'Brajrajnagar', 'Chandigarh', 'Chennai', 'Coimbatore', 'Delhi', 'Ernakulam',
    'Gurugram', 'Guwahati', 'Hyderabad', 'Jaipur', 'Jorapokhar', 'Kochi', 'Kolkata',
    'Lucknow', 'Mumbai', 'Patna', 'Shillong', 'Talcher', 'Thiruvananthapuram',
    'Visakhapatnam'
]

results = []

for city_name in cities:
    print(f"\n=== Processing {city_name} ===")
    file_path = os.path.join(data_folder, f"{city_name.replace(' ', '_')}.csv")

    if not os.path.exists(file_path):
        print(f"⚠️ File not found for {city_name}. Skipping.")
        continue

    # Load and clean
    df = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    df['AQI'] = pd.to_numeric(df['AQI'], errors='coerce')
    df.dropna(subset=['AQI'], inplace=True)

    if df.empty:
        print(f"⚠️ No valid AQI data for {city_name}. Skipping.")
        continue

    # Train-test split
    train_size = int(len(df) * 0.8)
    train, test = df.iloc[:train_size], df.iloc[train_size:]

    # Train ARIMA model
    try:
        model = ARIMA(train['AQI'], order=(5, 1, 2))
        model_fit = model.fit()
    except Exception as e:
        print(f"⚠️ Model failed for {city_name}: {e}")
        continue

    # Forecast
    forecast = model_fit.forecast(steps=len(test))
    test['Predicted_AQI'] = forecast.values

    # RMSE
    rmse = np.sqrt(mean_squared_error(test['AQI'], test['Predicted_AQI']))
    results.append({"City": city_name, "RMSE": rmse})
    print(f"RMSE for {city_name}: {rmse:.2f}")

    # Plot
    plt.figure(figsize=(14, 6))
    plt.plot(train.index, train['AQI'], label='Training Data')
    plt.plot(test.index, test['AQI'], label='Actual AQI', color='blue')
    plt.plot(test.index, test['Predicted_AQI'], label='Predicted AQI', color='red')
    plt.title(f"AQI Prediction for {city_name} using ARIMA")
    plt.xlabel("Date")
    plt.ylabel("AQI")
    plt.legend()
    plt.savefig(f"plots/{city_name.replace(' ', '_')}_arima.png")
    plt.close()

# Save results
if results:
    results_df = pd.DataFrame(results).sort_values(by="RMSE")
    print("\n=== Model Performance Summary ===")
    print(results_df)
    results_df.to_csv("forecasts/ARIMA_Model_Results.csv", index=False)
else:
    print("No cities processed successfully. Check file paths.")



=== Processing Ahmedabad ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Ahmedabad: 261.69

=== Processing Aizawl ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Aizawl: 5.04

=== Processing Amaravati ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Amaravati: 68.42

=== Processing Amritsar ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Amritsar: 113.92

=== Processing Bengaluru ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Bengaluru: 36.14

=== Processing Bhopal ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Bhopal: 34.42

=== Processing Brajrajnagar ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Brajrajnagar: 61.50

=== Processing Chandigarh ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Chandigarh: 26.10

=== Processing Chennai ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Chennai: 37.04

=== Processing Coimbatore ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Coimbatore: 37.36

=== Processing Delhi ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Delhi: 114.04

=== Processing Ernakulam ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


RMSE for Ernakulam: 8.45

=== Processing Gurugram ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


RMSE for Gurugram: 129.00

=== Processing Guwahati ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Guwahati: 141.92

=== Processing Hyderabad ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Hyderabad: 46.91

=== Processing Jaipur ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Jaipur: 52.35

=== Processing Jorapokhar ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Jorapokhar: 53.52

=== Processing Kochi ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Kochi: 12.67

=== Processing Kolkata ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Kolkata: 129.25

=== Processing Lucknow ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Lucknow: 97.93

=== Processing Mumbai ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


RMSE for Mumbai: 54.06

=== Processing Patna ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Patna: 118.41

=== Processing Shillong ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RMSE for Shillong: 12.17

=== Processing Talcher ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Talcher: 170.18

=== Processing Thiruvananthapuram ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Thiruvananthapuram: 23.40

=== Processing Visakhapatnam ===


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_AQI'] = forecast.values


RMSE for Visakhapatnam: 47.17

=== Model Performance Summary ===
                  City        RMSE
1               Aizawl    5.039090
11           Ernakulam    8.454665
22            Shillong   12.169781
17               Kochi   12.669503
24  Thiruvananthapuram   23.395951
7           Chandigarh   26.102166
5               Bhopal   34.415107
4            Bengaluru   36.140913
8              Chennai   37.043292
9           Coimbatore   37.358912
14           Hyderabad   46.912912
25       Visakhapatnam   47.170846
15              Jaipur   52.350638
16          Jorapokhar   53.518665
20              Mumbai   54.059539
6         Brajrajnagar   61.501030
2            Amaravati   68.421004
19             Lucknow   97.929761
3             Amritsar  113.924561
10               Delhi  114.036240
21               Patna  118.413296
12            Gurugram  129.004862
18             Kolkata  129.250794
13            Guwahati  141.920186
23             Talcher  170.178584
0            Ahmedabad  2