In [1]:
import os
import numpy as np
import glob
import pandas as pd


In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [3]:
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
DATA_RAW_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')
DATA_PROCESSED_DIR = os.path.join(PROJECT_ROOT, 'data', 'processed')

os.makedirs(DATA_PROCESSED_DIR, exist_ok=True)

print('DATA_RAW_DIR:', DATA_RAW_DIR)
print('DATA_PROCESSED_DIR:', DATA_PROCESSED_DIR)

DATA_RAW_DIR: c:\Users\arjun\aqi-forecasting-india-ml-dl\data\raw
DATA_PROCESSED_DIR: c:\Users\arjun\aqi-forecasting-india-ml-dl\data\processed


In [4]:
CITY_FILES={
    "Delhi":"delhi_combined.csv",
    "Mumbai":"mumbai_combined.csv",
    "Bengaluru":"bengaluru_combined.csv",
}

In [5]:
POLUT_COL=['PM2.5','PM10','NO2','NH3','CO','SO2','O3']

def load_city(filepath:str,city_name:str)->pd.DataFrame:
    df=pd.read_csv(filepath)
    
    df["Timestamp"]=pd.to_datetime(df["Timestamp"],dayfirst=True,errors='coerce')
    df=df.dropna(subset=["Timestamp"]).copy()
    df=df.sort_values("Timestamp")

    if df.duplicated(subset=["Timestamp"]).any():
        df=df.groupby("Timestamp",as_index=False).mean(numeric_only=True)
    
    df=df.set_index("Timestamp").sort_index()

    #Grid enforcement
    full_idx=pd.date_range(start=df.index.min(),end=df.index.max(),freq='D')
    df=df.reindex(full_idx)
    df.index.name="Timestamp"

    df=df.reset_index()
    df["City"]=city_name


    keep = ["Timestamp", "City"] + POLUT_COL
    missing = [c for c in keep if c not in df.columns]
    if missing:
        raise ValueError(f"{os.path.basename(filepath)} missing required columns: {missing}")

    return df[keep].copy()


In [6]:
dfs=[]
for city, filename in CITY_FILES.items():
    fpath=os.path.join(DATA_RAW_DIR,filename)
    dfs.append(load_city(fpath,city))

df=pd.concat(dfs,ignore_index=True)

print("Final DataFrame shape:",df.shape)
df.head()

Final DataFrame shape: (5481, 9)


Unnamed: 0,Timestamp,City,PM2.5,PM10,NO2,NH3,CO,SO2,O3
0,2020-01-01,Delhi,420.68,507.6,105.21,63.96,2.26,6.3,9.49
1,2020-01-02,Delhi,364.73,480.09,79.38,51.28,2.87,9.69,8.03
2,2020-01-03,Delhi,227.72,309.23,57.22,44.29,2.69,11.04,8.18
3,2020-01-04,Delhi,243.86,323.75,60.99,44.88,1.94,12.61,11.72
4,2020-01-05,Delhi,161.66,222.79,56.22,38.35,1.91,11.03,12.37


In [7]:
for col in POLUT_COL:
    df[f'{col}_was_missing']=df[col].isna().astype(int)

df[['City','Timestamp']+[f"{c}_was_missing" for c in POLUT_COL]].head()

Unnamed: 0,City,Timestamp,PM2.5_was_missing,PM10_was_missing,NO2_was_missing,NH3_was_missing,CO_was_missing,SO2_was_missing,O3_was_missing
0,Delhi,2020-01-01,0,0,0,0,0,0,0
1,Delhi,2020-01-02,0,0,0,0,0,0,0
2,Delhi,2020-01-03,0,0,0,0,0,0,0
3,Delhi,2020-01-04,0,0,0,0,0,0,0
4,Delhi,2020-01-05,0,0,0,0,0,0,0


In [8]:
Gap=7

def interpolate_short_gaps_city(city_df:pd.DataFrame)->pd.DataFrame:
    city_df=city_df.sort_values("Timestamp").copy()
    city_df=city_df.set_index('Timestamp')

    for col in POLUT_COL:
        city_df[col]=city_df[col].interpolate(
            method='time',
            limit=Gap,
            limit_direction='both'
        )

        city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
        city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
    
    city_df=city_df.reset_index()
    return city_df

df=df.groupby('City',group_keys=False).apply(interpolate_short_gaps_city)
df.head()

  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='ffill', limit=Gap)
  city_df[col]=city_df[col].fillna(method='bfill', limit=Gap)
  city_d

Unnamed: 0,Timestamp,City,PM2.5,PM10,NO2,NH3,CO,SO2,O3,PM2.5_was_missing,PM10_was_missing,NO2_was_missing,NH3_was_missing,CO_was_missing,SO2_was_missing,O3_was_missing
0,2020-01-01,Bengaluru,43.67,134.0,20.28,10.98,0.91,3.41,21.82,1,1,1,1,1,1,1
1,2020-01-02,Bengaluru,43.67,134.0,20.28,10.98,0.91,3.41,21.82,0,0,0,0,0,0,0
2,2020-01-03,Bengaluru,30.58,74.42,15.17,12.1,0.96,3.27,23.31,0,0,0,0,0,0,0
3,2020-01-04,Bengaluru,66.35,155.68,42.9,11.75,2.54,3.26,29.7,0,0,0,0,0,0,0
4,2020-01-05,Bengaluru,48.0,99.13,18.56,9.79,1.14,2.95,31.01,0,0,0,0,0,0,0


In [9]:
miss_seq=(
    df.groupby('City')[POLUT_COL]
      .apply(lambda x:(x.isna().mean()*100).round(2))
      .reset_index()
)

miss_seq

Unnamed: 0,City,PM2.5,PM10,NO2,NH3,CO,SO2,O3
0,Bengaluru,0.0,0.0,0.27,0.27,0.0,2.74,1.2
1,Delhi,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Mumbai,0.22,0.0,0.0,0.0,0.0,1.75,0.0


In [10]:
AQI_BREAKPOINTS=[
    (0,50),
    (51,100),
    (101,200),
    (201,300),
    (301,400),
    (401,500)
]

CPCB_BREAKPOINTS={
    'PM2.5':[
        (0,30),
        (31,60),
        (61,90),
        (91,120),
        (121,250),
        (251,500)
    ],
    'PM10':[
        (0,50),
        (51,100),
        (101,250),
        (251,350),
        (351,430),
        (431,600)
    ],
    'NO2':[
        (0,40),
        (41,80),
        (81,180),
        (181,280),
        (281,400),
        (401,1000)
    ],
    'NH3':[
        (0,200),
        (201,400),
        (401,800),
        (801,1200),
        (1201,1800),
        (1801,2000)
    ],
    'CO':[
        (0.0,1.0),
        (1.1,2.0),
        (2.1,10.0),
        (10.1,17.0),
        (17.1,34.0),
        (34.1,50.0)
    ],
    'SO2':[
        (0,40),
        (41,80),
        (81,380),
        (381,800),
        (801,1600),
        (1601,2000)
    ],
    'O3':[
        (0,50),
        (51,100),
        (101,168),
        (169,208),
        (209,748),
        (749,1000)
    ]
}

In [11]:
def compute_aqi_subindex(conc, pollutant: str):
    if pd.isna(conc):
        return np.nan

    if conc < 0:
        return np.nan

    c_bands = CPCB_BREAKPOINTS[pollutant]

    if conc > c_bands[-1][1]:
        return 500.0

    for (I_low, I_high), (C_low, C_high) in zip(AQI_BREAKPOINTS, c_bands):
        if C_low <= conc <= C_high:
            if C_high == C_low:
                return float(I_high)

            sub = ((I_high - I_low) / (C_high - C_low)) * (conc - C_low) + I_low

            return float(min(500.0, max(0.0, sub)))

    return np.nan    

In [12]:
sub_cols = []
for col in POLUT_COL:
    sc = f"{col}_sub"
    df[sc] = df[col].apply(lambda x: compute_aqi_subindex(x, col))
    sub_cols.append(sc)

df[['City','Timestamp']+sub_cols].head()

Unnamed: 0,City,Timestamp,PM2.5_sub,PM10_sub,NO2_sub,NH3_sub,CO_sub,SO2_sub,O3_sub
0,Bengaluru,2020-01-01,72.407931,122.926174,25.35,2.745,45.5,4.2625,21.82
1,Bengaluru,2020-01-02,72.407931,122.926174,25.35,2.745,45.5,4.2625,21.82
2,Bengaluru,2020-01-03,,74.42,18.9625,3.025,48.0,4.0875,23.31
3,Bengaluru,2020-01-04,119.263793,137.331007,53.387179,2.9375,106.513924,4.075,29.7
4,Bengaluru,2020-01-05,79.724138,99.13,23.2,2.4475,53.177778,3.6875,31.01


In [13]:
df["n_avail_sub"]=df[sub_cols].notna().sum(axis=1)
df['AQI']=df[sub_cols].max(axis=1).clip(0,500)


df[['City','Timestamp','AQI','n_avail_sub']].head()

Unnamed: 0,City,Timestamp,AQI,n_avail_sub
0,Bengaluru,2020-01-01,122.926174,7
1,Bengaluru,2020-01-02,122.926174,7
2,Bengaluru,2020-01-03,74.42,6
3,Bengaluru,2020-01-04,137.331007,7
4,Bengaluru,2020-01-05,99.13,7


In [14]:
aqi_smry=(
    df.groupby('City')['AQI']
      .agg(['count','min','median','max'])
      .reset_index()
)

aqi_missing_pct=(
    df.groupby('City')['AQI']
      .apply(lambda x:(x.isna().mean()*100).round(2))
      .reset_index(name='AQI_missing_%')
)

aqi_smry,aqi_missing_pct

(        City  count        min      median    max
 0  Bengaluru   1827  16.816667   80.730000  500.0
 1      Delhi   1827  37.183333  183.655034  500.0
 2     Mumbai   1827  14.762500  103.338792  500.0,
         City  AQI_missing_%
 0  Bengaluru            0.0
 1      Delhi            0.0
 2     Mumbai            0.0)

In [15]:
keep_cols=(
    ['Timestamp','City']
    +POLUT_COL
    +[f"{c}_was_missing" for c in POLUT_COL]
    +['AQI','n_avail_sub']
)

df_outp=df[keep_cols].copy()
out_path=os.path.join(DATA_PROCESSED_DIR,'aqi_processed_data.csv')
df_outp.to_csv(out_path,index=False)

print(f"Processed data saved to: {out_path}")
df_outp.head()

Processed data saved to: c:\Users\arjun\aqi-forecasting-india-ml-dl\data\processed\aqi_processed_data.csv


Unnamed: 0,Timestamp,City,PM2.5,PM10,NO2,NH3,CO,SO2,O3,PM2.5_was_missing,PM10_was_missing,NO2_was_missing,NH3_was_missing,CO_was_missing,SO2_was_missing,O3_was_missing,AQI,n_avail_sub
0,2020-01-01,Bengaluru,43.67,134.0,20.28,10.98,0.91,3.41,21.82,1,1,1,1,1,1,1,122.926174,7
1,2020-01-02,Bengaluru,43.67,134.0,20.28,10.98,0.91,3.41,21.82,0,0,0,0,0,0,0,122.926174,7
2,2020-01-03,Bengaluru,30.58,74.42,15.17,12.1,0.96,3.27,23.31,0,0,0,0,0,0,0,74.42,6
3,2020-01-04,Bengaluru,66.35,155.68,42.9,11.75,2.54,3.26,29.7,0,0,0,0,0,0,0,137.331007,7
4,2020-01-05,Bengaluru,48.0,99.13,18.56,9.79,1.14,2.95,31.01,0,0,0,0,0,0,0,99.13,7
