STEP 1: DATA COLLECTION AND CLEANING

In [5]:
# Step 1: Import libraries

import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# Step 2: Dataset Path

DATASET_SRC = '/kaggle/input/time-series-air-quality-data-of-india-2010-2023'

In [6]:
# List all files inside the dataset folder
files = os.listdir(DATASET_SRC)
print("Total files available:", len(files))
print("First 10 files:", files[:10])  # show first 10 file names


Total files available: 454
First 10 files: ['DL023.csv', 'WB005.csv', 'MH018.csv', 'UP017.csv', 'BR008.csv', 'BR003.csv', 'MP016.csv', 'KA035.csv', 'CH001.csv', 'PY001.csv']


In [7]:
# Load one sample file
sample_file = os.path.join(DATASET_SRC, "DL023.csv")
df = pd.read_csv(sample_file)

# Check shape and first few rows
print("Shape:", df.shape)
df.head()

Shape: (45230, 22)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,From Date,To Date,PM2.5 (ug/m3),PM10 (ug/m3),NO (ug/m3),NO2 (ug/m3),NOx (ppb),NH3 (ug/m3),SO2 (ug/m3),CO (ug/m3),...,Toluene (),Temp (degree C),RH (%),WS (m/s),WD (degree),SR (W/mt2),BP (mmHg),VWS (m/s),AT (degree C),RF (mm)
0,2018-02-01 10:00:00,2018-02-01 11:00:00,322.0,487.0,4.53,26.33,18.72,24.92,11.06,0.58,...,,,36.6,1.86,298.6,601.0,994.0,0.0,22.16,
1,2018-02-01 11:00:00,2018-02-01 12:00:00,245.92,427.42,5.96,26.08,32.14,37.77,20.26,0.94,...,,,32.75,1.98,296.42,602.0,994.0,0.0,24.08,
2,2018-02-01 12:00:00,2018-02-01 13:00:00,176.67,368.83,2.7,15.93,18.62,38.67,12.48,0.73,...,,,34.17,1.95,297.0,602.92,992.92,0.0,25.52,
3,2018-02-01 13:00:00,2018-02-01 14:00:00,149.0,333.75,1.33,11.37,23.08,24.69,4.28,0.61,...,,,32.92,2.58,310.58,603.0,991.83,0.0,26.36,
4,2018-02-01 14:00:00,2018-02-01 15:00:00,113.08,273.25,1.22,15.52,33.15,7.96,0.53,0.52,...,,,31.58,2.8,318.08,603.0,990.92,0.0,26.77,


In [8]:
df_states = pd.read_csv(f'{DATASET_SRC}/stations_info.csv')
df_states.drop(columns=['agency', 'station_location', 'start_month'], inplace=True)
df_states.head()

Unnamed: 0,file_name,state,city,start_month_num,start_year
0,AP001,Andhra Pradesh,Tirupati,7,2016
1,AP002,Andhra Pradesh,Vijayawada,5,2017
2,AP003,Andhra Pradesh,Visakhapatnam,7,2017
3,AP004,Andhra Pradesh,Rajamahendravaram,9,2017
4,AP005,Andhra Pradesh,Amaravati,11,2017


In [9]:
unique_states = df_states['state'].unique()
unique_states

array(['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chhattisgarh', 'Chandigarh', 'Delhi', 'Gujarat',
       'Himachal Pradesh', 'Haryana', 'Jharkhand', 'Jammu and Kashmir',
       'Karnataka', 'Kerala', 'Maharashtra', 'Meghalaya', 'Manipur',
       'Madhya Pradesh', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
       'Puducherry', 'Rajasthan', 'Sikkim', 'Telangana', 'Tamil Nadu',
       'Tripura', 'Uttarakhand', 'Uttar Pradesh', 'West Bengal'],
      dtype=object)

In [10]:
def combine_state_df(state_name):
    '''
    Combine all state files into a single dataframe and attaching the city information.

    Parameters
    ----------
        state_name (str): The name of the state

    Return
    ------
        df (DataFrame): The combined dataframe from all files of a specific state
    '''
    
    state_code = df_states[df_states['state'] == state_name]['file_name'].iloc[0][:2]
    state_files = glob.glob(f'{DATASET_SRC}/{state_code}*.csv')
    print(f'Combining a total of {len(state_files)} files...\n')

    combined_df = []
    
    for state_file in state_files:
        file_name = state_file.split(f'{DATASET_SRC}/')[1][0:-4]
        file_df = pd.read_csv(state_file)
        file_df['city'] = df_states[df_states['file_name'] == file_name]['city'].values[0]
        file_df['city'] = file_df['city'].astype('string')
        combined_df.append(file_df)
        
    return pd.concat(combined_df)

In [11]:
df = combine_state_df('Delhi')
df.info()

Combining a total of 40 files...

<class 'pandas.core.frame.DataFrame'>
Index: 2796171 entries, 0 to 113961
Data columns (total 60 columns):
 #   Column               Dtype  
---  ------               -----  
 0   From Date            object 
 1   To Date              object 
 2   PM2.5 (ug/m3)        float64
 3   PM10 (ug/m3)         float64
 4   NO (ug/m3)           float64
 5   NO2 (ug/m3)          float64
 6   NOx (ppb)            float64
 7   NH3 (ug/m3)          float64
 8   SO2 (ug/m3)          float64
 9   CO (ug/m3)           float64
 10  Ozone (ug/m3)        float64
 11  Benzene (ug/m3)      float64
 12  Toluene ()           float64
 13  Temp (degree C)      float64
 14  RH (%)               float64
 15  WS (m/s)             float64
 16  WD (degree)          float64
 17  SR (W/mt2)           float64
 18  BP (mmHg)            float64
 19  VWS (m/s)            float64
 20  AT (degree C)        float64
 21  RF (mm)              float64
 22  city                 string 
 23  CO (

In [12]:
# combining all states 
all_states_df = []

for state in unique_states:
    try:
        state_df = combine_state_df(state)   # use your existing function
        all_states_df.append(state_df)
        print(f"{state} combined successfully with shape {state_df.shape}")
    except Exception as e:
        print(f"Error combining {state}: {e}")

# Concatenate into one master DataFrame
master_df = pd.concat(all_states_df, ignore_index=True)
print("Master dataset shape:", master_df.shape)

Combining a total of 10 files...

Andhra Pradesh combined successfully with shape (272217, 29)
Combining a total of 1 files...

Arunachal Pradesh combined successfully with shape (17655, 26)
Combining a total of 9 files...

Assam combined successfully with shape (75171, 26)
Combining a total of 35 files...

Bihar combined successfully with shape (741917, 32)
Combining a total of 14 files...

Chhattisgarh combined successfully with shape (62952, 28)
Combining a total of 3 files...

Chandigarh combined successfully with shape (56647, 26)
Combining a total of 40 files...

Delhi combined successfully with shape (2796171, 60)
Combining a total of 17 files...

Gujarat combined successfully with shape (442880, 31)
Combining a total of 1 files...

Himachal Pradesh combined successfully with shape (9247, 24)
Combining a total of 30 files...

Haryana combined successfully with shape (1228631, 51)
Combining a total of 2 files...

Jharkhand combined successfully with shape (52638, 21)
Combining a 

In [14]:
# remove the unnecessary columns

useful_cols = ['From Date', 'PM2.5 (ug/m3)', 'PM10 (ug/m3)', 
               'NO2 (ug/m3)', 'SO2 (ug/m3)', 'Ozone (ug/m3)', 'city']

master_df = master_df[useful_cols]

In [15]:
# print all columns
print(master_df.columns.tolist())


['From Date', 'PM2.5 (ug/m3)', 'PM10 (ug/m3)', 'NO2 (ug/m3)', 'SO2 (ug/m3)', 'Ozone (ug/m3)', 'city']


In [16]:
# for time series analysis , we have to use " from date" column as the main timestamp

# for that rename the "from date" to "date"

master_df = master_df.rename(columns = {"From Date": "date"})

In [22]:
# print all columns
print(master_df.columns.tolist())

['date', 'PM2.5 (ug/m3)', 'PM10 (ug/m3)', 'NO2 (ug/m3)', 'SO2 (ug/m3)', 'Ozone (ug/m3)', 'city']


In [25]:
master_df['date'] = pd.to_datetime(master_df['date'], errors='coerce')

In [27]:
# print column type

print(master_df['date'].dtype)

datetime64[ns]


In [28]:
# Feature engineering to extract time -based features 'from date' column
master_df['year']= master_df['date'].dt.year
master_df['year']

0           2017
1           2017
2           2017
3           2017
4           2017
            ... 
14295658    2023
14295659    2023
14295660    2023
14295661    2023
14295662    2023
Name: year, Length: 10357694, dtype: int32

In [29]:
master_df['month'] = master_df['date'].dt.month

In [32]:
master_df['day_of_week'] = master_df['date'].dt.dayofweek
master_df['day_of_week']

0           4
1           4
2           4
3           4
4           4
           ..
14295658    4
14295659    4
14295660    4
14295661    4
14295662    4
Name: day_of_week, Length: 10357694, dtype: int32

In [33]:
# handling missing values
# remove pm2.5 missing rows
master_df =  master_df.dropna(subset=['PM2.5 (ug/m3)'])

In [34]:
# fill other missing values 
master_df[['PM10 (ug/m3)', 'NO2 (ug/m3)', 'SO2 (ug/m3)', 'Ozone (ug/m3)']] = (
    master_df[['PM10 (ug/m3)', 'NO2 (ug/m3)', 'SO2 (ug/m3)', 'Ozone (ug/m3)']].fillna(method='ffill')
)

  master_df[['PM10 (ug/m3)', 'NO2 (ug/m3)', 'SO2 (ug/m3)', 'Ozone (ug/m3)']].fillna(method='ffill')


In [37]:
master_df.head()

Unnamed: 0,date,PM2.5 (ug/m3),PM10 (ug/m3),NO2 (ug/m3),SO2 (ug/m3),Ozone (ug/m3),city,year,month,day_of_week
0,2017-11-24 16:00:00,60.5,98.0,30.8,11.85,126.4,Amaravati,2017,11,4
1,2017-11-24 17:00:00,65.5,111.25,24.2,13.17,117.12,Amaravati,2017,11,4
2,2017-11-24 18:00:00,80.0,132.0,25.18,12.08,98.98,Amaravati,2017,11,4
3,2017-11-24 19:00:00,81.5,133.25,16.25,10.47,112.2,Amaravati,2017,11,4
4,2017-11-24 20:00:00,75.25,116.0,17.48,9.12,106.35,Amaravati,2017,11,4


In [41]:
# creating lag feature because it is useful in time series : as it helps in predicting today's PM2.5 using previous day
master_df['PM2.5_prev_day'] = master_df.groupby('city')['PM2.5 (ug/m3)'].shift(1)
master_df['PM2.5_prev_day'] 

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


0             NaN
1           60.50
2           65.50
3           80.00
4           81.50
            ...  
14295658    33.68
14295659    33.96
14295660    30.73
14295661    26.22
14295662    25.40
Name: PM2.5_prev_day, Length: 10357694, dtype: float64

In [43]:
#save the cleaned dataset

master_df.to_csv('/kaggle/working/air_quality_clean.csv', index=False)

STEP 2 : EXPLORATORY DATA ANALYSIS


In [None]:
# 1. temporal analysis(Yearly & Monthly Trends)
# average PM2.5 per year

avg_pm25_year = master_df.groupby('year')['PM2.5 (ug/m3)'].mean().reset_index()

plt.figure(figsize=(10,5))
sns.lineplot(data=avg_pm25_year, x='year', y='PM2.5 (ug/m3)', marker='o')
plt.title('Average PM2.5 per Year (India)')
plt.ylabel('PM2.5 (ug/m3)')
plt.xlabel('Year')
plt.show()