In [2]:
# Import Libraries
import pandas as pd
import numpy as np

In [4]:
# Load Data
# Read the two raw CSV tables from the current working directory; the first
# path feeds `city_day`, the second feeds `station_day`.
city_day, station_day = (
    pd.read_csv(csv_name) for csv_name in ('city_day.csv', 'station_day.csv')
)

In [6]:
# Handle Missing Values
# Pollutant concentration columns shared by both tables.
pollutants = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
              'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']


def _interpolate_within(frame, key, column='AQI'):
    """Interpolate ``column`` separately inside each ``key`` group of ``frame``.

    Interpolating the whole column at once would blend the last readings of
    one location into the first gap of the next one, because the rows of all
    locations are stacked in a single table. Grouping keeps every location's
    series independent.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding both ``key`` and ``column``.
    key : str
        Name of the column identifying the location (e.g. 'City').
    column : str, default 'AQI'
        Name of the numeric column to interpolate.

    Returns
    -------
    pandas.Series
        Interpolated values indexed like ``frame``. Gaps at the start or end
        of a group are filled with that group's nearest valid value
        (``limit_direction='both'``). A group with no valid value at all
        stays NaN, and rows whose ``key`` is missing keep their original
        value.

    Notes
    -----
    Rows are used in their existing order; this assumes each group is already
    sorted chronologically in the file -- TODO confirm.
    """
    filled = frame.groupby(key, sort=False)[column].transform(
        lambda series: series.interpolate(limit_direction='both')
    )
    # groupby leaves out rows whose key is NaN; fall back to the untouched
    # original values for any row the grouped result does not cover.
    return filled.reindex(frame.index).fillna(frame[column])


# Replace missing pollutant readings with the mean of their own column,
# computed separately for each table.
city_day[pollutants] = city_day[pollutants].fillna(city_day[pollutants].mean())
station_day[pollutants] = station_day[pollutants].fillna(station_day[pollutants].mean())

# Fill AQI gaps per location so values never leak between cities/stations.
city_day['AQI'] = _interpolate_within(city_day, 'City')
station_day['AQI'] = _interpolate_within(station_day, 'StationId')

In [8]:
# Date Conversion
# Parse the 'Date' column of both tables into pandas datetimes, replacing the
# column in place on each frame.
for df in (city_day, station_day):
    df['Date'] = pd.to_datetime(df['Date'])

In [10]:
# Feature Engineering
def assign_season(month):
    """Return the season label for a month number.

    Parameters
    ----------
    month : int
        Month number; the notebook passes values from the 'Month' column.

    Returns
    -------
    str
        'Summer' for months 3-5, 'Monsoon' for 6-8, 'Autumn' for 9-11 and
        'Winter' for every other value (months 12, 1, 2 and anything that
        matches none of the listed months).
    """
    season_months = (
        ('Summer', (3, 4, 5)),
        ('Monsoon', (6, 7, 8)),
        ('Autumn', (9, 10, 11)),
    )
    for label, months in season_months:
        if month in months:
            return label
    # Anything not listed above falls through to the default season.
    return 'Winter'

# Add calendar fields, a row-wise pollutant average and a season label to
# both tables. Columns are added in place and in a fixed order.
for df in (city_day, station_day):
    dates = df['Date'].dt  # datetime accessor reused for every calendar field
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    df['DayofWeek'] = dates.dayofweek
    df['Quarter'] = dates.quarter
    # Mean across the pollutant columns of each row.
    df['Avg_Pollutant'] = df.loc[:, pollutants].mean(axis='columns')
    # Season is derived from the 'Month' column created above.
    df['Season'] = df['Month'].apply(assign_season)

In [12]:
# Encode Categorical Variables
# One-hot encode the location identifier and the season of each table.
# drop_first=True omits the indicator for the first level of every encoded
# column.
city_day = pd.get_dummies(
    data=city_day,
    columns=['City', 'Season'],
    drop_first=True,
)
station_day = pd.get_dummies(
    data=station_day,
    columns=['StationId', 'Season'],
    drop_first=True,
)

In [14]:
# Save Preprocessed Files
# Write each processed table to its own CSV, leaving the row index out of
# the file, then report completion.
for df, csv_path in (
    (city_day, 'preprocessed_city_day.csv'),
    (station_day, 'preprocessed_station_day.csv'),
):
    df.to_csv(csv_path, index=False)

print(" Preprocessing complete. Preprocessed CSV files saved.")

 Preprocessing complete. Preprocessed CSV files saved.
