# Week 1 — Clean-Air Companion
**Goal:** Load the hourly air-quality data for Mumbai, clean it, compute the AQI using CPCB breakpoints, identify safe/unsafe hours, perform EDA, and export a cleaned 30-day sample.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (city_hour.csv)
file_path = "city_hour.csv"  # adjust path if needed
df = pd.read_csv(file_path, low_memory=False)

# Convert datetime; unparseable timestamps become NaT (errors='coerce')
df['Datetime'] = pd.to_datetime(df['Datetime'], errors='coerce')

# Filter Mumbai (case-insensitive) and drop rows whose timestamp failed to parse.
# A NaT in the index would make interpolate(method='time') raise below.
is_mumbai = df['City'].str.lower() == "mumbai"
has_timestamp = df['Datetime'].notna()
df_mumbai = df[is_mumbai & has_timestamp].copy()
df_mumbai = df_mumbai.set_index('Datetime').sort_index()

# Select pollutants and force them to numeric so a stray non-numeric entry
# becomes NaN (and is interpolated) instead of breaking the interpolation.
pollutants = ['PM2.5','PM10','NO2','O3','CO','SO2']
df_mumbai = df_mumbai[pollutants].apply(pd.to_numeric, errors='coerce')

# Interpolate missing values, weighting by the time distance between rows.
# limit=6 fills at most 6 consecutive missing rows per gap; longer gaps
# keep NaN beyond that.
df_clean = df_mumbai.interpolate(method='time', limit=6)
df_clean.head()


In [None]:

# Define CPCB breakpoints
# Each entry is (bp_lo, bp_hi, i_lo, i_hi): a concentration bracket
# [bp_lo, bp_hi] and the sub-index range [i_lo, i_hi] it maps onto.
# Brackets are listed in ascending order; sub_index relies on that order.
bp = {
    'PM2.5': [(0,30,0,50),(31,60,51,100),(61,90,101,200),
              (91,120,201,300),(121,250,301,400),(251,5000,401,500)],
    'PM10': [(0,50,0,50),(51,100,51,100),(101,250,101,200),
             (251,350,201,300),(351,430,301,400),(431,10000,401,500)]
}

def sub_index(conc, pollutant):
    """Convert a pollutant concentration to its sub-index via the ``bp`` table.

    Parameters
    ----------
    conc : float
        Measured concentration. Missing values (NaN/None) yield ``None``.
    pollutant : str
        Key into ``bp``; pollutants without a breakpoint table yield ``None``.

    Returns
    -------
    float or None
        Linearly interpolated sub-index, or ``None`` when ``conc`` is missing,
        below the first bracket, above the last bracket, or the pollutant is
        unknown.

    Notes
    -----
    The brackets have integer edges (e.g. 0-30 then 31-60), so a continuous
    reading such as 30.5 falls between two brackets. Such values are
    interpolated between the previous bracket's top (bp_hi, i_hi) and the next
    bracket's bottom (bp_lo, i_lo) instead of being dropped. Values inside a
    bracket are computed exactly as before.
    """
    if pd.isna(conc) or pollutant not in bp:
        return None

    prev_bp_hi = None  # upper concentration edge of the previous bracket
    prev_i_hi = None   # sub-index at that edge
    for (bp_lo, bp_hi, i_lo, i_hi) in bp[pollutant]:
        if conc < bp_lo:
            if prev_bp_hi is None:
                # Below the very first bracket (e.g. a negative reading).
                return None
            # conc sits in the gap between two consecutive brackets.
            return prev_i_hi + (i_lo - prev_i_hi) * (conc - prev_bp_hi) / (bp_lo - prev_bp_hi)
        if conc <= bp_hi:
            return ((i_hi - i_lo)/(bp_hi - bp_lo))*(conc - bp_lo) + i_lo
        prev_bp_hi, prev_i_hi = bp_hi, i_hi

    # Above the last bracket's upper edge.
    return None

# Compute sub-indices & AQI
# One sub-index column per pollutant that has a breakpoint table.
df_clean['pm25_si'] = df_clean['PM2.5'].apply(sub_index, args=('PM2.5',))
df_clean['pm10_si'] = df_clean['PM10'].apply(sub_index, args=('PM10',))

# The AQI of an hour is the larger of its available sub-indices; a missing
# sub-index is skipped, and the AQI is NaN only when both are missing.
df_clean['AQI'] = df_clean[['pm25_si','pm10_si']].max(axis='columns', skipna=True)

def aqi_category(aqi):
    """Return the category label for an AQI value, or None when it is missing.

    Bands are checked in ascending order with inclusive upper bounds;
    anything above 400 is "Severe".
    """
    if pd.isna(aqi):
        return None

    # (inclusive upper bound, label), ascending.
    bands = (
        (50, "Good"),
        (100, "Satisfactory"),
        (200, "Moderate"),
        (300, "Poor"),
        (400, "Very Poor"),
    )
    for upper, label in bands:
        if aqi <= upper:
            return label
    return "Severe"

# Attach the category label for each hour's AQI.
df_clean['AQI_category'] = df_clean['AQI'].apply(aqi_category)

# An hour counts as safe when its AQI is at most 100.
# A NaN AQI compares False, so hours without an AQI are flagged as not safe.
df_clean['is_safe_hour'] = df_clean['AQI'].le(100)

# Preview the inputs next to the derived columns.
df_clean.loc[:, ['PM2.5','PM10','AQI','AQI_category','is_safe_hour']].head()


In [None]:

# Add time features
df_clean['hour'] = df_clean.index.hour
df_clean['weekday'] = df_clean.index.weekday  # Monday=0 ... Sunday=6

# Export last 30 days
# Select by timestamp instead of by row count: tail(24*30) only equals
# 30 days when every hour is present exactly once. With a gap-free hourly
# index this window contains the same 720 rows.
cutoff = df_clean.index.max() - pd.Timedelta(days=30)
sample = df_clean.loc[df_clean.index > cutoff].copy()
sample.to_csv("sample_data_cleaned.csv", index=True)
print("Exported sample_data_cleaned.csv with", len(sample), "rows")


In [None]:

# EDA plots
sns.set(style="whitegrid")

# 1) AQI time series over the whole Mumbai record.
fig, ax = plt.subplots(figsize=(14,4))
df_clean['AQI'].plot(ax=ax, title="AQI over time (Mumbai)")
ax.set_ylabel("AQI")
plt.show()

# 2) Histogram (with KDE) of the hours that have an AQI value.
fig, ax = plt.subplots(figsize=(8,4))
sns.histplot(df_clean['AQI'].dropna(), bins=40, kde=True, ax=ax)
ax.set_title("Distribution of AQI (Mumbai)")
plt.show()

# 3) Mean AQI for each hour of the day (uses the 'hour' column of df_clean).
diurnal = df_clean.groupby('hour')['AQI'].mean()
fig, ax = plt.subplots(figsize=(8,4))
diurnal.plot(ax=ax, marker='o')
ax.set_title("Average AQI by Hour of Day")
ax.set_xlabel("Hour")
ax.set_ylabel("AQI")
plt.show()
