<a href="https://colab.research.google.com/github/AbhayBhise/Week1/blob/main/air_quality_project_week1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Air Quality Prediction – Week 1**
Environmental Monitoring & Pollution Control

## Overview
This notebook downloads the dataset directly from Kaggle (no Google Drive needed),
performs initial exploration, and sets up the project for further modeling.


In [None]:

# 1. Kaggle API Setup

from google.colab import files
import os, shutil

print("Upload your kaggle.json file:")
uploaded = files.upload()


os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
shutil.move("kaggle.json", os.path.expanduser("~/.kaggle/kaggle.json"))
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), 0o600)

print("Kaggle API is ready.")


Upload your kaggle.json file:


Saving kaggle.json to kaggle.json
Kaggle API is ready.


In [None]:

# 2. Download Dataset

!pip install -q kaggle

DATASET_SLUG = "rohanrao/air-quality-data-in-india"
DOWNLOAD_DIR = "/content/data"

os.makedirs(DOWNLOAD_DIR, exist_ok=True)
!kaggle datasets download -d {DATASET_SLUG} -p {DOWNLOAD_DIR} --force
!unzip -o "{DOWNLOAD_DIR}/*.zip" -d {DOWNLOAD_DIR} >/dev/null
!ls -lh {DOWNLOAD_DIR}


Dataset URL: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india
License(s): CC0-1.0
Downloading air-quality-data-in-india.zip to /content/data
  0% 0.00/72.9M [00:00<?, ?B/s]
100% 72.9M/72.9M [00:00<00:00, 817MB/s]
total 356M
-rw-r--r-- 1 root root  73M Jul 28  2020 air-quality-data-in-india.zip
-rw-r--r-- 1 root root 2.5M Jul 28  2020 city_day.csv
-rw-r--r-- 1 root root  63M Jul 28  2020 city_hour.csv
-rw-r--r-- 1 root root 8.3M Jul 28  2020 station_day.csv
-rw-r--r-- 1 root root 210M Jul 28  2020 station_hour.csv
-rw-r--r-- 1 root root  15K Jul 28  2020 stations.csv


In [None]:

# 3. Load Dataset
import pandas as pd
import glob

csv_files = glob.glob(f"{DOWNLOAD_DIR}/*.csv")
print("Available CSV files:", csv_files)

# Pick main file
df = pd.read_csv(csv_files[0])
df.head()


Available CSV files: ['/content/data/station_hour.csv', '/content/data/city_day.csv', '/content/data/city_hour.csv', '/content/data/stations.csv', '/content/data/station_day.csv']


  df = pd.read_csv(csv_files[0])


Unnamed: 0,StationId,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,AP001,2017-11-24 17:00:00,60.5,98.0,2.35,30.8,18.25,8.5,0.1,11.85,126.4,0.1,6.1,0.1,,
1,AP001,2017-11-24 18:00:00,65.5,111.25,2.7,24.2,15.07,9.77,0.1,13.17,117.12,0.1,6.25,0.15,,
2,AP001,2017-11-24 19:00:00,80.0,132.0,2.1,25.18,15.15,12.02,0.1,12.08,98.98,0.2,5.98,0.18,,
3,AP001,2017-11-24 20:00:00,81.5,133.25,1.95,16.25,10.23,11.58,0.1,10.47,112.2,0.2,6.72,0.1,,
4,AP001,2017-11-24 21:00:00,75.25,116.0,1.43,17.48,10.43,12.03,0.1,9.12,106.35,0.2,5.75,0.08,,


In [None]:

# 4. Basic Exploration
df.info()
df.describe().T
df.isnull().sum().sort_values(ascending=False).head(20)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2589083 entries, 0 to 2589082
Data columns (total 16 columns):
 #   Column      Dtype  
---  ------      -----  
 0   StationId   object 
 1   Datetime    object 
 2   PM2.5       float64
 3   PM10        float64
 4   NO          float64
 5   NO2         float64
 6   NOx         float64
 7   NH3         float64
 8   CO          float64
 9   SO2         float64
 10  O3          float64
 11  Benzene     float64
 12  Toluene     float64
 13  Xylene      float64
 14  AQI         float64
 15  AQI_Bucket  object 
dtypes: float64(13), object(3)
memory usage: 316.1+ MB


Unnamed: 0,0
Xylene,2075104
NH3,1236618
PM10,1119252
Toluene,1042366
Benzene,861579
SO2,742737
O3,725973
PM2.5,647689
AQI_Bucket,570190
AQI,570190


## **Next Steps**
- Clean missing values
- Create AQI calculation features
- Train ML model
- Add visualizations (trends, seasonal effects)


In [5]:


import pandas as pd
import numpy as np
import os


# Loading the main dataset
csv_file = "/content/city_day.csv"
df = pd.read_csv(csv_file)

print("Initial shape:", df.shape)
df.head()


Initial shape: (29531, 16)


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [6]:

# Data Cleaning


# Droping completely empty columns
df.dropna(axis=1, how='all', inplace=True)

# Handling missing values
missing_perc = df.isnull().mean() * 100
print("Missing value percentage per column:\n", missing_perc)

# Define essential columns
essential_cols = ['PM2.5','PM10','NO','NO2','NOx','NH3','CO','SO2','O3','Benzene','Toluene','Xylene','AQI']
df.dropna(subset=essential_cols, how='all', inplace=True)

# Fill missing pollutant values with median per city
df[essential_cols] = df.groupby('City')[essential_cols].transform(lambda x: x.fillna(x.median()))


Missing value percentage per column:
 City           0.000000
Date           0.000000
PM2.5         15.570079
PM10          37.723071
NO            12.129626
NO2           12.139785
NOx           14.171549
NH3           34.973418
CO             6.972334
SO2           13.050692
O3            13.619586
Benzene       19.041008
Toluene       27.229014
Xylene        61.322001
AQI           15.851139
AQI_Bucket    15.851139
dtype: float64


In [7]:
# Convert Data Types


# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Remove rows with invalid dates
df.dropna(subset=['Date'], inplace=True)


In [8]:
# Feature Engineering


# Extract date features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.day_name()

# Fill AQI or AQI_Bucket if missing
if 'AQI' in df.columns:
    df['AQI'] = df['AQI'].fillna(df['AQI'].median())
if 'AQI_Bucket' in df.columns:
    df['AQI_Bucket'] = df['AQI_Bucket'].fillna('Unknown')


In [9]:
# Outlier Handling


# IQR to find extreme values
for col in essential_cols:
    if col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        df[col] = np.clip(df[col], lower_limit, upper_limit)


In [11]:
# Final Cleaned Dataset


print("Final shape after preprocessing:", df.shape)
print("Columns:", df.columns.tolist())

# Saving cleaned dataset
cleaned_path = "cleaned_air_quality_day.csv"

df.to_csv(cleaned_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_path}")


Final shape after preprocessing: (28157, 20)
Columns: ['City', 'Date', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket', 'Year', 'Month', 'Day', 'Weekday']
Cleaned dataset saved to: cleaned_air_quality_day.csv
