In [2]:
# Week 1: Dataset Exploration
import pandas as pd

# Load main dataset
df = pd.read_csv("data/city_day.csv")

# Preview top rows
print("üìä First 5 rows:")
display(df.head())

# Basic info
print("\nüìò Dataset Info:")
df.info()



üìä First 5 rows:


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,



üìò Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [3]:
# Summary statistics
print("\nüìà Statistical Summary:")
display(df.describe())

# Missing values
print("\n‚ùå Missing Values:")
display(df.isnull().sum())

# Duplicate check
duplicates = df.duplicated().sum()
print(f"\nüßæ Number of duplicate rows: {duplicates}")



üìà Statistical Summary:


Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
count,24933.0,18391.0,25949.0,25946.0,25346.0,19203.0,27472.0,25677.0,25509.0,23908.0,21490.0,11422.0,24850.0
mean,67.450578,118.127103,17.57473,28.560659,32.309123,23.483476,2.248598,14.531977,34.49143,3.28084,8.700972,3.070128,166.463581
std,64.661449,90.60511,22.785846,24.474746,31.646011,25.684275,6.962884,18.133775,21.694928,15.811136,19.969164,6.323247,140.696585
min,0.04,0.01,0.02,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,13.0
25%,28.82,56.255,5.63,11.75,12.82,8.58,0.51,5.67,18.86,0.12,0.6,0.14,81.0
50%,48.57,95.68,9.89,21.69,23.52,15.85,0.89,9.16,30.84,1.07,2.97,0.98,118.0
75%,80.59,149.745,19.95,37.62,40.1275,30.02,1.45,15.22,45.57,3.08,9.15,3.35,208.0
max,949.99,1000.0,390.68,362.21,467.63,352.89,175.81,193.86,257.73,455.03,454.85,170.37,2049.0



‚ùå Missing Values:


City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64


üßæ Number of duplicate rows: 0


In [1]:
# --- Phase 2: Data Preprocessing & Model Training ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
# 1Ô∏è‚É£ Load Dataset
df = pd.read_csv("data/city_day.csv")
print("‚úÖ Data loaded successfully!")

‚úÖ Data loaded successfully!


In [3]:
# 2Ô∏è‚É£ Clean Data
df = df.drop(columns=['City', 'Date', 'AQI_Bucket'], errors='ignore')
df = df.fillna(df.median(numeric_only=True))
print("‚úÖ Missing values handled.")

‚úÖ Missing values handled.


In [4]:
# 3Ô∏è‚É£ Define Features & Target
X = df.drop(columns=['AQI'], errors='ignore')
y = df['AQI']
mask = ~y.isna()
X, y = X[mask], y[mask]


In [5]:
# 4Ô∏è‚É£ Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5Ô∏è‚É£ Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# 6Ô∏è‚É£ Train Models
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
lr.fit(X_train_scaled, y_train)
rf.fit(X_train, y_train)


In [7]:
# 7Ô∏è‚É£ Evaluate Models
def evaluate(model, X_t, y_t):
    preds = model.predict(X_t)
    mae = mean_absolute_error(y_t, preds)
    mse = mean_squared_error(y_t, preds)
    r2 = r2_score(y_t, preds)
    return mae, mse, r2

lr_mae, lr_mse, lr_r2 = evaluate(lr, X_test_scaled, y_test)
rf_mae, rf_mse, rf_r2 = evaluate(rf, X_test, y_test)

print("\nüìä MODEL PERFORMANCE")
print(f"Linear Regression: MAE={lr_mae:.2f}, MSE={lr_mse:.2f}, R2={lr_r2:.3f}")
print(f"Random Forest:     MAE={rf_mae:.2f}, MSE={rf_mse:.2f}, R2={rf_r2:.3f}")


üìä MODEL PERFORMANCE
Linear Regression: MAE=29.91, MSE=2965.68, R2=0.807
Random Forest:     MAE=19.96, MSE=1842.74, R2=0.880


In [8]:
# 8Ô∏è‚É£ Save the Best Model
best_model = rf if rf_r2 > lr_r2 else lr
joblib.dump(best_model, "air_quality_model.pkl")
print(f"\nüèÜ Best model saved as air_quality_model.pkl ({type(best_model).__name__})")


üèÜ Best model saved as air_quality_model.pkl (RandomForestRegressor)
