In [1]:
pip install pandas matplotlib seaborn scikit-learn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.3.5/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [36]:
import pandas as pd
import os
from pathlib import Path

# 1. Resolve the directory that holds the per-city "*_combined.csv" files.
#    The AQI_DATA_DIR environment variable overrides the default location, so
#    the notebook can run on another machine without editing this cell.
data_dir = Path(os.environ.get("AQI_DATA_DIR", "/Users/dhruviyer/AQIEDA/data"))

# 2. Collect one DataFrame per city file
all_data = []

# 3. Process each CSV file.
#    glob() yields paths in arbitrary order; sorting makes the row order of the
#    combined frame (and the `df` left behind by the loop) reproducible.
for file_path in sorted(data_dir.glob("*_combined.csv")):
    # City name = text before the first underscore of the file stem, capitalized
    city_name = file_path.stem.split('_')[0].capitalize()

    # Read the file and tag every row with its city (as the second column)
    df = pd.read_csv(file_path)
    df.insert(1, 'City', city_name)

    all_data.append(df)

# NOTE: after the loop `df` still refers to the LAST file read only; later cells
# that use `df` therefore see a single file's rows, not `combined_df`.

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not all_data:
    raise FileNotFoundError(f"No '*_combined.csv' files found in {data_dir}")

# 4. Combine all DataFrames into one, renumbering the index from 0
combined_df = pd.concat(all_data, ignore_index=True)

# 5. Save merged data (relative path: written to the current working directory)
combined_df.to_csv("all_cities_aqi_combined.csv", index=False)
print("Combined data saved successfully!")

Combined data saved successfully!


In [41]:

# Quick look at the frame: first three rows, then the column names.
# NOTE(review): `df` here is whatever the loading loop assigned last (one file),
# not `combined_df` - confirm that is the intended subject of the analysis.
preview_rows = df.iloc[:3]
print(preview_rows)
print("\nColumns:", list(df.columns))


    Timestamp   City              Location   PM2.5    PM10     NO2    NH3  \
0  01-01-2020  Delhi  Delhi - Punjabi Bagh  420.68  507.60  105.21  63.96   
1  02-01-2020  Delhi  Delhi - Punjabi Bagh  364.73  480.09   79.38  51.28   
2  03-01-2020  Delhi  Delhi - Punjabi Bagh  227.72  309.23   57.22  44.29   

     SO2    CO    O3  
0   6.30  2.26  9.49  
1   9.69  2.87  8.03  
2  11.04  2.69  8.18  

Columns: ['Timestamp', 'City', 'Location', 'PM2.5', 'PM10', 'NO2', 'NH3', 'SO2', 'CO', 'O3']


In [42]:
# Parse the date strings with a day-first preference, then write them back as
# 'YYYY-MM-DD' text (the column stays string-typed, not datetime).
parsed_timestamps = pd.to_datetime(df['Timestamp'], dayfirst=True)
df['Timestamp'] = parsed_timestamps.dt.strftime('%Y-%m-%d')


In [43]:
# Count rows that are exact repeats of an earlier row (all columns equal).
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")


Duplicate rows: 0


In [53]:
# Re-inspect the frame: first three rows followed by the column list.
# NOTE(review): the recorded output shows an 'Area' column, but no cell visible
# in this notebook creates it - this preview depends on kernel state from
# elsewhere; confirm it survives Restart & Run All.
first_three = df.iloc[:3]
print(first_three)
print("\nColumns:", list(df.columns))

    Timestamp   City   PM2.5    PM10     NO2    NH3    SO2    CO    O3  \
0  2020-01-01  Delhi  420.68  507.60  105.21  63.96   6.30  2.26  9.49   
1  2020-01-02  Delhi  364.73  480.09   79.38  51.28   9.69  2.87  8.03   
2  2020-01-03  Delhi  227.72  309.23   57.22  44.29  11.04  2.69  8.18   

           Area PM2.5_Status  
0  Punjabi Bagh       Review  
1  Punjabi Bagh       Review  
2  Punjabi Bagh           OK  

Columns: ['Timestamp', 'City', 'PM2.5', 'PM10', 'NO2', 'NH3', 'SO2', 'CO', 'O3', 'Area', 'PM2.5_Status']


In [46]:
import numpy as np

In [48]:
# Label each PM2.5 reading:
#   missing (NaN) -> 'Missing'
#   > 500         -> 'Invalid'
#   > 300         -> 'Review'
#   otherwise     -> 'OK'
# NaN compares False against every threshold, so without the explicit isna()
# branch a missing reading would silently fall through to 'OK'.
# np.select picks the first matching condition, so order matters here.
pm25 = df['PM2.5']
df['PM2.5_Status'] = np.select(
    [pm25.isna(), pm25 > 500, pm25 > 300],
    ['Missing', 'Invalid', 'Review'],
    default='OK',
)


In [49]:
# Missing-value count per column.
missing_per_column = df.isna().sum()
print(missing_per_column)


Timestamp        0
City             0
PM2.5           14
PM10            20
NO2             21
NH3             23
SO2             43
CO              27
O3              20
Area             0
PM2.5_Status     0
dtype: int64


In [54]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
summary_stats = df.describe()
print(summary_stats)



             PM2.5         PM10          NO2          NH3          SO2  \
count  1813.000000  1807.000000  1806.000000  1804.000000  1784.000000   
mean    114.174567   212.470714    47.171755    23.287223    17.082618   
std      90.995321   120.481871    22.915488    10.500778    12.052564   
min      12.200000    27.100000     0.220000     0.100000     0.680000   
25%      49.280000   117.650000    29.220000    15.375000    10.177500   
50%      81.440000   188.220000    44.485000    22.805000    15.275000   
75%     151.780000   283.770000    61.325000    29.505000    20.762500   
max     676.100000   790.750000   204.200000    92.830000   127.410000   

                CO           O3  
count  1800.000000  1807.000000  
mean      1.223433    28.134715  
std       0.767546    20.265792  
min       0.070000     2.270000  
25%       0.707500    13.265000  
50%       0.985000    21.560000  
75%       1.540000    36.240000  
max       6.330000   122.100000  


In [55]:
# Mean PM2.5 per Area, highest first (left as the cell's last expression so the
# notebook displays it).
(
    df.groupby('Area')['PM2.5']
    .mean()
    .sort_values(ascending=False)
)


Area
Punjabi Bagh    114.174567
Name: PM2.5, dtype: float64

In [56]:
# Missing-value count per column, checked again before imputation.
nulls_by_column = df.isna().sum()
print(nulls_by_column)


Timestamp        0
City             0
PM2.5           14
PM10            20
NO2             21
NH3             23
SO2             43
CO              27
O3              20
Area             0
PM2.5_Status     0
dtype: int64


In [57]:
# Pollutant columns: replace each missing value with that column's median,
# computed over the whole frame.
num_cols = ['PM2.5', 'PM10', 'NO2', 'NH3', 'SO2', 'CO', 'O3']
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

# Text column: missing area names become the placeholder 'Unknown'.
df['Area'] = df['Area'].fillna('Unknown')

In [58]:
# Confirm the imputation left no missing values in any column.
remaining_nulls = df.isna().sum()
print(remaining_nulls)


Timestamp       0
City            0
PM2.5           0
PM10            0
NO2             0
NH3             0
SO2             0
CO              0
O3              0
Area            0
PM2.5_Status    0
dtype: int64
