In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load and clean dataset
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed value types.
df = pd.read_csv("./datasets/data.csv", encoding="cp1252", low_memory=False)

# Drop the empty trailing columns when the export contains them; errors='ignore'
# keeps the cell runnable (no KeyError) when they are absent.
df = df.drop(columns=['Unnamed: 15', 'Unnamed: 16'], errors='ignore')


def _decimal_comma_to_float(values):
    """Parse a text column that uses ',' as the decimal mark into floats.

    Parameters
    ----------
    values : pandas.Series
        Text (object/string dtype) column to convert.

    Returns
    -------
    pandas.Series
        A float column when every non-missing value parses as a number;
        otherwise ``values`` itself, completely unchanged.
    """
    # Rewrite only real strings so cells that are already numbers (or NaN) survive as-is.
    normalized = values.map(lambda v: v.replace(',', '.') if isinstance(v, str) else v)
    try:
        return normalized.astype(float)
    except (ValueError, TypeError):
        # Not a numeric column (dates, times, free text): hand back the original
        # so its commas are not silently turned into periods.
        return values


# Convert decimal-comma text columns to numeric.
# Both checks are needed: object dtype for classic pandas, string dtype for
# installs where text columns use pandas' dedicated string type.
for col in df.columns:
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        df[col] = _decimal_comma_to_float(df[col])

# Parse dates as day-first; values that cannot be parsed become NaT.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)

# Drop rows with missing dates (optional)
df = df.dropna(subset=['Date'])

# Set plot style
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever darkgrid name this install has.
_darkgrid_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    None,
)
if _darkgrid_style is not None:
    plt.style.use(_darkgrid_style)
else:
    # Neither bundled name exists; seaborn's own darkgrid theme is the closest match.
    sns.set_style('darkgrid')

# 1. Time series of the selected pollutants, drawn on one shared axis
tracked_pollutants = ('CO(GT)', 'NO2(GT)', 'C6H6(GT)')

plt.figure(figsize=(14, 7))
trend_ax = plt.gca()
for name in tracked_pollutants:
    # One line per pollutant, labelled with its column name for the legend.
    trend_ax.plot(df['Date'], df[name], label=name)

trend_ax.set(
    xlabel="Date",
    ylabel="Pollutant Level",
    title="Pollutant Levels over Time",
)
trend_ax.legend()
plt.xticks(rotation=45)  # slant the date labels so they do not overlap
plt.show()

# 2. Monthly trend: mean of each selected pollutant per calendar month
monthly_columns = ['CO(GT)', 'NO2(GT)', 'C6H6(GT)']

# Tag every row with its calendar month (a pandas Period) and keep it on df.
month_of_reading = df['Date'].dt.to_period('M')
df['Month'] = month_of_reading
monthly_avg = df.groupby('Month')[monthly_columns].mean()

# DataFrame.plot opens its own figure and returns the axes it drew on.
monthly_ax = monthly_avg.plot(figsize=(14, 7), marker='o')
monthly_ax.set(
    xlabel="Month",
    ylabel="Average Pollutant Level",
    title="Monthly Average Pollutant Levels",
)
plt.xticks(rotation=45)
plt.show()

# 3. Box plot comparing the value distribution of each selected pollutant
plt.figure(figsize=(10, 6))
boxplot_columns = ['CO(GT)', 'NO2(GT)', 'C6H6(GT)']
# seaborn draws on the current axes and returns them, one box per column.
box_ax = sns.boxplot(data=df[boxplot_columns])
box_ax.set_title("Distribution of Pollutants")
box_ax.set_ylabel("Pollutant Level")
plt.show()

# 4. Scatter plot of CO against NO2 to eyeball how the two move together
plt.figure(figsize=(8, 6))
co_levels = df['CO(GT)']
no2_levels = df['NO2(GT)']
# Semi-transparent markers keep dense regions readable.
plt.scatter(co_levels, no2_levels, alpha=0.5, color='teal')
plt.gca().set(
    xlabel="CO Level",
    ylabel="NO2 Level",
    title="Scatter Plot of CO vs NO2 Levels",
)
plt.show()

# 5. Heatmap of pairwise correlations between pollutant and weather columns
plt.figure(figsize=(10, 8))
heatmap_columns = ['CO(GT)', 'NO2(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']
corr = df[heatmap_columns].corr()
# Annotate every cell with its coefficient, rounded to two decimals.
heat_ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
heat_ax.set_title("Correlation between Pollutants and Weather Conditions")
plt.show()


  df = pd.read_csv("./datasets/data.csv", encoding="cp1252")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435742 entries, 0 to 435741
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   stn_code                     291665 non-null  object 
 1   sampling_date                435739 non-null  object 
 2   state                        435742 non-null  object 
 3   location                     435739 non-null  object 
 4   agency                       286261 non-null  object 
 5   type                         430349 non-null  object 
 6   so2                          401096 non-null  float64
 7   no2                          419509 non-null  float64
 8   rspm                         395520 non-null  float64
 9   spm                          198355 non-null  float64
 10  location_monitoring_station  408251 non-null  object 
 11  pm2_5                        9314 non-null    float64
 12  date                         435735 non-null  object 
dtyp

KeyError: 'AQI'

<Figure size 1000x600 with 0 Axes>