In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the raw air-quality CSV, decoding its bytes as Windows-1252 (cp1252).
DATA_PATH = "airquality_data.csv"
df = pd.read_csv(DATA_PATH, encoding='cp1252')


  df = pd.read_csv("airquality_data.csv", encoding='cp1252')


DATA CLEANING

In [None]:
# Parse both date columns as datetimes. With errors='coerce', any value that
# cannot be parsed becomes NaT instead of raising.
for date_col in ('sampling_date', 'date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')


In [6]:
# Drop rows that exactly duplicate an earlier row.
df = df.drop_duplicates()

# Missing categorical values are replaced with the placeholder label 'Unknown'.
categorical_cols = [
    'stn_code', 'state', 'location',
    'agency', 'type', 'location_monitoring_station',
]
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Missing pollutant readings are replaced with that column's own median.
numerical_cols = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']
for pollutant in numerical_cols:
    readings = df[pollutant]
    df[pollutant] = readings.fillna(readings.median())


DATA INTEGRATION

In [7]:
# Mean pollutant level for every (location, date) pair. groupby() leaves out
# rows whose key is missing (for example a 'date' that was coerced to NaT), so
# df_summary has no entry for such rows.
df_summary = df.groupby(['location', 'date'])[numerical_cols].mean().reset_index()

# Attach the per-group means to every original row as '<pollutant>_avg'.
# how='left' keeps every row of df: the default inner join would silently
# discard rows whose (location, date) key is absent from df_summary; those rows
# now get NaN in the '_avg' columns instead.
# validate='many_to_one' makes pandas check that df_summary has one row per key.
df = df.merge(
    df_summary,
    on=['location', 'date'],
    how='left',
    suffixes=('', '_avg'),
    validate='many_to_one',
)

DATA TRANSFORMATION

In [8]:
# Creating a new column for AQI (Air Quality Index) approximation
def calculate_aqi(row):
    """Return a weighted sum of the 'so2', 'no2' and 'rspm' entries of ``row``.

    The weights are 0.2 for 'so2', 0.3 for 'no2' and 0.5 for 'rspm'; this is
    an example formula, not an official AQI definition.

    ``row`` only needs to support lookup by those three labels.
    """
    so2_term = row['so2'] * 0.2
    no2_term = row['no2'] * 0.3
    rspm_term = row['rspm'] * 0.5
    return so2_term + no2_term + rspm_term  # Example formula

# calculate_aqi() only looks values up by column label and combines them with
# arithmetic, so passing the whole frame evaluates the formula column-wise in
# one vectorised pass. The values match a row-by-row df.apply(..., axis=1)
# without the per-row Python call overhead.
df['AQI'] = calculate_aqi(df)

# Normalize numerical values
def normalize(column):
    """Min-max scale a numeric pandas Series.

    Parameters
    ----------
    column : pandas.Series
        Numeric values. Missing entries are skipped by min()/max() and stay
        missing in the result.

    Returns
    -------
    pandas.Series
        ``(column - min) / (max - min)``. When every non-missing value is
        identical (max == min), those values map to 0.0 instead of the NaN
        that a 0/0 division would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    shifted = column - lowest
    if span == 0:
        # Constant column: dividing by the zero range would turn every row into NaN.
        return shifted.astype('float64')
    return shifted / span

# Rescale each pollutant column independently with normalize().
for pollutant in numerical_cols:
    df[pollutant] = normalize(df[pollutant])


ERROR CORRECTING

In [9]:
# Removing extreme outliers (values beyond 99th percentile)
def remove_outliers(column, upper_quantile=0.99):
    """Cap values above an upper quantile of ``column`` at that quantile.

    Despite the name, nothing is removed: the returned Series has the same
    length and index as the input, with large values replaced by the limit.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to cap.
    upper_quantile : float, default 0.99
        Quantile of ``column`` used as the upper limit.

    Returns
    -------
    pandas.Series
        ``column`` with every value greater than the limit set to the limit.
        Missing values stay missing. A per-element ``x <= limit`` test is
        False for NaN and would overwrite NaN with the limit.
    """
    upper_limit = column.quantile(upper_quantile)
    # clip() is vectorised and leaves NaN entries untouched.
    return column.clip(upper=upper_limit)

# Cap each pollutant column at its own upper limit via remove_outliers().
# NOTE(review): this runs after 'AQI' was computed and after the min-max
# normalisation, so neither of those steps sees the capped values. Confirm
# that this ordering is intended.
for pollutant in numerical_cols:
    df[pollutant] = remove_outliers(df[pollutant])

In [None]:
VISUALIZATIONS

In [None]:

# AQI over time, one line per state, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df, x='date', y='AQI', hue='state', alpha=0.7, ax=ax)
ax.set_title('Time-Series Trend of AQI Across States')
ax.set_xlabel('Date')
ax.set_ylabel('AQI')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(loc='upper right')
plt.show()



In [None]:
# Mean of each pollutant column per state, drawn as one stacked bar per state.
df_grouped = df.groupby('state')[numerical_cols].mean()

# Draw on an explicitly created Axes. DataFrame.plot() opens its own figure
# when no `ax` is passed, so a preceding bare plt.figure() call would be left
# behind as an empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
df_grouped.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Average Pollutant Levels by State')
ax.set_xlabel('State')
ax.set_ylabel('Pollutant Concentration')
ax.tick_params(axis='x', labelrotation=90)
ax.legend(title='Pollutants')
plt.show()

# sns.pairplot() is a figure-level function: it builds its own figure and grid
# of axes, so no plt.figure() call is made here (it would only leave an empty
# figure behind).
pair_grid = sns.pairplot(df[numerical_cols])
# Title the whole grid. plt.title() would label only a single subplot.
# y slightly above 1 places the title just over the top row of panels.
pair_grid.fig.suptitle('Pairplot of Pollutant Levels', y=1.02)
plt.show()

# Pairwise correlation between the pollutant columns, shown as an annotated heatmap.
correlations = df[numerical_cols].corr()
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Pollutants')
plt.show()
# Write the processed frame to CSV without the row index, then report completion.
output_path = 'cleaned_airquality.csv'
df.to_csv(output_path, index=False)

completion_message = "Data Cleaning and Transformation Completed! Cleaned dataset saved as 'cleaned_airquality.csv'"
print(completion_message)
