In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Read the raw air-quality CSV into a DataFrame.
raw_csv_path = "../data/globalAirQuality.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the raw dataset: schema, first rows, shape and column labels.
print("info")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()

print("first five rows")
print(df.head())

print("rows x columns")
print(df.shape)

print("columns present in the dataset")
print(df.columns)

In [None]:
# Keep only the columns used by the rest of the notebook.
# The explicit copy makes df_main independent of df, so later column
# assignments on df_main do not touch the raw frame.
main_columns = ["timestamp", "country", "city", "pm25", "aqi"]
df_main = df.loc[:, main_columns].copy()

# Quick verification of the reduced frame.
print("Reduced Dataset")
print(df_main.head())

print("Shape of the reduced dataset")
print(df_main.shape)

print("Columns")
print(df_main.columns)

In [None]:
# Data cleaning

# timestamp: parse into a datetime dtype, printing the dtype before and after.
timestamp_col = "timestamp"
print(df_main[timestamp_col].dtype)

parsed_timestamps = pd.to_datetime(df_main[timestamp_col])
df_main[timestamp_col] = parsed_timestamps

print(df_main[timestamp_col].dtype)

# Report the number of fully duplicated rows (nothing is dropped here).
duplicate_row_count = df_main.duplicated().sum()
print(duplicate_row_count)

# Report missing values per column (nothing is filled or dropped here).
missing_per_column = df_main.isna().sum()
print(missing_per_column)

# Standardize text columns: country in upper case, city in title case.
df_main = df_main.assign(
    country=df_main["country"].str.upper(),
    city=df_main["city"].str.title(),
)

df_main

In [None]:
# Feature Engineering
#
# Derives three columns on df_main:
#   air_quality_category - label for the aqi value
#   pm25_level           - label for the pm25 value
#   high_pollution       - True where pm25 is strictly above `threshold`
#
# The labels are assigned with pd.cut instead of a row-wise apply with
# chained comparisons. Besides being vectorised, this keeps a missing
# aqi/pm25 value missing in the derived column; with the chained
# comparisons every `NaN <= k` test is False, so a missing reading would
# have been labelled "Very Poor" / "High".

# Each band is right-closed: a value equal to an edge gets the lower label
# (aqi == 50 -> "Good", aqi == 100 -> "Moderate", aqi == 150 -> "Poor").
aqi_edges = [-np.inf, 50, 100, 150, np.inf]
aqi_labels = ["Good", "Moderate", "Poor", "Very Poor"]
df_main["air_quality_category"] = pd.cut(
    df_main["aqi"],
    bins=aqi_edges,
    labels=aqi_labels,
).astype(object)  # plain labels rather than a categorical dtype

# Single source of truth for the pm25 cut-off shared by the "Medium"/"High"
# boundary and the high_pollution flag.
threshold = 35
pm25_edges = [-np.inf, 12, threshold, np.inf]
pm25_labels = ["Low", "Medium", "High"]
df_main["pm25_level"] = pd.cut(
    df_main["pm25"],
    bins=pm25_edges,
    labels=pm25_labels,
).astype(object)

# Element-wise comparison already yields a boolean column; a missing pm25
# compares as False.
df_main["high_pollution"] = df_main["pm25"] > threshold

print(df_main.head())

# sanity check (dropna=False so unlabelled rows, if any, are visible)
print(df_main["air_quality_category"].value_counts(dropna=False))
print(df_main["pm25_level"].value_counts(dropna=False))
print(df_main["high_pollution"].value_counts())

In [None]:
# Data Analysis
#
# Which countries have the highest air pollution on average?
# One row per country with mean pm25, mean aqi and the number of
# high_pollution rows, ordered from highest to lowest mean pm25.
# pollution_rank is a dense rank on avg_pm25 where the highest mean gets
# rank 1 and tied means share a rank.
country_summary = (
    df_main
    .groupby("country")
    .agg(
        avg_pm25=("pm25", "mean"),
        avg_aqi=("aqi", "mean"),
        high_pollution_count=("high_pollution", "sum"),
    )
    .reset_index()
    .sort_values(by="avg_pm25", ascending=False)
    .assign(
        pollution_rank=lambda summary: summary["avg_pm25"].rank(
            method="dense", ascending=False
        )
    )
)

print(country_summary)

In [None]:
# City-wise analysis
#
# One row per (country, city) with mean pm25, mean aqi and the number of
# high_pollution rows, ordered from highest to lowest mean pm25.
# Grouping on country as well as city keeps cities that share a name in
# different countries as separate rows instead of averaging them together;
# it also carries the country into the summary.
city_summary = (
    df_main.groupby(["country", "city"]).agg(
        avg_pm25 = ("pm25","mean"),
        avg_aqi = ("aqi","mean"),
        high_pollution_count = ("high_pollution","sum")
    )
    .reset_index()
)

city_summary = city_summary.sort_values(
    by = "avg_pm25",
    ascending = False
)

# Dense rank on avg_pm25: the highest mean gets rank 1, ties share a rank.
city_summary["pollution_rank"] = city_summary["avg_pm25"].rank(
    method = "dense",
    ascending = False
)
print(city_summary.head())


# The frame is already sorted, so the first ten rows are the ten cities
# with the highest mean pm25.
top_10_cities = city_summary.head(10)

print(top_10_cities)

In [None]:
# Data visualization
#
# Horizontal bar chart of mean pm25 for the rows in top_10_cities.
# A per-column histogram of a ten-row summary (which would also bin the
# rank column) does not show which city is which, so the cities are
# plotted as labelled bars instead.
ax = top_10_cities.plot.barh(
    x="city",
    y="avg_pm25",
    legend=False,
    figsize=(8, 5),
    title="Top 10 cities by average PM2.5",
)
ax.set_xlabel("Average PM2.5")
ax.set_ylabel("City")
# barh draws the first row at the bottom; flip the axis so the first row
# of top_10_cities (highest mean pm25) is at the top.
ax.invert_yaxis()

In [None]:
# Output: write the three summary tables as CSV files (row index omitted).
# to_csv does not create missing parent directories, so make sure the
# target folder exists first; exist_ok keeps re-runs from failing.
output_dir = Path("../output")
output_dir.mkdir(parents=True, exist_ok=True)

country_summary.to_csv(output_dir / "country_pollution_summary.csv", index=False)

city_summary.to_csv(output_dir / "city_pollution_summary.csv", index=False)

top_10_cities.to_csv(output_dir / "top_10_cities.csv", index=False)
