<a href="https://colab.research.google.com/github/45-ak/CPS5004-Project/blob/main/CPS5004_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the dataset
# google.colab is only importable inside a Colab runtime.
from google.colab import files

# Ask the user to upload the CSV, then read it back by its fixed file name.
uploaded = files.upload()
df = pd.read_csv('AirQuality-Health-UK.csv', sep=",", header=0)

# Quick sanity check: dimensions, column names and the first few rows.
print(df.shape, df.columns.tolist(), df.head(), sep="\n")

# Column Mapping Helper
# Logical field name -> accepted header spellings, listed in priority order
# (the first spelling that exists in the DataFrame is the one used).
CANDIDATES = dict(
    date=["Date", "date", "DATE"],
    city=["City", "city", "location", "Location"],
    pm25=["PM2.5", "PM25", "pm25"],
    no2=["NO2", "no2"],
    o3=["O3", "o3"],
    temperature=["Temperature", "temperature", "Temp", "temp"],
    humidity=["Humidity", "humidity"],
    admissions=["HospitalAdmissions", "Admissions", "admissions", "Hospital_Admissions"],
    population=["Population", "population"],
)

def pick_column(df, options, required=True):
  """Return the first name in `options` that is a column of `df`.

  Args:
    df: Object exposing a `columns` collection (e.g. a pandas DataFrame).
    options: Candidate column names, checked in order.
    required: When True, a missing column is an error; when False, None is
      returned instead.

  Returns:
    The first matching column name, or None if nothing matched and
    `required` is False.

  Raises:
    KeyError: If no candidate is present and `required` is True.
  """
  present = [name for name in options if name in df.columns]
  if present:
    return present[0]
  if not required:
    return None
  raise KeyError(f"None of the expected columns found: {options}")

# Resolve each logical field to the actual column name in df.
# Temperature and humidity are optional (resolved to None when absent);
# every other field must exist or pick_column raises KeyError.
COL = {
    field: pick_column(df, aliases, required=field not in ('temperature', 'humidity'))
    for field, aliases in CANDIDATES.items()
}
COL



# Describe the dataset

# Pre-Processing: Missing Values, Outliers, Datetime and season

# Remove rows where any of the required fields is missing.
key_fields = [COL['date'], COL['city'], COL['pm25'], COL['no2'], COL['o3'], COL['admissions'], COL['population']]
df = df.dropna(subset=key_fields).copy()

# Optional fill temperature/humidity

# Outlier handling using z-score:
# readings with |z| >= 4 are left out when estimating the 1st/99th
# percentiles, then every reading is clipped into that percentile range.
for pollutant in [COL['pm25'], COL['no2'], COL['o3']]:
  z = np.abs(stats.zscore(df[pollutant].astype(float)))
  cap = df[pollutant][z < 4]
  lower, upper = cap.quantile(0.01), cap.quantile(0.99)
  df[pollutant] = df[pollutant].clip(lower, upper)

# Convert date to datetime.
# NOTE(review): dates in this UK dataset look like DD/MM/YYYY (e.g. 01/01/2023)
# -- confirm against the CSV. Without dayfirst=True pandas reads them
# month-first, so days > 12 either become NaT or day and month get swapped.
df[COL['date']] = pd.to_datetime(df[COL['date']], errors='coerce', dayfirst=True)

# errors='coerce' hides parse failures as NaT; surface them so they are not
# silently carried into Month/Season as missing values.
unparsed_dates = int(df[COL['date']].isna().sum())
if unparsed_dates:
  print(f"Warning: {unparsed_dates} date value(s) could not be parsed and are NaT.")

# Extracting month and season (meteorological seasons: Dec-Feb, Mar-May,
# Jun-Aug, Sep-Nov).
df['Month'] = df[COL['date']].dt.month
SEASON_MAP = {12: 'Winter', 1: 'Winter', 2: 'Winter',
              3: 'Spring', 4: 'Spring', 5: 'Spring',
              6: 'Summer', 7: 'Summer', 8: 'Summer',
              9: 'Autumn', 10: 'Autumn', 11: 'Autumn'}
df['Season'] = df['Month'].map(SEASON_MAP)

display(df[[COL['date'], 'Month', 'Season']].head())
print("Preprocessing completed. Rows remaining:", len(df))



(1825, 10)
['City', 'Date', 'PM2.5', 'NO2', 'O3', 'Temperature', 'Humidity', 'Hospital_Admissions', 'Population', 'Season']
         City        Date  PM2.5    NO2     O3  Temperature  Humidity  \
0      London  01/01/2023  14.85  29.55  30.31         9.18     86.21   
1  Manchester  01/01/2023  17.81  24.89  29.09         8.40     57.38   
2  Birmingham  01/01/2023  17.40  20.47  29.33        14.25     76.80   
3     Glasgow  01/01/2023  10.73  38.97  25.60        10.99     55.99   
4   Liverpool  01/01/2023   8.36  29.89  18.47        16.80     78.19   

   Hospital_Admissions  Population  Season  
0                   17      746103  Winter  
1                   14      534097  Winter  
2                    6      630659  Winter  
3                   12      830467  Winter  
4                   13      948429  Winter  


Unnamed: 0,Date,Month,Season
0,2023-01-01,1.0,Winter
1,2023-01-01,1.0,Winter
2,2023-01-01,1.0,Winter
3,2023-01-01,1.0,Winter
4,2023-01-01,1.0,Winter


Preprocessing completed. Rows remaining: 1825
