In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Default (width, height) applied to every matplotlib figure drawn later.
plt.rcParams["figure.figsize"] = (10, 4)

# Load the raw export: fields are separated by ";" and decimals use ",".
df_raw = pd.read_csv("AirQualityUCI.csv", sep=";", decimal=",", encoding="latin1")

print("Raw shape:", df_raw.shape)

# Discard every column whose name contains "Unnamed"; errors="ignore" keeps
# the call harmless when there is nothing to remove.
unnamed_cols = [name for name in df_raw.columns if "Unnamed" in name]
df = df_raw.drop(columns=unnamed_cols, errors="ignore")

# HANDLING MISSING DATA
# Every cell equal to -200 is treated as a missing reading.
df = df.replace(to_replace=-200, value=np.nan)

# Combine the Date and Time text columns into one datetime column; values that
# do not match the day/month/year hour.minute.second layout become NaT.
stamp_text = df["Date"] + " " + df["Time"]
df["timestamp"] = pd.to_datetime(
    stamp_text, format="%d/%m/%Y %H.%M.%S", errors="coerce"
)

# Remove rows without a usable timestamp, order chronologically, promote the
# timestamp to the index and discard the now-redundant Date/Time columns.
df = (
    df.dropna(subset=["timestamp"])
    .sort_values("timestamp")
    .set_index("timestamp")
    .drop(columns=["Date", "Time"])
)

print("After timestamp + sorting:", df.shape)

# DROP NMHC(GT)
# errors="ignore" makes this a no-op when the column is absent.
df = df.drop(columns=["NMHC(GT)"], errors="ignore")

# IMPUTE REMAINING NaNs
# Only numeric columns are imputed; each gap receives its own column's median.
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_part = df[numeric_cols]

# Median imputation
medians = numeric_part.median()
df[numeric_cols] = numeric_part.fillna(medians)

# CREATE DERIVED TIME FEATURES
# Calendar parts are read straight from the datetime index.
stamps = df.index
df["hour"] = stamps.hour
df["weekday"] = stamps.weekday  # 0 = Monday, 6 = Sunday
df["month"] = stamps.month
df["is_weekend"] = df["weekday"].ge(5).astype(int)  # 1 for Saturday/Sunday

# Cyclical encoding
# Map each periodic feature onto the unit circle so the last value of a cycle
# sits next to the first (e.g. hour 23 next to hour 0).
for feature, period in (("hour", 24), ("weekday", 7)):
    angle = 2 * np.pi * df[feature] / period
    df[f"{feature}_sin"] = np.sin(angle)
    df[f"{feature}_cos"] = np.cos(angle)

# LAGS & MOVING AVERAGES
# Keep only the pollutant columns that are actually present in the frame.
pollutants = [
    name
    for name in ("CO(GT)", "C6H6(GT)", "NOx(GT)", "NO2(GT)")
    if name in df.columns
]

lags = [1, 6, 12, 24]  # lag offsets, labelled in hours
ma_windows = [6, 24]  # moving-average window lengths, labelled in hours

# NOTE(review): shift() and rolling() below count rows, not elapsed time, so
# the "h" suffix is only accurate if rows are exactly one hour apart — confirm.
for pollutant in pollutants:
    series = df[pollutant]

    for offset in lags:
        df[f"{pollutant}_lag_{offset}h"] = series.shift(offset)

    for width in ma_windows:
        # min_periods=1 lets the earliest rows average whatever history exists
        # instead of producing NaN.
        df[f"{pollutant}_ma_{width}h"] = series.rolling(
            window=width, min_periods=1
        ).mean()


# SCALING FEATURES
# Re-select the numeric columns so the features engineered above are included.
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Standardise every numeric column in place with one scaler fitted on the
# whole frame.
# NOTE(review): the statistics are fitted on all rows, including the rows that
# dropna() removes just below — confirm this is intended before reusing
# `scaler` on a separate train/test split.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Rows that still contain NaN (e.g. the leading rows of the shifted lag
# columns) are excluded from the modelling frame.
df_model = df.dropna()

print("Final modelling dataset shape:", df_model.shape)
# A bare `df_model.head()` here is discarded because it is not the last
# expression of the cell; print it so the preview is actually shown.
print(df_model.head())

# Persist the cleaned frame; index=True keeps the timestamp index as a column.
df_model.to_csv("AirQuality_cleaned.csv", index=True)

Raw shape: (9471, 17)
After timestamp + sorting: (9357, 13)
Final modelling dataset shape: (9333, 44)
