In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 1. Load Data
df = pd.read_csv("datasets/box-hill-air-quality.csv")

# 2. Inspect Data
print(df.head())
print(df.info())

# 3. Preprocess Data
# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

# Fill missing values with column mean
for col in [' pm25', ' pm10', ' o3', ' no2', ' so2', ' co']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].mean())

# 4. Create lag features for time series prediction
# Predict pm25 for next day using previous 3 days
for lag in range(1, 4):
    df[f'pm25_lag{lag}'] = df[' pm25'].shift(lag)

df = df.dropna()  # Drop rows with NaN after shifting

features = [f'pm25_lag{lag}' for lag in range(1, 4)] + [' pm10', ' o3', ' no2', ' so2', ' co']
target = ' pm25'

X = df[features]
y = df[target]

# 5. Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 6. Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. Train and Compare Models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42)
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} - MSE: {mse:.2f}, R2: {r2:.2f}")

# 8. Predict Future pm25 with Each Model
last_row = df.iloc[-1]
future_features = [last_row[f'pm25_lag{lag}'] for lag in range(1, 4)] + [last_row[' pm10'], last_row[' o3'], last_row[' no2'], last_row[' so2'], last_row[' co']]
future_features_scaled = scaler.transform([future_features])
for name, model in models.items():
    future_pm25 = model.predict(future_features_scaled)
    print(f"{name} predicted future pm25: {future_pm25[0]:.2f}")

       date  pm25  pm10   o3  no2  so2  co
0  2025/8/1    27    13   13   13        3
1  2025/8/2    32    20   16   12        4
2  2025/8/3    51    11   22    8        1
3  2025/8/4    18    12   22    6         
4  2025/8/5     8    14   18   10        1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3859 entries, 0 to 3858
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3859 non-null   object
 1    pm25   3859 non-null   object
 2    pm10   3859 non-null   object
 3    o3     3859 non-null   object
 4    no2    3859 non-null   object
 5    so2    3859 non-null   object
 6    co     3859 non-null   object
dtypes: object(7)
memory usage: 211.2+ KB
None
LinearRegression - MSE: 62.46, R2: 0.13
RandomForest - MSE: 59.23, R2: 0.17
RandomForest - MSE: 59.23, R2: 0.17
GradientBoosting - MSE: 57.62, R2: 0.19
LinearRegression predicted future pm25: 21.20
RandomForest predicted future pm25: 15.77
GradientBoosting predicte



In [8]:
print(df.columns)

Index(['date', ' pm25', ' pm10', ' o3', ' no2', ' so2', ' co'], dtype='object')
