## Library Import

In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

## Load Dataset

In [2]:
df = pd.read_csv("air_quality_clean.csv", index_col="Datetime", parse_dates=True)

## Classification

In [3]:
horizons = [1, 6, 12, 24]
train_ratio = 0.7
N = len(df)
X = df.copy().drop(columns=["CO_class"])
results = []

for h in horizons:
    print(f"\n=== Model training for horizon: {h} ===")
    df[f"y_{h}"] = df["CO_class"].shift(-h)
    N_eff = N - h
    train_n = int(N_eff * train_ratio)

    y = df[f"y_{h}"]

    X_train = X[:train_n]
    y_train = y[:train_n]

    X_test = X[train_n:N_eff]
    y_test = y[train_n:N_eff]

    # Logistic Regression
    # print("Logistic Regression model training...")
    # lr_classifier = Pipeline([
    #     ("scaler", StandardScaler()),
    #     ("clf", LogisticRegression(
    #         multi_class="multinomial",
    #         max_iter=1000,
    #         random_state=42
    #     ))
    # ])
    # start_training_time = time.perf_counter()
    # lr_classifier.fit(X_train, y_train)
    # end_training_time = time.perf_counter()
    # training_time = end_training_time - start_training_time
    # lr_classifier_pred = lr_classifier.predict(X_test)
    # lr_classifier_acc = accuracy_score(y_test, lr_classifier_pred)
    # print(f"Finished using Logistic Regression model! Training time: {training_time:.4f}s, Accuracy: {lr_classifier_acc:.4f}")
    lr_classifier_acc = 0
    

    # Random Forest
    print("Random Forest model training...")
    rf_classifier = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=42
    )
    start_training_time = time.perf_counter()
    rf_classifier.fit(X_train, y_train)
    end_training_time = time.perf_counter()
    training_time = end_training_time - start_training_time
    rf_classifier_pred = rf_classifier.predict(X_test)
    rf_classifier_acc = accuracy_score(y_test, rf_classifier_pred)
    print(f"Finished using Random Forest model! Training time: {training_time:.4f}s, Accuracy: {rf_classifier_acc:.4f}")

    # Gradient Boosting
    # print("Gradient Boosting model training...")
    # gb_classifier = GradientBoostingClassifier(
    #     n_estimators=200,
    #     learning_rate=0.05,
    #     random_state=42
    # )
    # start_training_time = time.perf_counter()
    # gb_classifier.fit(X_train, y_train)
    # end_training_time = time.perf_counter()
    # training_time = end_training_time - start_training_time
    # gb_classifier_pred = gb_classifier.predict(X_test)
    # gb_classifier_acc = accuracy_score(y_test, gb_classifier_pred)
    # print(f"Finished using Gradient Boosting model! Training time: {training_time:.4f}s, Accuracy: {gb_classifier_acc:.4f}")
    gb_classifier_acc = 0

    # Decision Tree
    print("Decision Tree model training...")
    dt_classifier = DecisionTreeClassifier(
        max_depth=6,         # you can tune this
        min_samples_leaf=10, # to avoid overfitting
        random_state=42
    )
    start_training_time = time.perf_counter()
    dt_classifier.fit(X_train, y_train)
    end_training_time = time.perf_counter()
    training_time = end_training_time - start_training_time
    dt_classifier_pred = dt_classifier.predict(X_test)
    dt_classifier_acc = accuracy_score(y_test, dt_classifier_pred)
    print(f"Finished using Decision Tree model! Training time: {training_time:.4f}s, Accuracy: {dt_classifier_acc:.4f}")

    # Support Vector Machine
    # print("Support Vector Machine model training...")
    # svm_classifier = Pipeline([
    #     ("scaler", StandardScaler()),  # SVM needs scaling
    #     ("clf", SVC(
    #         kernel="rbf",
    #         C=1.0,
    #         gamma="scale",
    #         probability=False,  # True if you need predict_proba
    #         random_state=42
    #     ))
    # ])
    # start_training_time = time.perf_counter()
    # svm_classifier.fit(X_train, y_train)
    # end_training_time = time.perf_counter()
    # training_time = end_training_time - start_training_time
    # svm_classifier_pred = svm_classifier.predict(X_test)
    # svm_classifier_acc = accuracy_score(y_test, svm_classifier_pred)
    # print(f"Finished using Support Vector Machine model! Training time: {training_time:.4f}s, Accuracy: {svm_classifier_acc:.4f}")
    svm_classifier_acc = 0

    # Baseline Accuracy
    baseline_acc = accuracy_score(y_test, df["CO_class"][train_n:N_eff])
    print(f"Baseline Accuracy:{baseline_acc:.4f}")
    
    results.append([
        h,
        lr_classifier_acc,
        rf_classifier_acc,
        gb_classifier_acc,
        dt_classifier_acc,
        svm_classifier_acc,
        baseline_acc
    ])


=== Model training for horizon: 1 ===
Random Forest model training...
Finished using Random Forest model! Training time: 3.7742s, Accuracy: 0.7854
Decision Tree model training...
Finished using Decision Tree model! Training time: 0.0899s, Accuracy: 0.7675
Baseline Accuracy:0.7661

=== Model training for horizon: 6 ===
Random Forest model training...
Finished using Random Forest model! Training time: 4.0782s, Accuracy: 0.6238
Decision Tree model training...
Finished using Decision Tree model! Training time: 0.0853s, Accuracy: 0.5527
Baseline Accuracy:0.4116

=== Model training for horizon: 12 ===
Random Forest model training...
Finished using Random Forest model! Training time: 4.0752s, Accuracy: 0.5810
Decision Tree model training...
Finished using Decision Tree model! Training time: 0.0857s, Accuracy: 0.4784
Baseline Accuracy:0.4097

=== Model training for horizon: 24 ===
Random Forest model training...
Finished using Random Forest model! Training time: 4.2430s, Accuracy: 0.5764
Deci

## Results

In [4]:
df_results = pd.DataFrame(
    results,
    columns=[
        "Horizon",
        "Logistic Regression Accuracy",
        "Random Forest Accuracy",
        "Gradient Boosting Accuracy",
        "Decision Tree Accuracy",
        "SVM Accuracy",
        "Baseline Accuracy"
    ]
)

df_results

Unnamed: 0,Horizon,Logistic Regression Accuracy,Random Forest Accuracy,Gradient Boosting Accuracy,Decision Tree Accuracy,SVM Accuracy,Baseline Accuracy
0,1,0,0.785357,0,0.7675,0,0.766071
1,6,0,0.623794,0,0.552697,0,0.411576
2,12,0,0.58098,0,0.47837,0,0.409725
3,24,0,0.576441,0,0.567848,0,0.602578
