In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import warnings

In [5]:
df = pd.read_csv("output.csv")
df.head()

Unnamed: 0,co_gt,s1_co,nhmc_gt,c6h6_gt,s2_nhmc,nox_gt,s3_nox,no2_gt,s4_no2,s5_o3,temp,rh,ah
0,0.396825,0.639299,0.220587,11.9,0.529215,0.19092,0.5688,0.438735,0.57878,0.504261,0.333333,0.489,0.287139
1,0.301587,0.576568,0.174091,9.4,0.453255,0.117579,0.6632,0.355731,0.508223,0.35589,0.326882,0.477,0.270955
2,0.333333,0.678044,0.144726,9.0,0.4399,0.150175,0.636,0.442688,0.506101,0.407018,0.296774,0.54,0.283331
3,0.333333,0.654059,0.134937,9.2,0.447412,0.197905,0.5976,0.474308,0.521485,0.471679,0.277419,0.6,0.301618
4,0.238095,0.558118,0.099454,6.5,0.353923,0.150175,0.688,0.450593,0.471618,0.425063,0.28172,0.596,0.30267


In [9]:
NUMERICAL_COLS = ["co_gt", "s1_co", "nhmc_gt", "c6h6_gt", "s2_nhmc", "nox_gt", "s3_nox", "no2_gt", "s4_no2", "s5_o3", "temp", "rh", "ah"]
FEATURES = ["co_gt", "s1_co", "nhmc_gt", "s2_nhmc", "nox_gt", "s3_nox", "no2_gt", "s4_no2", "s5_o3", "temp", "rh", "ah"]
TARGET = ["c6h6_gt"]
MODELS = ["SVM", "RF"]
K_FOLDS = 5

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df[FEATURES], df[TARGET], test_size=0.25, random_state=42)

In [None]:
# Feature Selection

In [None]:
'''
Takes dataframes `X_train` and `y_train`. Performs hyperparameter tuning for models with GridSearchCV.
Retrains all models with the best hyperparameter combinations, and returns a dictionary containing these trained models.
'''
def tune_hyperparams(X_train, y_train):
    warnings.simplefilter(action='ignore', category=FutureWarning)
    
    # SVMs
    svm_params = {
        'C': [0.001, 0.01, 1, 100],
        'kernel': ['linear', 'rbf']
    }

    svm_clf = GridSearchCV(
        estimator=SVC(class_weight='balanced'),
        param_grid=svm_params,
        cv=K_FOLDS,
        refit=True,
        n_jobs=1,
        verbose=2,
        scoring='f1_weighted'
    )

    svm_clf.fit(X_train, y_train)
    
    # RF
    rf_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 50, 100]
    }

    rf_clf = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced'),
        param_grid=rf_params,
        cv=K_FOLDS,
        refit=True,
        n_jobs=1,
        verbose=2,
        scoring='f1_weighted'
    )

    rf_clf.fit(X_train, y_train)
    
    model_dict = {"RF": rf_clf}
    
    return model_dict

In [None]:
model_dict = tune_hyperparams(X_train, y_train)

In [None]:
# Print the best hyperparameters and best scores for each of the models
for model in MODELS:
    print(model_dict[model].best_estimator_)
    print("best_params: ", model_dict[model].best_params_)
    print("best_score: ", model_dict[model].best_score_)

In [None]:
# Retrain models on the entire training set and test on testing set
for m in MODELS:
    final_scores = {}

    if m == '':
        model = ()
    
    model.fit(X_train, y_train)
    y_pred = model.predict(x_test)

    score = score_fn(y_test, y_pred)

    final_scores[model] = score

In [None]:
print(final_scores)