In [1]:
import os

repo_path = 'Air_Quality_Regression'

if not os.path.isdir(repo_path):
  !git clone https://github.com/Andreluis2001/Air_Quality_Regression.git

Cloning into 'Air_Quality_Regression'...
remote: Enumerating objects: 136, done.[K
remote: Counting objects: 100% (136/136), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 136 (delta 43), reused 99 (delta 15), pack-reused 0 (from 0)[K
Receiving objects: 100% (136/136), 2.39 MiB | 17.83 MiB/s, done.
Resolving deltas: 100% (43/43), done.


In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from Air_Quality_Regression.Air_Quality_temperature_predictor.utils import split_stratified
from Air_Quality_Regression.Air_Quality_temperature_predictor.data_preprocessing import DataTransformer
from Air_Quality_Regression.Air_Quality_temperature_predictor.model_train import grid_search_model
from Air_Quality_Regression.Air_Quality_temperature_predictor.model_test import test_model_cv

In [3]:
dataframe = pd.read_excel('Air_Quality_Regression/data/AirQualityUCI.xlsx')
dataframe.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [4]:
train_data, test_data = split_stratified(dataframe)

(9357, 16)


In [5]:
num_cols = train_data.select_dtypes(include=np.number).columns.drop('T')
cat_cols = train_data.select_dtypes(exclude=np.number).columns

numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean', missing_values=-200)),
        ('scaler', StandardScaler())
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ('transformer', DataTransformer()),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

full_pipeline = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, num_cols),
        ('categorical', categorical_pipeline, cat_cols)
    ]
)

In [6]:
random_forest_regressor = Pipeline(
    steps=[
        ('preprocessing', full_pipeline),
        ('model', RandomForestRegressor())
    ]
)

decision_tree_regressor = Pipeline(
    steps=[
        ('preprocessing', full_pipeline),
        ('model', DecisionTreeRegressor())
    ]
)

xgb_regressor = Pipeline(
    steps=[
        ('preprocessing', full_pipeline),
        ('model', XGBRegressor())
    ]
)

models = [decision_tree_regressor, random_forest_regressor, xgb_regressor]

In [7]:
tree_regressor_grid = {
    'model__criterion': ['squared_error', 'absolute_error'],
    'model__splitter': ['best', 'random'],
    'model__max_depth': [None],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [1],
    'model__ccp_alpha': [0.0, 0.2, 0.5],
}

rdf_regressor_grid = {
    'model__n_estimators': [50, 100],
    'model__max_depth': [None],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [1],
    'model__max_features': [1.0, 'sqrt', 'log2']
}

xgb_regressor_grid = {
    'model__n_estimators': [200, 300, 500],
    'model__max_depth': [None],
    'model__learning_rate': [0.1, 0.2, 0.3]
}

param_grids = [tree_regressor_grid, rdf_regressor_grid, xgb_regressor_grid]

In [8]:
X_train, y_train = train_data.drop(['T'], axis=1), train_data['T']
X_test, y_test = test_data.drop(['T'], axis=1), test_data['T']

y_train_T_mean = y_train.mean()
y_test_T_mean = y_train.mean()
y_train = y_train.replace(-200, y_train_T_mean)
y_test = y_test.replace(-200, y_test_T_mean)

In [9]:
best_models = []
for model, param_grid in zip(models, param_grids):
    best_model = grid_search_model(model, param_grid, X_train, y_train)
    best_models.append(best_model)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values=-200)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'RH', 'AH'],
      dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('transformer',
                                                                   DataTransformer()),
                                                                  ('onehot',
  

In [10]:
for model in best_models:
  scores = test_model_cv(model, X_test, y_test)
  print(f'Mean: {scores.mean()}')
  print(f'Standard Deviation: {scores.std()}')
  print("-" * 50)

Mean: 1.346721427952717
Standard Deviation: 0.14983997794691567
--------------------------------------------------
Mean: 0.7594368866140352
Standard Deviation: 0.14192747774554698
--------------------------------------------------
Mean: 0.7347839001304708
Standard Deviation: 0.11391477270811808
--------------------------------------------------


In [11]:
for model in best_models:
  params = model["model"].get_params()
  print(f'{model["model"]} Params: {params}')
  print("-" * 50)

DecisionTreeRegressor() Params: {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}
--------------------------------------------------
RandomForestRegressor() Params: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
--------------------------------------------------
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_by