In [217]:
import os
import zipfile
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error

import matplotlib.pyplot as plt

In [218]:
def unzip_dataset(
    source: str,
    destination: str,
    remove_source: bool = False
    ) -> Path:

    data_dir_path = Path("data")
    data_path = data_dir_path / destination
    source_path = Path(source)

    if not data_dir_path.is_dir():
      data_dir_path.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(source_path, "r") as zip_ref:
      zip_ref.extractall(data_path)

    if remove_source:
      os.remove(source)

    return data_path

unzip_dataset(
    source="air+quality.zip",
    destination="air_quality"
)

PosixPath('data/air_quality')

In [219]:
original_df = pd.read_excel("data/air_quality/AirQualityUCI.xlsx")
original_df.shape

(9357, 15)

In [220]:
def transform_time(time: int) -> str:
  if time > 5 and time <=12:
    return 'morning'
  elif time > 12 and time <= 17:
    return 'afternoon'
  elif time > 17 and time <= 21:
    return 'evening'
  else:
    return 'night'

def transform_date(date: int) -> str:
  if date > 3 and date <=5:
    return 'spring'
  elif date > 5 and date <= 8:
    return 'summer'
  elif date > 8 and date <= 11:
    return 'autumn'
  else:
    return 'winter'

In [221]:
def split_stratified(dataset):
  dataset["Season"] = dataset["Date"].apply(lambda date: date.month)
  dataset["Season"] = dataset["Season"].apply(transform_date)

  print(dataset.shape)

  train, test = train_test_split(dataset, test_size=0.2, stratify=dataset["Season"], random_state=42)

  for split_ in (train, test):
    split_.drop("Season", axis=1, inplace=True)
  return train, test

changed_df = original_df.copy()
train_data, test_data = split_stratified(changed_df)
X_train, y_train = train_data.drop('T', axis=1), train_data['T']
X_test, y_test = test_data.drop('T', axis=1), test_data['T']

(9357, 16)


In [222]:
class DataTransformer(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    X['Date'] = X['Date'].apply(lambda date: date.month)
    X['Time'] = X['Time'].apply(lambda time: time.hour)
    X['Time'] = X['Time'].apply(transform_time)
    X['Date'] = X['Date'].apply(transform_date)
    return X

In [223]:
X_test.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH
811,2004-04-13,13:00:00,1.5,1009.25,141,7.658833,885.0,112.0,993.75,87.0,1438.5,750.75,38.15,0.700562
1907,2004-05-29,05:00:00,-200.0,737.75,-200,1.5012,544.75,18.0,1562.0,29.0,1265.75,449.5,49.749999,0.996961
2998,2004-07-13,16:00:00,1.7,905.0,-200,8.775651,930.5,124.0,875.5,90.0,1460.25,775.75,21.3,0.805444
4304,2004-09-06,02:00:00,0.6,854.0,-200,1.70607,561.5,-200.0,1105.5,-200.0,1224.25,400.0,42.95,1.290619
6001,2004-11-15,19:00:00,3.8,1029.0,-200,16.594653,1197.75,446.0,693.5,175.0,1305.0,1168.5,30.2,0.438446


In [224]:
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="mean", missing_values=-200)),
        ('scaler', StandardScaler())
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ('transformer', DataTransformer()),
        ('ohe_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ]
)

In [225]:
numerical_columns = X_train.select_dtypes(include=np.number).columns.to_list()
categorical_columns = X_train.select_dtypes(exclude=np.number).columns.to_list()

In [226]:
full_pipeline = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_columns),
        ('categorical', categorical_pipeline, categorical_columns)
    ]
)

In [227]:
X_train = full_pipeline.fit_transform(X_train)
X_test = full_pipeline.fit_transform(X_test)

In [228]:
def train_model(model, X_train, y_train, X_test, y_test):
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  metrics = root_mean_squared_error(y_test, predictions)
  return metrics

def test_model_cv(model, X_test, y_test):
  scores = -cross_val_score(model, X_test, y_test, scoring="neg_root_mean_squared_error", cv=10)
  print(f"Model: {model}\n")
  print(f"Scores: {scores}\n")
  print(f"Mean: {scores.mean()}\n")
  print(f"Standard Deviation: {scores.std()}\n")
  return scores

In [229]:
def grid_search_model(model, param_grid, X, y):

  print(f'{model}\n')
  grid_search = GridSearchCV(
      model,
      param_grid,
      cv=5,
      scoring='neg_root_mean_squared_error',
      verbose=1)
  grid_search.fit(X, y)
  best_model = grid_search.best_estimator_
  return best_model

In [230]:
tree_reg = DecisionTreeRegressor()
rdf_reg = RandomForestRegressor()
linear_reg = LinearRegression()
sgd_reg = SGDRegressor()
svm_reg = SVR()
models = [tree_reg, rdf_reg, sgd_reg, svm_reg]

In [231]:
sgd_regressor_grid = {
    'loss': ['squared_error', 'huber'],
    'penalty': ['l2', 'l1'],
    'alpha': [0.00001, 0.0001, 0.001]
}


svm_regressor_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

tree_regressor_grid = {
    'criterion': ['squared_error', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rdf_regressor_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_grids = [tree_regressor_grid, rdf_regressor_grid, sgd_regressor_grid, svm_regressor_grid]

In [232]:
best_models = []
best_models.append(linear_reg)
for model, grid in zip(models, param_grids):
  %time best_models.append(grid_search_model(model, grid, X_train, y_train))

DecisionTreeRegressor()

Fitting 5 folds for each of 48 candidates, totalling 240 fits
CPU times: user 2min 50s, sys: 164 ms, total: 2min 51s
Wall time: 2min 51s
RandomForestRegressor()

Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: user 11min 19s, sys: 726 ms, total: 11min 20s
Wall time: 11min 20s
SGDRegressor()

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 2.59 s, sys: 9 µs, total: 2.59 s
Wall time: 2.58 s
SVR()

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 2min 20s, sys: 2.28 s, total: 2min 23s
Wall time: 2min 23s


In [233]:
for model in best_models:
  %time test_model_cv(model, X_test, y_test)
  print("-" * 50)

Model: LinearRegression()

Scores: [32.02191317 41.04046308 42.33469825 36.34377923 31.89385081 41.61987499
 44.07432543 37.44245111 44.40483451 31.14248892]

Mean: 38.23186795192198

Standard Deviation: 4.915497868784821

CPU times: user 47.5 ms, sys: 1 µs, total: 47.5 ms
Wall time: 25.7 ms
--------------------------------------------------
Model: DecisionTreeRegressor(max_depth=20, min_samples_leaf=2)

Scores: [ 1.44073153 22.46569045  1.46477845 22.85461695  1.25817603  1.46816338
  1.4378692   1.42729761  1.42901725  1.26324282]

Mean: 5.650958367138999

Standard Deviation: 8.505349913909132

CPU times: user 299 ms, sys: 2 ms, total: 301 ms
Wall time: 186 ms
--------------------------------------------------
Model: RandomForestRegressor(max_depth=20, min_samples_leaf=2)

Scores: [ 0.87570083 10.56402969  0.93372797 14.63528967  3.11989619  0.97875557
  0.72771052  0.89157076  0.90259357  5.89464544]

Mean: 3.9523920206456453

Standard Deviation: 4.677529215604489

CPU times: user 9

In [235]:
model_names = ['Linear_model', 'DecisionTree_model', 'RandomForest_model', 'SGD_model', 'SVM_model']
models_path = Path('models')

models_path.mkdir(parents=True, exist_ok=True)

for model, model_name in zip(best_models, model_names):
  with open(models_path / f'{model_name}.pkl', 'wb') as file:
    pickle.dump(model, file)