In [1]:
import os
import zipfile
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error

import matplotlib.pyplot as plt

In [2]:
def unzip_dataset(
    source: str,
    destination: str,
    remove_source: bool = False
    ) -> Path:

    data_dir_path = Path("data")
    data_path = data_dir_path / destination
    source_path = Path(source)

    if not data_dir_path.is_dir():
      data_dir_path.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(source_path, "r") as zip_ref:
      zip_ref.extractall(data_path)

    if remove_source:
      os.remove(source)

    return data_path

unzip_dataset(
    source="air+quality.zip",
    destination="air_quality"
)

PosixPath('data/air_quality')

In [3]:
original_df = pd.read_excel("data/air_quality/AirQualityUCI.xlsx")
original_df.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [4]:
date_time_changed_df = original_df.copy()

date_time_changed_df["Time"] = date_time_changed_df["Time"].apply(lambda time: time.hour)
date_time_changed_df["Date"] = date_time_changed_df["Date"].apply(lambda date: date.month)

In [5]:
def transform_time(time: int) -> str:
  if time > 5 and time <=12:
    return 'morning'
  elif time > 12 and time <= 17:
    return 'afternoon'
  elif time > 17 and time <= 21:
    return 'evening'
  else:
    return 'night'

def transform_date(date: int) -> str:
  if date > 3 and date <=5:
    return 'spring'
  elif date > 5 and date <= 8:
    return 'summer'
  elif date > 8 and date <= 11:
    return 'autumn'
  else:
    return 'winter'

In [6]:
date_time_changed_df['Time'] = date_time_changed_df['Time'].apply(transform_time)
date_time_changed_df['Date'] = date_time_changed_df['Date'].apply(transform_date)

In [7]:
imputed_df = date_time_changed_df.copy()

In [8]:
numerical_cols = imputed_df.select_dtypes(include=np.number).columns.to_list()
categorical_cols = imputed_df.select_dtypes(exclude=np.number).columns.to_list()

In [9]:
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
imputed_df_one_hot = ohe_encoder.fit_transform(imputed_df[categorical_cols])
imputed_df_one_hot = pd.DataFrame(imputed_df_one_hot, columns=ohe_encoder.get_feature_names_out())

In [10]:
imputed_df = pd.concat([imputed_df.drop(categorical_cols, axis=1), imputed_df_one_hot], axis=1)

In [11]:
train, test = train_test_split(imputed_df, test_size=0.2, random_state=42)

In [12]:
train_imputer = SimpleImputer(strategy="mean", missing_values=-200)
test_imputer = SimpleImputer(strategy="mean", missing_values=-200)
train_imputer.fit(train[numerical_cols])
test_imputer.fit(test[numerical_cols])

In [13]:
train[numerical_cols] = train_imputer.transform(train[numerical_cols])
test[numerical_cols] = test_imputer.transform(test[numerical_cols])

In [14]:
X_train, y_train = train.drop(['T'], axis=1), train['T']
X_test, y_test = test.drop(['T'], axis=1), test['T']

In [15]:
def scaled_model(model):
  pipeline = Pipeline(
      steps=[
          ('scaler', StandardScaler()),
          ('model', model)
      ]
  )
  return pipeline

In [16]:
def train_model(model, X_train, y_train, X_test, y_test):
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  metrics = root_mean_squared_error(y_test, predictions)
  return metrics

def test_model_cv(model, X_test, y_test):
  scores = -cross_val_score(model, X_test, y_test, scoring="neg_root_mean_squared_error", cv=10)
  print(f"Model: {model}\n")
  print(f"Scores: {scores}\n")
  print(f"Mean: {scores.mean()}\n")
  print(f"Standard Deviation: {scores.std()}\n")
  return scores

In [17]:
def grid_search_model(model, param_grid, X, y):

  print(f'{model}\n')
  grid_search = GridSearchCV(
      model,
      param_grid,
      cv=5,
      scoring='neg_root_mean_squared_error',
      verbose=1)
  grid_search.fit(X, y)
  best_model = grid_search.best_estimator_
  return best_model

In [18]:
tree_reg = DecisionTreeRegressor()
rdf_reg = RandomForestRegressor()
linear_reg = scaled_model(LinearRegression())
sgd_reg = scaled_model(SGDRegressor())
svm_reg = scaled_model(SVR())
models = [tree_reg, rdf_reg]

In [19]:
sgd_regressor_grid = {
    'model__loss': ['squared_error', 'huber'],
    'model__penalty': ['l2', 'l1'],
    'model__alpha': [0.00001, 0.0001, 0.001]
}


svm_regressor_grid = {
    'model__kernel': ['linear', 'rbf'],
    'model__C': [0.1, 1, 10],
    'model__gamma': ['scale', 'auto']
}

tree_regressor_grid = {
    'criterion': ['squared_error', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rdf_regressor_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_grids = [tree_regressor_grid, rdf_regressor_grid]

In [20]:
best_models = []
best_models.append(linear_reg)
for model, grid in zip(models, param_grids):
  %time best_models.append(grid_search_model(model, grid, X_train, y_train))

DecisionTreeRegressor()

Fitting 5 folds for each of 48 candidates, totalling 240 fits
CPU times: user 3min 21s, sys: 206 ms, total: 3min 21s
Wall time: 3min 28s
RandomForestRegressor()

Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: user 19min 8s, sys: 2.78 s, total: 19min 11s
Wall time: 19min 14s


In [21]:
for model in best_models:
  %time test_model_cv(model, X_test, y_test)
  print("-" * 50)

Model: Pipeline(steps=[('scaler', StandardScaler()), ('model', LinearRegression())])

Scores: [2.38516818 2.1266298  2.19822802 2.17812065 1.97392312 2.51248544
 2.15825691 2.13966169 2.1324731  2.20486243]

Mean: 2.200980933546338

Standard Deviation: 0.14102031422088968

CPU times: user 259 ms, sys: 2.98 ms, total: 262 ms
Wall time: 139 ms
--------------------------------------------------
Model: DecisionTreeRegressor(max_depth=20, min_samples_leaf=2)

Scores: [1.36405258 1.43467899 1.16886828 1.47550373 1.00231625 1.15262627
 1.29740585 1.20662861 1.72080001 1.29050549]

Mean: 1.3113386051071652

Standard Deviation: 0.19109552111042932

CPU times: user 402 ms, sys: 2 ms, total: 404 ms
Wall time: 301 ms
--------------------------------------------------
Model: RandomForestRegressor(n_estimators=200)

Scores: [0.92196595 0.63006847 0.76214175 0.53129804 0.74284374 0.75461811
 0.78622299 0.61007822 0.99524487 0.80369512]

Mean: 0.7538177248512054

Standard Deviation: 0.1325816887010953

In [22]:
model_names = ['DecisionTree_model', 'RandomForest_model']
models_path = Path('models')

models_path.mkdir(parents=True, exist_ok=True)

for model, model_name in zip(best_models, model_names):
  with open(models_path / f'{model_name}.pkl', 'wb') as file:
    pickle.dump(model, file)