### Adding packages

In [1]:
# init environment
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import numpy as np
import pandas as pd
import scipy.stats as st
import math
import matplotlib.text as plttxt

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
plt.style.use('bmh')

from dateutil import parser

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn.metrics import PredictionErrorDisplay, median_absolute_error, mean_absolute_error
import scipy as sp
from sklearn.linear_model import RidgeCV, LassoCV
# from regressors.stats import coef_pval
%matplotlib inline

### Function for data preparation

In [2]:
def data_prep(df):
    # check empty rows and drop them
    df.dropna(how='all', inplace=True)
    # drop last 2 columns
    df.drop(['Unnamed: 15', 'Unnamed: 16'], axis=1, inplace=True)
    
    #change objects to float
    df['DateTime'] = (df['Date'] + ' ' + df['Time'].str.replace('.', ':')).map(parser.parse)
    df['Date'] = df['Date'].map(parser.parse)
    df['Hour'] = df['Time'].map(lambda x: int(x[0:2]))
    df['C6H6(GT)'] = df['C6H6(GT)'].map(lambda x: float(x.replace(',','.')))
    df['CO(GT)'] = df['CO(GT)'].map(lambda x: float(x.replace(',','.')))
    df['T'] = df['T'].map(lambda x: float(x.replace(',','.')))
    df['RH'] = df['RH'].map(lambda x: float(x.replace(',','.')))
    df['AH'] = df['AH'].map(lambda x: float(x.replace(',','.')))
    #by default missing values are filled with -200
    df.replace(-200, np.nan, inplace=True)
    
    df.dropna(thresh=5, inplace=True)
    
    X = df.iloc[:, 2:15]
    # imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer = KNNImputer(n_neighbors=4, weights='distance')
    df.iloc[:, 2:15] = imputer.fit_transform(df.iloc[:, 2:15])

    return df

### Data extraction

In [3]:
file_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip'
resp = urlopen(file_url)
zipfile = ZipFile(BytesIO(resp.read()))
data = zipfile.open('AirQualityUCI.csv')
df = pd.read_csv(data, sep=';')

### Data preparation

In [4]:
df = data_prep(df)

## Formatting datasets

In [7]:
Y_df = df["C6H6(GT)"]
#  Skip "PT08.S2(NMHC)" as it has functional dependency on with the dependent variable "C6H6(GT)"
# X_df = df.drop(labels=['C6H6(GT)', 'PT08.S2(NMHC)', 'Date', 'Time', 'DateTime'], inplace=False, axis=1)
X_df = df['PT08.S4(NO2)']
X_df_c = X_df.copy()

In [8]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X_df, Y_df, random_state=42)

## Create model

In [9]:
#To improve mode performance, we can use a normalization and apply log to the target variable to turn it approximately into a normal distribution
# define model
preprocessor = make_column_transformer(
    (Normalizer(), X_df.columns), # normalize all features
    remainder="passthrough",
    verbose_feature_names_out=False,  # avoid to prepend the preprocessor names
)

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.log10, inverse_func=sp.special.exp10 # apply log to target variable to turn it approximately into a normal distribution
    ),
)

AttributeError: 'Series' object has no attribute 'columns'

## Evaluate model

In [8]:
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X_df, Y_df, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MAE: 1.033 (0.039)


In [10]:
d = {'Imputer': ['SimpleImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer']
   , 'n_neighbors': [np.nan, 2, 3, 4, 5, 6, 10, 20]
   , 'Mean MAE' : [1.137, 1.069, 1.052, 1.044, 1.040, 1.038, 1.033, 1.024]
   , 'StDev MAE' : [0.038, 0.042, 0.039, 0.038, 0.039, 0.039, 0.039, 0.040]}
df_LinearRegression = pd.DataFrame(data=d)

In [11]:
df_LinearRegression

Unnamed: 0,Imputer,n_neighbors,Mean MAE,StDev MAE
0,SimpleImputer,,1.137,0.038
1,KNNImputer,2.0,1.069,0.042
2,KNNImputer,3.0,1.052,0.039
3,KNNImputer,4.0,1.044,0.038
4,KNNImputer,5.0,1.04,0.039
5,KNNImputer,6.0,1.038,0.039
6,KNNImputer,10.0,1.033,0.039
7,KNNImputer,20.0,1.024,0.04


After analyzing the outcomes, it appears that employing the KNN imputer is more suitable for this task. For optimal performance, the recommended number of neighbors for the linear regression model might be 4. Increasing the number of neighbors beyond this threshold does not significantly reduce the absolute error and instead results in greater deviation of errors.

##  Linear regression model with poly features

In [12]:
# polinomial model

preprocessor = make_column_transformer(
    (PolynomialFeatures(2), X_df['PT08.S4(NO2)']), #
    remainder="passthrough",
    verbose_feature_names_out=False,  # avoid to prepend the preprocessor names
)

model_plnm = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(
        regressor=LinearRegression()

    ),
)

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model_plnm, X_df, Y_df, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

KeyError: 'PT08.S4(NO2)'

In [27]:
d = {'Imputer': ['SimpleImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer', 'KNNImputer']
   , 'n_neighbors': [np.nan, 2, 3, 4, 5, 6, 10, 20]
   , 'Mean MAE' : [0.834, 0.711, 0.697, 0.693, 0.690, 0.688, 0.683, 0.679]
   , 'StDev MAE' : [0.026, 0.021, 0.021, 0.020, 0.019, 0.019, 0.020, 0.020]}
df_LinearRegression_PF = pd.DataFrame(data=d)

In [28]:
df_LinearRegression_PF

Unnamed: 0,Imputer,n_neighbors,Mean MAE,StDev MAE
0,SimpleImputer,,0.834,0.026
1,KNNImputer,2.0,0.711,0.021
2,KNNImputer,3.0,0.697,0.021
3,KNNImputer,4.0,0.693,0.02
4,KNNImputer,5.0,0.69,0.019
5,KNNImputer,6.0,0.688,0.019
6,KNNImputer,10.0,0.683,0.02
7,KNNImputer,20.0,0.679,0.02


After analyzing the outcomes, it appears that employing the KNN imputer is more suitable for this task. For optimal performance, the recommended number of neighbors for the linear regression model with polynomial Features might be 5. Increasing the number of neighbors beyond this threshold does not significantly reduce the absolute error and instead results in greater deviation of errors.