# Setup

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures



# Creating the classes

## Data Explorer

In [6]:
class DataExplorer:
    @staticmethod
    def explore_data(data):
        print(data.head().T)
        print(data.describe())
        print(data.info())
    
    @staticmethod
    def plot_histograms(data):
        data.hist(bins=15, figsize=(15, 10))
        plt.show()

    @staticmethod
    def plot_corr_matrix(data,columns):
        data = data[columns]
        plt.figure(figsize=(10,5))
        sns.heatmap(data.corr(),annot=True, fmt=".2f",cmap= 'RdBu' )
        plt.show()
    
    @staticmethod
    def percentage_of_missing_values(data):
        per_miss_val = (data.isna().sum()/len(data))*100
        format_value = per_miss_val.apply(lambda x: f'{x:.2f}%')
        print("Percentage  of missing values")
        print(format_value)


Based on the information provided by the dataset creator, the samples which have missing values were registered as "-200", so we are going to input again an NaN to the samples that fall into this category


# AQ (Air Quality) Model

In [13]:
class AQModel:
    def __init__(self, filepath):
        self.filepath = filepath
        self.model_pipeline = Pipeline([
            ('scaler',MinMaxScaler()),
            ('polynomial',PolynomialFeatures(degree=2,include_bias =  False)),
            ('regression',LinearRegression())
        ])
        self.X_train, self.X_test, self.y_train, self.y_test = [None] * 4
    
    def load_data(self):
        self.data = pd.read_csv(self.filepath)
        DataExplorer.explore_data(self.data)
        return self
    
    def preprocess_data(self):
        self.data = self.data.drop(columns=["Date", "Time", "NMHC(GT)", "Unnamed: 15","Unnamed: 16"])
        self.data = self.data.applymap(lambda x: np.nan if x == -200 else x)
        self.data = self.data.dropna(subset=["CO(GT)"])
        self.data = self.data.apply(lambda col: col.fillna(col.mean()))

        X = self.data.drop("CO(GT)", axis=1)
        y = self.data["CO(GT)"]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=27)
        return self

    def train_model(self):
        self.model_pipeline.fit(self.X_train, self.y_train)
        return self
    
    def evaluate_model(self):
        print("Model Evaluation:")
        y_pre = self.model_pipeline.predict(self.X_test)
        rmse = mean_squared_error(self.y_test, y_pre,squared=False)
        r2 = r2_score(self.y_test, y_pre)

        print(f'RMSE: {rmse:.4f}')
        print(f'R²: {r2:.4f}')
        return self

In [17]:
filepath=r"/home/alt9193/Documents/MLOps_team36/data/AirQualityUCI.csv"

model = AQModel(filepath)
model.load_data()
model.preprocess_data()
model.train_model()
model.evaluate_model()


                       0          1          2          3          4
Date           3/10/2004  3/10/2004  3/10/2004  3/10/2004  3/10/2004
Time            18:00:00   19:00:00   20:00:00   21:00:00   22:00:00
CO(GT)               2.6        2.0        2.2        2.2        1.6
PT08.S1(CO)       1360.0     1292.0     1402.0     1376.0     1272.0
NMHC(GT)           150.0      112.0       88.0       80.0       51.0
C6H6(GT)            11.9        9.4        9.0        9.2        6.5
PT08.S2(NMHC)     1046.0      955.0      939.0      948.0      836.0
NOx(GT)            166.0      103.0      131.0      172.0      131.0
PT08.S3(NOx)      1056.0     1174.0     1140.0     1092.0     1205.0
NO2(GT)            113.0       92.0      114.0      122.0      116.0
PT08.S4(NO2)      1692.0     1559.0     1555.0     1584.0     1490.0
PT08.S5(O3)       1268.0      972.0     1074.0     1203.0     1110.0
T                   13.6       13.3       11.9       11.0       11.2
RH                  48.9       47.

  self.data = self.data.applymap(lambda x: np.nan if x == -200 else x)


<__main__.AQModel at 0x7f330dd82810>

The variable to be predicted is the one called 'CO(GT)'. To avoid introducing bias into the model, we decided to remove the entries where the value of the dependent variable is unknown. 

Analyzing the data, we observed that the columns 'Unnamed:15' and 'Unnamed:16' contain no values, and the column 'NMHC(GT)' has more than 88 percent missing values, so we decided to eliminate these variables. 

Observing the correlations, the variables C6H6 (GT), PT08.S2(NMHC), and PT08.S1(CO) have the highest correlation with our target variable.