### Autoreload Functionality

In [19]:
# Load IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
# Mode 2: reload all imported modules before executing each cell's code.

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Importing Libraries

In [56]:
import sys

# Make the project's local directories importable from this notebook.
# The paths are relative to the notebook's working directory.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel session.
for local_dir in ('../logs', '../src', '../config'):
    if local_dir not in sys.path:
        sys.path.append(local_dir)

In [45]:
# standard-library and third-party packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from pathlib import Path
import joblib
import logging
import logging.config
import os
# from ucimlrepo import fetch_ucirepo

In [63]:
from models.training import model_train_save

In [64]:
# Print the function's signature to confirm the argument order expected by the training cell.
help(model_train_save)

Help on function model_train_save in module models.training:

model_train_save(
    train_data,
    n_estimators,
    contamination,
    sample_size,
    model_path
)



### Setting Up the Logger

In [46]:
# Ensure the logs directory exists before logging is configured
# (presumably the handlers declared in logging.conf write there — confirm against the config file).
os.makedirs("../logs", exist_ok=True)

# Load the logging configuration from file.
conf_file = '../config/logging.conf'
# disable_existing_loggers=False: fileConfig() defaults to True, which silently
# disables every logger that already exists and is not named in the config file,
# e.g. loggers created at import time by modules imported in earlier cells, or
# any logger created since the last run when this cell is re-executed.
logging.config.fileConfig(conf_file, disable_existing_loggers=False)

# Logger used by this notebook; see the loggers declared in logging.conf.
logger_name = 'ml_pipeline'
logger = logging.getLogger(logger_name)

In [47]:
# user defined modules
import utils.load_data as load_data

### Load Data

In [49]:
# Read the raw air-quality CSV through the project's loader helper.
# The path is relative to the notebook's working directory.
data_file_path = '../data/raw/air_quality.csv'
data = load_data.load_data_as_pandas_df(data_file_path)
logger.info("Data loaded successfully")
# Preview the first rows as the cell output.
data.head()

[2025-03-20 20:44:39] INFO - Data loaded successfully


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


In [24]:
# (rows, columns) of the raw dataset
data.shape

(9357, 15)

### Feature selection and IDA

In [25]:
# Columns used as model inputs; keeping the list in one named variable makes
# the selection easy to adjust.
feature_columns = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']
features = data[feature_columns]

In [26]:
# Preview the selected feature columns
features.head()

Unnamed: 0,CO(GT),C6H6(GT),NOx(GT),NO2(GT)
0,2.6,11.9,166,113
1,2.0,9.4,103,92
2,2.2,9.0,131,114
3,2.2,9.2,172,122
4,1.6,6.5,131,116


In [27]:
# Treat the placeholder value -200 as missing: convert it to NaN so the
# affected rows can be removed with dropna() in the next cell.
features = features.replace(-200, np.nan)

In [50]:
# Drop every row that contains a NaN in any selected column
# (this includes the former -200 placeholders).
features = features.dropna()
logger.info("Data cleaned successfully")
# print() is used because only the cell's last expression is displayed automatically.
print(features.shape)
features.head()

[2025-03-20 20:44:49] INFO - Data cleaned successfully
(6941, 4)


Unnamed: 0,CO(GT),C6H6(GT),NOx(GT),NO2(GT)
0,2.6,11.9,166.0,113.0
1,2.0,9.4,103.0,92.0
2,2.2,9.0,131.0,114.0
3,2.2,9.2,172.0,122.0
4,1.6,6.5,131.0,116.0


### Model Configuration

In [59]:
import config

In [62]:
# Pull the model settings out of the project's config module in one step.
n_estimators, contamination, sample_size, model_path = (
    config.n_estimators,
    config.contamination,
    config.sample_size,
    config.model_path,
)

### Model Training

In [65]:
# Train the model on the cleaned features and persist it to model_path.
# Keyword arguments mirror the signature printed by help(model_train_save).
model = model_train_save(
    train_data=features,
    n_estimators=n_estimators,
    contamination=contamination,
    sample_size=sample_size,
    model_path=model_path,
)

[2025-03-20 20:55:11] INFO - Model trained successfully
[2025-03-20 20:55:11] INFO - Model saved successfully


In [66]:
model