### Autoreload Functionality

In [29]:
# Load IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.

%load_ext autoreload
# Mode 2: reload all imported modules before each piece of code is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Importing Libraries

In [30]:
import sys
# Make the project's sibling directories (logs, src, config) importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate entries to sys.path.
for project_dir in ('../logs', '../src', '../config'):
    if project_dir not in sys.path:
        sys.path.append(project_dir)

In [31]:
# built-in and third-party packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from pathlib import Path
import joblib
import logging
import logging.config
import os
# from ucimlrepo import fetch_ucirepo

In [40]:
# user defined modules
import utils.load_data as load_data
import config
from models.training import model_train_save

### Setting Up the Logger

Packages loaded successfully


In [33]:
# Create the log directory up front (a no-op when it is already present)
Path("../logs").mkdir(parents=True, exist_ok=True)

# Apply the logging configuration stored in the project's config file.
# NOTE(review): fileConfig() defaults to disable_existing_loggers=True, which
# silences loggers created before this call unless logging.conf names them (or
# an ancestor) -- confirm the modules imported in earlier cells are unaffected.
conf_file = '../config/logging.conf'
logging.config.fileConfig(conf_file)

# Logger used throughout this notebook; its name refers to a logger entry in logging.conf
logger_name = 'ml_pipeline'
logger = logging.getLogger(logger_name)

In [36]:
# Emit the same confirmation on stdout and through the logger at INFO and DEBUG level
startup_message = "Packages loaded successfully"
print(startup_message)
logger.info(startup_message)
logger.debug(startup_message)

Packages loaded successfully
[2025-03-21 20:39:47] INFO - Packages loaded successfully


### Config

In [38]:
# Pull the run settings out of the config module into notebook-level names
n_estimators, contamination, sample_size = (
    config.n_estimators,
    config.contamination,
    config.sample_size,
)
model_path, data_file_path = config.model_path, config.data_file_path

### Training pipeline

In [41]:
# Select features
def features_selection(data):
    """Extract the model's input columns and discard incomplete rows.

    Keeps the 'CO(GT)', 'C6H6(GT)', 'NOx(GT)' and 'NO2(GT)' columns of ``data``,
    treats the value -200 as a missing reading, and drops every row that has at
    least one missing value in those columns. ``data`` itself is not modified.

    Args:
        data: DataFrame containing (at least) the four columns listed above.

    Returns:
        A new DataFrame with the four selected columns and only complete rows.
    """
    selected_columns = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']
    # -200 is the missing-value marker: convert it to NaN so dropna() removes those rows
    features = data[selected_columns].replace(-200, np.nan).dropna()
    logger.info("Features selected successfully")
    logger.info("Shape of features: {}".format(features.shape))
    return features


In [42]:
def training_pipeline(n_estimators, contamination, sample_size, model_path, data_file_path):
    """Run the training flow: load the data, select features, train and save the model.

    Args:
        n_estimators: Forwarded unchanged to ``model_train_save``.
        contamination: Forwarded unchanged to ``model_train_save``.
        sample_size: Forwarded unchanged to ``model_train_save``.
        model_path: Forwarded unchanged to ``model_train_save``
            (presumably where the model is persisted -- confirm in models.training).
        data_file_path: Location handed to ``load_data.load_data_as_pandas_df``.

    Returns:
        Whatever ``model_train_save`` returns
        (presumably the fitted model -- confirm in models.training).
    """
    data = load_data.load_data_as_pandas_df(data_file_path)
    logger.info("Data loaded successfully")
    # Reduce the raw frame to the model's input columns before training.
    features = features_selection(data)
    model = model_train_save(features, n_estimators, contamination, sample_size, model_path)
    logger.info("Model trained and saved successfully")
    return model

In [43]:
# Train with the settings read from the config module; keyword arguments make the mapping explicit
model = training_pipeline(
    n_estimators=n_estimators,
    contamination=contamination,
    sample_size=sample_size,
    model_path=model_path,
    data_file_path=data_file_path,
)

[2025-03-21 20:50:28] INFO - Data loaded successfully
[2025-03-21 20:50:28] INFO - Features selected successfully
[2025-03-21 20:50:28] INFO - Shape of features: (6941, 4)
[2025-03-21 20:50:28] INFO - Model trained successfully
[2025-03-21 20:50:28] INFO - Model saved successfully
[2025-03-21 20:50:28] INFO - Model trained and saved successfully


In [44]:
# Bare expression: the notebook renders the object returned by training_pipeline as this cell's output
model