### Autoreload Functionality

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# all imported modules before each cell executes, so edits to local source
# files take effect without restarting the kernel.

%load_ext autoreload
%autoreload 2

### Importing Libraries

In [2]:
import sys
# Make the sibling logs/ and src/ directories importable from this notebook
sys.path.extend(['../logs', '../src'])

In [3]:
# standard-library and third-party packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
import logging
from pathlib import Path
import joblib
# from ucimlrepo import fetch_ucirepo

In [4]:
# user-defined modules
import utils.load_data as load_data

### Load Data

In [5]:
# Read the raw air-quality CSV through the project loader and preview the first rows
data_file_path = '../data/raw/air_quality.csv'
data = load_data.load_data_as_pandas_df(data_file_path)
data.head(n=5)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


In [7]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(9357, 15)

### Feature selection and IDA

In [6]:
# Keep only the four columns used as model features
feature_columns = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']
features = data[feature_columns]

In [7]:
# -200 marks a missing value in this dataset; convert it to NaN so pandas
# treats it as missing (no rows are removed by this step)
features = features.replace(to_replace=-200, value=np.nan)

In [8]:
# Discard every row that has a NaN in any feature column,
# then report the remaining size and preview the result
features = features.dropna(axis=0, how='any')

print(features.shape)
features.head(n=5)

(6941, 4)


Unnamed: 0,CO(GT),C6H6(GT),NOx(GT),NO2(GT)
0,2.6,11.9,166.0,113.0
1,2.0,9.4,103.0,92.0
2,2.2,9.0,131.0,114.0
3,2.2,9.2,172.0,122.0
4,1.6,6.5,131.0,116.0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) per feature column
features.describe()

Unnamed: 0,CO(GT),C6H6(GT),NOx(GT),NO2(GT)
count,6941.0,6941.0,6941.0,6941.0
mean,2.182467,10.554488,250.671949,113.874082
std,1.441158,7.465226,208.611371,47.475017
min,0.1,0.2,2.0,2.0
25%,1.1,4.9,103.0,79.0
50%,1.9,8.8,186.0,110.0
75%,2.9,14.6,335.0,142.0
max,11.9,63.7,1479.0,333.0


### Model Configuration

In [10]:
# Isolation Forest settings:
#   n_estimators  - number of trees
#   contamination - expected proportion of anomalies
#   sample_size   - number of samples used to train each tree
n_estimators, contamination, sample_size = 100, 0.005, 256

### Model Training

In [12]:
def model_train_save(train_data, n_estimators, contamination, sample_size, model_path,
                     model_name="modelname.joblib", random_state=0):
    """Fit an IsolationForest on ``train_data`` and persist it with joblib.

    Parameters
    ----------
    train_data
        Samples passed unchanged to ``IsolationForest.fit``.
    n_estimators
        Number of trees in the forest.
    contamination
        Expected proportion of anomalies in the data.
    sample_size
        Forwarded to the estimator as ``max_samples``.
    model_path
        Directory (str or Path) in which the model file is written. It is
        created, including missing parents, if it does not exist yet.
    model_name
        File name of the serialized model inside ``model_path``. The default
        keeps the historical ``"modelname.joblib"``; pass a distinct name per
        configuration so several models saved into the same directory do not
        overwrite each other.
    random_state
        Seed forwarded to the estimator; defaults to the previously
        hard-coded value ``0``.

    Returns
    -------
    The fitted ``IsolationForest`` instance.
    """
    # Train the model
    model = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        max_samples=sample_size,
        random_state=random_state,
    )
    model.fit(train_data)

    # Save the model; model_path is a directory, not the file itself
    model_dir = Path(model_path)
    model_dir.mkdir(parents=True, exist_ok=True)
    with open(model_dir / model_name, "wb") as f:
        joblib.dump(model, f)

    return model

In [13]:
# Candidate values for each model setting, keyed by setting name
params = {
    'n_estimators': [100, 150],
    'contaminations': [0.005, 0.01],
    'sample_size': [256],
}


In [14]:
# Display the grid to confirm its contents
params

{'n_estimators': [100, 150],
 'contaminations': [0.005, 0.01],
 'sample_size': [256]}