### Auroreload Functionality

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
import sys

In [34]:
sys.path.append('../logs')

### Importing standard libraries

In [4]:
# inbuild packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML packages
from sklearn.ensemble import IsolationForest
import logging
from pathlib import Path
import joblib
# from ucimlrepo import fetch_ucirepo

### Configure logging

In [37]:
# Configure logging
logging.basicConfig(
    filename="app.log",  # Log file name
    level=logging.INFO,  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format="%(asctime)s - %(levelname)s - %(message)s",  # Log message format
)

# Example log messages
logging.info("This is an info message")
logging.warning("This is a warning")
logging.error("This is an error")

ERROR:root:This is an error


In [None]:
CONFIG.LOGLEVEL = 

In [38]:
def setup_logger():
    """Setup default logging, call at the start of program."""
    log_handler = logging.StreamHandler()
    logging.basicConfig(
        format="[{asctime},{msecs:03.0f}] {levelname} {name}.{lineno}| {message}",
        datefmt="%H:%M:%S",
        style="{",
        handlers=[log_handler],
    )
    logging.getLogger("app").setLevel(CONFIG.LOGLEVEL)

In [1]:
# setup_logger()

### Load Data

In [9]:
data = pd.read_csv('../data_training/air_quality.csv')


In [10]:
data.shape

(9357, 15)

In [11]:
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


### Data Preprocessing

In [12]:
# Select features
features = data[['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']]

In [13]:
# Drop rows with missing values (-200)
features = features.replace(-200, np.nan)
features = features.dropna()

print(features.shape)
features.head()

(6941, 4)


Unnamed: 0,CO(GT),C6H6(GT),NOx(GT),NO2(GT)
0,2.6,11.9,166.0,113.0
1,2.0,9.4,103.0,92.0
2,2.2,9.0,131.0,114.0
3,2.2,9.2,172.0,122.0
4,1.6,6.5,131.0,116.0


### Model Configuration

In [15]:
# Parameters
n_estimators = 100  # Number of trees
contamination = 0.005  # Expected proportion of anomalies
sample_size = 256  # Number of samples used to train each tree

### Model Training

In [17]:
# Train Isolation Forest
iso_forest = IsolationForest(n_estimators=n_estimators,
                            contamination=contamination,
                            max_samples=sample_size,
                            random_state=42)
iso_forest.fit(features)

# save model

In [58]:
storage_path = 'data_scoring'
storage_path = Path(storage_path)

In [60]:
storage_path.mkdir(parents=True, exist_ok=True)

In [62]:
with open(storage_path / "modelname.joblib", "wb") as f:
    joblib.dump(iso_forest, f)