In [1]:
import pandas as pd
import pathlib
from loguru import logger

import optuna
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from src.dataset import DatasetLoader
from src.dataset import DatasetEnum
from src.pipeline import Pipeline
from src.tuning import objective

import warnings

warnings.filterwarnings("ignore", message="is_categorical_dtype is deprecated")
warnings.filterwarnings("ignore", message="is_sparse is deprecated")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the water dataset through the project loader rooted at ./datasets.
data_loader = DatasetLoader(pathlib.Path("./datasets"))
water_frame = data_loader.load_dataset(DatasetEnum.water)

# The label stays a one-column DataFrame (double brackets); every other column
# is used as a model input.
target = water_frame[["target"]]
data = water_frame.drop(columns=["target"])

# Preview the first three feature rows.
data.head(3)

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,viruses,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium
0,1.65,9.08,0.04,2.85,0.007,0.35,0.83,0.17,0.05,0.2,0.0,0.054,16.08,1.13,0.007,37.75,6.78,0.08,0.34,0.02
1,2.32,21.16,0.01,3.31,0.002,5.28,0.68,0.66,0.9,0.65,0.65,0.1,2.01,1.93,0.003,32.26,3.21,0.08,0.27,0.05
2,1.01,14.02,0.04,0.58,0.008,4.24,0.53,0.02,0.99,0.05,0.003,0.078,14.16,1.11,0.006,50.28,7.07,0.07,0.44,0.01


In [3]:
# Baseline constructor keyword arguments per model name. The tuning loop below
# merges the best hyperparameters found by Optuna into these dicts in place,
# so after this cell each entry holds "baseline + tuned" arguments.
model_params = {
    "CatBoost": {"silent": True},
    "XGBoost": {"enable_categorical": True},
    "KNN": {},
    "RandomForest": {},
    "LogRegression": {"max_iter": 10000},
    "DecisionTree": {},
}

# Number of Optuna trials per model and the seed of the TPE sampler. Seeding
# the sampler makes the sequence of suggested configurations repeatable
# (full reproducibility also depends on `objective`, which is defined in
# src.tuning and not visible here).
N_TRIALS = 30
TUNING_SEED = 101

for model_name, params in model_params.items():
    logger.info(f"Model {model_name}")

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=TUNING_SEED),
        study_name="Choose models",
    )
    logger.info("Start hyperparams optimization")
    study.optimize(
        lambda trial: objective(trial, data, target, model_name),
        # Failed trials are recorded by Optuna instead of aborting the study.
        catch=(Exception,),
        gc_after_trial=True,
        n_trials=N_TRIALS,
    )

    # Because exceptions are caught above, a study may end without a single
    # completed trial; `best_params` raises ValueError in that case. Keep the
    # baseline arguments for that model instead of aborting the whole loop.
    try:
        best_params = study.best_params
    except ValueError:
        logger.warning(
            f"No successful trials for {model_name}; keeping baseline parameters"
        )
        continue

    params.update(best_params)

[32m2023-11-14 03:47:28.097[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mModel CatBoost[0m
[I 2023-11-14 03:47:28,099] A new study created in memory with name: Choose models
[32m2023-11-14 03:47:28.099[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mStart hyperparams optimization[0m
[32m2023-11-14 03:47:28.103[0m | [1mINFO    [0m | [36msrc.tuning.objective[0m:[36mobjective[0m:[36m25[0m - [1mSelected config: {'iterations': 275, 'depth': 9, 'learning_rate': 0.0019132698151981015, 'l2_leaf_reg': 0.6380520463424703, 'colsample_bylevel': 0.5554886851110005, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.558081009180052}[0m
[I 2023-11-14 03:48:31,940] Trial 0 finished with value: 0.9367742990137585 and parameters: {'iterations': 275, 'depth': 9, 'learning_rate': 0.0019132698151981015, 'l2_leaf_reg': 0.6380520463424703, 'colsample_bylevel': 0.5554886851110005, 'bootstrap_type': 'Bayesian', 'bagging_temper

In [None]:
# Evaluation configuration: scoring function, number of CV folds and the seed
# shared by the splitter and the stochastic estimators.
METRIC = roc_auc_score
NUM_FOLDS = 5
STATE = 101

# NOTE(review): KFold shuffles but does not stratify by class; if the target is
# imbalanced, StratifiedKFold may give more stable per-fold scores — confirm
# against the scheme used inside src.tuning.objective before switching.
fold_generator = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=STATE)


def _seeded(params):
    """Return a copy of ``params`` with ``random_state`` defaulting to ``STATE``.

    A ``random_state`` already present in ``params`` (e.g. coming from tuning)
    takes precedence, so no keyword is ever passed twice.
    """
    return {"random_state": STATE, **params}


# Candidate models keyed by display name. Estimators with a stochastic fit are
# seeded so that re-running the notebook reproduces the same scores; KNN,
# GaussianNB and the most-frequent dummy take no seed here.
models_dct = {
    "CatBoost": CatBoostClassifier(**_seeded(model_params["CatBoost"])),
    "XGBoost": XGBClassifier(**_seeded(model_params["XGBoost"])),
    "MostFrequent": DummyClassifier(strategy="most_frequent"),
    "KNN": KNeighborsClassifier(**model_params["KNN"]),
    "NaiveBayes": GaussianNB(),
    "RandomForest": RandomForestClassifier(**_seeded(model_params["RandomForest"])),
    "LogRegression": LogisticRegression(**_seeded(model_params["LogRegression"])),
    "DecisionTree": DecisionTreeClassifier(**_seeded(model_params["DecisionTree"])),
}

In [None]:
# Column layout of the results table. Records are written with these exact
# keys, so the frame never relies on positional column renaming and is still
# well-formed (empty, with headers) when no model was evaluated.
SUMMARY_COLUMNS = ["fold", "model", "ROC AUC"]

summary_list = []

for model_name, model in models_dct.items():
    logger.info(f"Model: {model_name}")
    pipeline = Pipeline(base_model=model)

    folds = fold_generator.split(data, y=target)
    for i, (train_fold_idx, test_fold_idx) in enumerate(folds):
        # Folds are logged 1-based but stored 0-based in the table.
        logger.info(f"Fold: {i + 1} / {NUM_FOLDS}")

        train_data, train_target = data.iloc[train_fold_idx], target.iloc[train_fold_idx]
        test_data, test_target = data.iloc[test_fold_idx], target.iloc[test_fold_idx]

        pipeline.fit(train_data, train_target)
        # Second column of predict_proba is used as the score; presumably the
        # probability of the positive class for a binary target — confirm
        # against src.pipeline.Pipeline.
        predictions = pipeline.predict_proba(test_data)[:, 1]

        metric_value = METRIC(test_target, predictions)

        summary_list.append(
            {
                "fold": i,
                "model": model_name,
                "ROC AUC": metric_value,
            }
        )

summary = pd.DataFrame(summary_list, columns=SUMMARY_COLUMNS)

summary

[32m2023-11-14 03:24:07.962[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mModel: XGBoost[0m
[32m2023-11-14 03:24:07.965[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 1 / 5[0m
[32m2023-11-14 03:24:08.065[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 2 / 5[0m
[32m2023-11-14 03:24:08.150[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 3 / 5[0m
[32m2023-11-14 03:24:08.232[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 4 / 5[0m
[32m2023-11-14 03:24:08.314[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 5 / 5[0m


Unnamed: 0,fold,model,ROC AUC
0,0,XGBoost,0.925738
1,1,XGBoost,0.905707
2,2,XGBoost,0.915424
3,3,XGBoost,0.908283
4,4,XGBoost,0.906406
