In [1]:
import pandas as pd
import numpy as np
import pathlib
import loguru

from src.dataset import DatasetLoader
from src.dataset import DatasetEnum
from src.pipeline import ModelPipeline

In [2]:
data_loader = DatasetLoader(pathlib.Path("./datasets"))

data = data_loader.load_dataset(DatasetEnum.heart)
features = [col_name for col_name in data.columns if col_name != "target"]

data.head(3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,target
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold

models_dct = {
    "CatBoost": CatBoostClassifier(silent=True),
    "XGBoost": XGBClassifier(enable_categorical=True),
    "MostFrequent": DummyClassifier(strategy="most_frequent"),
    "KNN": KNeighborsClassifier(),
    "NaiveBayes": GaussianNB(),
    "RandomForest": RandomForestClassifier(n_estimators=500),
    "LogRegression": LogisticRegression(max_iter=10000),
}

metrics_list = [("score", roc_auc_score), ("binary", accuracy_score), ("binary", f1_score)]

In [4]:
import warnings

warnings.filterwarnings("ignore", message="is_categorical_dtype is deprecated")
warnings.filterwarnings("ignore", message="is_sparse is deprecated")

# To avoid different folds separately on each dataset
num_folds = 5
fold_generator = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=101)

summary_list = []

for model_name, model in models_dct.items():
    loguru.logger.info(f"Model: {model_name}")
    pipeline = ModelPipeline(base_model=model, features=features, metrics=metrics_list)

    folds = fold_generator.split(data, y=data["target"])
    for i, (train_fold_idx, test_fold_idx) in enumerate(folds):
        loguru.logger.info(f"Fold: {i + 1} / {num_folds}")
        train_fold = data.iloc[train_fold_idx]
        test_fold = data.iloc[test_fold_idx]
        
        pipeline.fit(train_fold)
        predictions = pipeline.predict(test_fold)

        metrics = pipeline.calculate_metrics(test_fold, predictions)

        result_dict = {
            "fold": i,
            "model": model_name,
        }
        result_dict.update(metrics)
        summary_list.append(result_dict)

summary = pd.DataFrame.from_records(summary_list)
summary

[32m2023-10-06 03:16:14.498[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mModel: CatBoost[0m
[32m2023-10-06 03:16:14.505[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 1 / 5[0m


[32m2023-10-06 03:16:18.914[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 2 / 5[0m
[32m2023-10-06 03:16:23.132[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 3 / 5[0m
[32m2023-10-06 03:16:27.041[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 4 / 5[0m
[32m2023-10-06 03:16:31.167[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 5 / 5[0m
[32m2023-10-06 03:16:35.351[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mModel: XGBoost[0m
[32m2023-10-06 03:16:35.355[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 1 / 5[0m
[32m2023-10-06 03:16:35.554[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 2 / 5[0m
[32m2023-10-06 03:16:35.699[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mFold: 3 / 5[0m
[32m

Unnamed: 0,fold,model,<function roc_auc_score at 0x7f0adc0ba5f0>,<function accuracy_score at 0x7f0adc4efbe0>,<function f1_score at 0x7f0adc0b8430>
0,0,CatBoost,0.962099,0.902174,0.912621
1,1,CatBoost,0.932449,0.875,0.8867
2,2,CatBoost,0.920493,0.869565,0.886792
3,3,CatBoost,0.923328,0.852459,0.866995
4,4,CatBoost,0.92526,0.846995,0.866667
5,0,XGBoost,0.948948,0.88587,0.897561
6,1,XGBoost,0.914395,0.847826,0.862745
7,2,XGBoost,0.915232,0.875,0.889952
8,3,XGBoost,0.912461,0.857923,0.875
9,4,XGBoost,0.917653,0.863388,0.883721
