In [1]:
import pandas as pd
import numpy as np
import pathlib

from src.dataset import DatasetLoader
from src.dataset import DatasetEnum

In [2]:
data_loader = DatasetLoader(pathlib.Path("./datasets"))

data_loader.load_dataset(DatasetEnum.weather).dtypes

Date             category
Location         category
MinTemp           float64
MaxTemp           float64
Rainfall          float64
Evaporation       float64
Sunshine          float64
WindGustDir      category
WindGustSpeed     float64
WindDir9am       category
WindDir3pm       category
WindSpeed9am      float64
WindSpeed3pm      float64
Humidity9am       float64
Humidity3pm       float64
Pressure9am       float64
Pressure3pm       float64
Cloud9am          float64
Cloud3pm          float64
Temp9am           float64
Temp3pm           float64
RainToday        category
RainTomorrow     category
dtype: object

In [3]:
data_loader.load_dataset(DatasetEnum.weather)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [32]:

for col_name in heart_failure.columns:
    if heart_failure[col_name].dtypes == "object":
        heart_failure[col_name] = heart_failure[col_name].astype("category")

In [48]:
datasets = {
    "heart": heart_failure,
    "mushrooms": mushrooms,
    "water": water,
}

In [82]:
data_name = "heart"

In [83]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from catboost import CatBoostClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


import warnings 

warnings.filterwarnings('ignore')


summary_list = []

data = datasets[data_name]
cat_features = [col_name for col_name in data.columns if data[col_name].dtypes == "category"]

# To avoid different folds separately on each dataset
fold_generator = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)

folds = fold_generator.split(data, y=data["target"])
for i, (train_fold_idx, test_fold_idx) in enumerate(folds):
    train_fold = data.iloc[train_fold_idx, :]
    test_fold = data.iloc[test_fold_idx, :]

    models = {
        "CatBoost": CatBoostClassifier(silent=True, cat_features=cat_features),
        # "KNN": KNeighborsClassifier(),
        "MostFrequent": DummyClassifier(strategy="most_frequent"),
        # "NaiveBayes": GaussianNB(),
        # "RandomForest": RandomForestClassifier(),
        "XGBoost": XGBClassifier(enable_categorical=True),
    }

    for sel_model_repr, sel_model in models.items():
        pipeline = ModelPipeline(base_model=sel_model, metrics=[roc_auc_score])
        
        pipeline.fit(train_fold)
        predictions = pipeline.predict(test_fold)

        metrics = pipeline.calculate_metrics(test_fold, predictions)

        result_dict = {
            "fold": i,
            "model": sel_model_repr,
        }
        result_dict.update(metrics)
        summary_list.append(result_dict)

summary = pd.DataFrame.from_records(summary_list)
summary

Unnamed: 0,fold,model,<function roc_auc_score at 0x7f1b7f5af1c0>
0,0,CatBoost,0.962099
1,0,MostFrequent,0.5
2,0,XGBoost,0.948948
3,1,CatBoost,0.932449
4,1,MostFrequent,0.5
5,1,XGBoost,0.914395
6,2,CatBoost,0.920493
7,2,MostFrequent,0.5
8,2,XGBoost,0.915232
9,3,CatBoost,0.923328


In [5]:
import pathlib
import pandas as pd


import pathlib
from abc import ABC
from abc import abstractmethod
from enum import Enum, auto
from typing import Dict
from typing import List

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator

class Dataset(Enum):
    """Enum for datasets."""

    mushrooms = auto()
    water = auto()
    heart = auto()

    @classmethod
    def _missing_(cls, value):
        raise NotImplementedError(
            f"{value} is not a valid {cls.__name__}. Only {', '.join([repr(m.value) for m in cls])} types are allowed"
        )
    
Dataset.mushrooms.

AttributeError: 'Dataset' object has no attribute 'isinstance'