In [1]:
import pandas as pd
import numpy as np
import pathlib
from loguru import logger

import optuna
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from src.dataset import DatasetLoader
from src.dataset import DatasetEnum
from src.pipeline import Pipeline
from src.tuning import objective

import warnings

warnings.filterwarnings("ignore", message="is_categorical_dtype is deprecated")
warnings.filterwarnings("ignore", message="is_sparse is deprecated")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_loader = DatasetLoader(pathlib.Path("./datasets"))

# Load the weather dataset, then split it into the feature frame (`data`) and a
# single-column label frame (`target`). Date, Location and RainToday are dropped
# from the features together with the label column itself.
data = data_loader.load_dataset(DatasetEnum.weather)
data, target = data.drop(columns=["target", "Date", "Location", "RainToday"]), data[["target"]]

# Impute missing values column by column.
# NOTE(review): the fill statistics are computed on the full frame, i.e. before the
# K-fold split used for evaluation later in the notebook, so held-out rows contribute
# to the imputed values. Confirm this is acceptable for the experiment.
for col_name in data.columns:
    column = data[col_name]

    if column.dtypes == "float":
        observed = column[~column.isna()]
        # Floats whose observed values are all whole numbers are really integer
        # features stored as float only because of NaN. The emptiness check keeps an
        # all-NaN column out of this branch, where the int32 cast would fail on NaN.
        if not observed.empty and np.all(observed % 1.0 == 0.0):
            data[col_name] = column.fillna(np.round(np.mean(column))).astype("int32")
        else:
            data[col_name] = column.fillna(np.mean(column)).astype("float64")
    elif column.dtypes == "category":
        # Fill BEFORE casting to str: casting first turns a missing value into the
        # literal string "nan", which leaves fillna("unk") with nothing to fill.
        # Going through object dtype also avoids having to register "unk" as a
        # category up front.
        data[col_name] = column.astype("object").fillna("unk").astype("str").astype("category")

data.head(3)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
36974,8.4,25.3,0.0,5.4,10.6,SW,31,W,WSW,7,19,76,30,1020.5,1018.6,1,1,14.4,24.3
141221,17.5,30.7,0.0,5.0,10.4,ESE,30,SSE,NNW,15,20,68,39,1013.7,1010.4,6,6,22.3,29.6
51663,6.2,17.5,0.0,5.463885,7.652609,WSW,57,WNW,WSW,28,22,68,35,1014.3,1015.4,4,4,15.7,14.7


In [3]:
# Baseline constructor kwargs for every model that gets tuned. The best
# hyperparameters found by each study are merged into these dicts in place.
model_params = {
    "CatBoost": {"silent": True},
    "XGBoost": {"enable_categorical": True},
    "KNN": {},
    "RandomForest": {},
    "LogRegression": {"max_iter": 10000},
    "DecisionTree": {},
}

# Number of Optuna trials run per model.
N_TRIALS = 30
# Seed for the TPE sampler so that a rerun proposes the same sequence of
# configurations (given a deterministic objective). Same value as STATE, which the
# notebook uses for the K-fold split.
TUNING_SEED = 101

for model_name, params in model_params.items():
    logger.info(f"Model {model_name}")

    # A fresh in-memory study per model; the objective's value is maximized.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=TUNING_SEED),
        study_name="Choose models",
    )
    logger.info("Start hyperparams optimization")
    study.optimize(
        lambda trial: objective(trial, data, target, model_name),
        # A trial that raises is recorded as failed and the search moves on.
        catch=(Exception,),
        gc_after_trial=True,
        n_trials=N_TRIALS,
    )
    # Tuned values override or extend the baseline kwargs of this model.
    params.update(study.best_params)

[32m2023-11-14 03:47:30.623[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mModel CatBoost[0m
[I 2023-11-14 03:47:30,626] A new study created in memory with name: Choose models
[32m2023-11-14 03:47:30.652[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mStart hyperparams optimization[0m
[32m2023-11-14 03:47:30.655[0m | [1mINFO    [0m | [36msrc.tuning.objective[0m:[36mobjective[0m:[36m25[0m - [1mSelected config: {'iterations': 250, 'depth': 1, 'learning_rate': 0.015949551377530822, 'l2_leaf_reg': 0.5065215368982904, 'colsample_bylevel': 0.18293613102607156, 'bootstrap_type': 'MVS'}[0m
[I 2023-11-14 03:47:45,122] Trial 0 finished with value: 0.8348838454108573 and parameters: {'iterations': 250, 'depth': 1, 'learning_rate': 0.015949551377530822, 'l2_leaf_reg': 0.5065215368982904, 'colsample_bylevel': 0.18293613102607156, 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.8348838454108573.
[32m2023-11-14 03

In [None]:
# Evaluation setup: scoring function, number of CV folds and the shared seed.
METRIC = roc_auc_score
NUM_FOLDS = 5
STATE = 101

# Shuffled K-fold splitter; seeded so every model is scored on the same folds.
fold_generator = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=STATE)

# Candidate models keyed by display name, built from the kwargs in `model_params`.
# The scikit-learn estimators that accept `random_state` also receive STATE so their
# scores are repeatable between runs. The seed is listed first in the merged dict,
# so a `random_state` already present in the model's kwargs takes precedence.
# CatBoost and XGBoost are constructed exactly as before.
models_dct = {
    "CatBoost": CatBoostClassifier(**model_params["CatBoost"]),
    "XGBoost": XGBClassifier(**model_params["XGBoost"]),
    "MostFrequent": DummyClassifier(strategy="most_frequent"),
    "KNN": KNeighborsClassifier(**model_params["KNN"]),
    "NaiveBayes": GaussianNB(),
    "RandomForest": RandomForestClassifier(
        **{"random_state": STATE, **model_params["RandomForest"]}
    ),
    "LogRegression": LogisticRegression(
        **{"random_state": STATE, **model_params["LogRegression"]}
    ),
    "DecisionTree": DecisionTreeClassifier(
        **{"random_state": STATE, **model_params["DecisionTree"]}
    ),
}

In [None]:
# Cross-validated comparison: fit and score every entry of models_dct on each fold,
# collecting one record per (model, fold) pair.
summary_list = []

for model_name, model in models_dct.items():
    logger.info(f"Model: {model_name}")
    # One pipeline object per model; it is re-fit inside the fold loop.
    pipeline = Pipeline(base_model=model)

    for fold_idx, (train_rows, test_rows) in enumerate(fold_generator.split(data, y=target)):
        logger.info(f"Fold: {fold_idx + 1} / {NUM_FOLDS}")

        pipeline.fit(data.iloc[train_rows], target.iloc[train_rows])

        # Score the held-out rows with column 1 of predict_proba
        # (presumably the positive-class probability; verify against Pipeline).
        held_out_target = target.iloc[test_rows]
        held_out_scores = pipeline.predict_proba(data.iloc[test_rows])[:, 1]

        summary_list.append(
            {
                "fold": fold_idx,
                "model": model_name,
                "metric_value": METRIC(held_out_target, held_out_scores),
            }
        )

# Long-format results table; the metric column is relabelled for display.
summary = pd.DataFrame.from_records(summary_list)
summary.columns = ["fold", "model", "ROC AUC"]

summary

[32m2023-11-14 03:24:07.962[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mModel: XGBoost[0m
[32m2023-11-14 03:24:07.965[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 1 / 5[0m
[32m2023-11-14 03:24:08.065[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 2 / 5[0m
[32m2023-11-14 03:24:08.150[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 3 / 5[0m
[32m2023-11-14 03:24:08.232[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 4 / 5[0m
[32m2023-11-14 03:24:08.314[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mFold: 5 / 5[0m


Unnamed: 0,fold,model,ROC AUC
0,0,XGBoost,0.925738
1,1,XGBoost,0.905707
2,2,XGBoost,0.915424
3,3,XGBoost,0.908283
4,4,XGBoost,0.906406
