In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV

from src.paths import PREPROCESSED_DATA_DIR, MODEL_DIR

import pickle

In [80]:
# read training data
data_train = pd.read_pickle(PREPROCESSED_DATA_DIR / 'data_train.pkl')
display(
    data_train.groupby('Year')['WnvPresent'].agg(['count','mean'])
)
labels = data_train.pop('WnvPresent')

Year
2007    0.099951
2009    0.015228
2011    0.042925
2013    0.148417
Name: WnvPresent, dtype: float64

In [73]:
from sklearn.model_selection._split import _UnsupportedGroupCVMixin, _BaseKFold


class MyKFold(_UnsupportedGroupCVMixin, _BaseKFold):
    """K-Fold cross-validator.

    Provides train/test indices to split data in train/test sets. Split
    dataset into k consecutive folds (without shuffling by default).

    Each fold is then used once as a validation while the k - 1 remaining
    folds form the training set.

    Read more in the :ref:`User Guide <k_fold>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

        .. versionchanged:: 0.22
            ``n_splits`` default value changed from 3 to 5.

    shuffle : bool, default=False
        Whether to shuffle the data before splitting into batches.
        Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
    """

    def __init__(self, split_labels=None, n_splits=4, *, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
        self.split_labels = split_labels

    def _iter_test_indices(self, X, y=None, groups=None):
        years = self.split_labels
        for year in years.unique():
            yield years[years==year].index


In [78]:
data_train.Year.value_counts()

Year
2007    2061
2013    1516
2011    1258
2009    1182
Name: count, dtype: int64

In [74]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate

cv = MyKFold(split_labels=data_train['Year'])
clf = XGBClassifier()
res = cross_validate(clf, data_train.drop('Year', axis=1), labels, scoring='roc_auc', cv=cv)
res


{'fit_time': array([0.45503283, 0.11404991, 0.146209  , 0.15220261]),
 'score_time': array([0.02052641, 0.01153541, 0.01579809, 0.01486421]),
 'test_score': array([0.63716275, 0.54734632, 0.67381106, 0.67201136])}

In [97]:
import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate

from src.config import UNDERSAMPLE


def objective(trial):
    model = trial.suggest_categorical('model', ['lgbm', 'xgb'])
    if model == 'lgbm':
        params = {
            'max_depth': trial.suggest_int('max_depth', 2, 10),
            "min_child_weight": trial.suggest_categorical(
                'min_child_weight', [1, 3, 5]
            ),
            "subsample": trial.suggest_float('subsample', 0.5, 1.0),
            "learning_rate": trial.suggest_float(
                'learning_rate', 1e-4, 10, log=True
            ),
            "reg_lambda": trial.suggest_float('reg_lambda', 0.01, 10),
            "scale_pos_weight": 1 if UNDERSAMPLE else 9
        }
        clf = LGBMClassifier(**params, verbose=-1)
    else:
        params = {
            'max_depth': trial.suggest_int('max_depth', 2, 10),
            "min_child_weight": trial.suggest_categorical(
                'min_child_weight', [1, 3, 5]
            ),
            "subsample": trial.suggest_float('subsample', 0.5, 1.0),
            "eta": trial.suggest_float('eta', 1e-4, 10, log=True),
            "lambda": trial.suggest_float('lambda', 0.01, 10),
            "scale_pos_weight": 1 if UNDERSAMPLE else 9
        }
        clf = XGBClassifier(**params)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    res = cross_validate(clf, data_train, labels, scoring='roc_auc', cv=cv)
    return res['test_score'].mean()


def train_best(best_params, data_train, labels):
    model = best_params.pop('model', None)

    if model == 'lgbm':
        clf = LGBMClassifier(**best_params)
    else:
        clf = XGBClassifier(**best_params)

    clf.fit(data_train, labels)

    return clf


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)

[I 2024-09-09 13:33:40,483] A new study created in memory with name: no-name-604c1fa0-e53f-4d5d-909d-117c9669d3bf


[I 2024-09-09 13:33:42,468] Trial 0 finished with value: 0.7835031165523814 and parameters: {'model': 'lgbm', 'max_depth': 6, 'min_child_weight': 3, 'subsample': 0.9051046782239474, 'learning_rate': 0.0003636395575142002, 'reg_lambda': 3.1101715747100034}. Best is trial 0 with value: 0.7835031165523814.
[I 2024-09-09 13:33:44,165] Trial 1 finished with value: 0.791265675185625 and parameters: {'model': 'lgbm', 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.6514220999104288, 'learning_rate': 0.001149003959133305, 'reg_lambda': 3.707680968860433}. Best is trial 1 with value: 0.791265675185625.
[I 2024-09-09 13:33:47,757] Trial 2 finished with value: 0.7487894939015172 and parameters: {'model': 'xgb', 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.8466829403851388, 'eta': 0.7700033913654947, 'lambda': 4.177313256857171}. Best is trial 1 with value: 0.791265675185625.
[I 2024-09-09 13:33:50,384] Trial 3 finished with value: 0.7875980418460669 and parameters: {'model': 'xgb',

In [92]:
study.best_params

{'model': 'lgbm',
 'max_depth': 5,
 'min_child_weight': 3,
 'subsample': 0.7806100204107996,
 'learning_rate': 0.016513356755168105,
 'reg_lambda': 4.438555879997668}

In [93]:
study.best_trial

FrozenTrial(number=27, state=TrialState.COMPLETE, values=[0.8082257504743751], datetime_start=datetime.datetime(2024, 9, 9, 13, 24, 1, 190017), datetime_complete=datetime.datetime(2024, 9, 9, 13, 24, 2, 685139), params={'model': 'lgbm', 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.7806100204107996, 'learning_rate': 0.016513356755168105, 'reg_lambda': 4.438555879997668}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'model': CategoricalDistribution(choices=('lgbm', 'xgb')), 'max_depth': IntDistribution(high=10, log=False, low=2, step=1), 'min_child_weight': CategoricalDistribution(choices=(1, 3, 5)), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'learning_rate': FloatDistribution(high=10.0, log=True, low=0.0001, step=None), 'reg_lambda': FloatDistribution(high=10.0, log=False, low=0.01, step=None)}, trial_id=27, value=None)

In [94]:
study.best_value

0.8082257504743751

In [95]:
best_model = train_best(study.best_params, data_train, labels)

with open(MODEL_DIR / 'best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
