In [27]:
import pandas as pd

import mlflow
from mlflow import sklearn as mlflow_sklearn

import sklearn.datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import catboost
import lightgbm
import xgboost

In [2]:
import numpy as np
import six
from collections import defaultdict
from scipy import sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline, FeatureUnion, _fit_transform_one, _transform_one

import category_encoders as ce

In [3]:
class PandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, X, y, weight,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [4]:
def _name_estimators(estimators):
    """Generate names for estimators."""

    names = [type(estimator).__name__.lower() for estimator in estimators]
    namecount = defaultdict(int)
    for est, name in zip(estimators, names):
        namecount[name] += 1

    for k, v in list(six.iteritems(namecount)):
        if v == 1:
            del namecount[k]

    for i in reversed(range(len(estimators))):
        name = names[i]
        if name in namecount:
            names[i] += "-%d" % namecount[name]
            namecount[name] -= 1

    return list(zip(names, estimators))

In [5]:
def make_pandas_union(*transformers, **kwargs):
    n_jobs = kwargs.pop('n_jobs', None)
    if kwargs:
        raise TypeError('Unknown keyword arguments: "{}"'
                        .format(list(kwargs.keys())[0]))
    return PandasFeatureUnion(_name_estimators(transformers), n_jobs=n_jobs)

In [6]:
class OrdinalEncoderPandas(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        self.transformers = {}
    
    def fit(self, X, y=None):
        for column in self.columns:
            self.transformers[column] = ce.OrdinalEncoder(return_df=False, handle_unknown="impute").fit(X[[column]])
        return self
    
    def transform(self, X, y=None):
        X = X.drop(list(set(X.columns) - set(self.columns)), axis=1)
        for column in self.columns:
            X[column] = self.transformers[column].transform(X[[column]])
            X[column] = X[column].apply(lambda x: x if x else -1)
        return X

In [7]:
class OneHotEncoderPandas(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        self.transformers = {}
        self.feature_names = {}
        self.feature_names_all = []

    def fit(self, X, y=None):
        for column in self.columns:
            self.transformers[column] = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X[[column]])
            features = [f"{column}_{i}" for i in self.transformers[column].get_feature_names()]
            self.feature_names[column] = features
            for feature in features:
                self.feature_names_all.append(feature)
        return self

    def transform(self, X, y=None):
        ohe_df_list = []
        for column in self.columns:
            ohe_df = pd.DataFrame(self.transformers[column].transform(X[[column]]))
            feature_names = [f"{column}_{i}" for i in self.transformers[column].get_feature_names()]
            ohe_df.columns = self.feature_names[column]
            ohe_df_list.append(ohe_df)
        ohe_df_concat = pd.concat(ohe_df_list, axis=1)
        return ohe_df_concat

In [8]:
class DropColumn(TransformerMixin, BaseEstimator):
    def __init__(self, columns, no_drops):
        self.columns = columns
        self.no_drops = no_drops

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for column in self.columns:
            if column in X.columns:
                drop_together = False
                if self.no_drops:
                    for no_drop in self.no_drops:
                        if column == no_drop and self.no_drops[no_drop] not in X.columns:
                            drop_together = True
                if not drop_together:
                    X = X.drop(columns=column)
            else:
                print(f"Drop Warning: Column {column} not in X")
        return X

In [9]:
class ChangeColumnType(TransformerMixin, BaseEstimator):
    def __init__(self, types):
        self.types = types

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for column in self.types.keys():
            if column in X.columns:
                X[column] = X[column].astype(self.types[column])
            else:
                print(f"Change Warning: Column {column} not in X")
        return X

## Notes:
- XGBoost/Random Forest requires OHE of categorical variables unless categorial variable is ordinal
- XGBoost/LightGBM/Catboost supports missing variables, but RandomForests do not (most models in sklearn does not support missing values)

### Synthetic dataset with features 0 to 3 (4 feature in total) categorical type

In [10]:
samples = sklearn.datasets.make_classification(n_samples=100000, scale=10, random_state=0)

In [11]:
feature_names = [f"feature_{i}" for i in range(samples[0].shape[1])]
target = "target"

In [12]:
df_samples = pd.concat([pd.DataFrame(samples[0], columns=feature_names), pd.DataFrame(samples[1], columns=[target])], axis=1)

In [13]:
categorical_features = list(df_samples.columns[0:4])

In [14]:
for i in categorical_features:
    df_samples[i] = abs(df_samples[i]).astype(int).astype(str)

In [15]:
df_samples.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,12,4,10,2,0.645832,-23.556619,14.920084,-4.116929,-9.131104,2.125107,...,2.361506,-0.189242,5.739019,-2.016198,16.778215,9.035596,-7.726635,-3.518649,16.847575,0
1,8,4,5,8,-0.972505,6.832732,-8.335915,8.297369,7.750135,18.976983,...,23.377719,-18.228802,-2.224554,-0.25129,-7.388667,0.141325,-4.956575,1.528967,10.877658,1
2,4,2,1,2,20.891787,-4.344791,2.434333,-15.868354,7.16975,1.856554,...,-8.959649,-0.403657,3.37873,4.083821,-10.070729,1.227475,-14.536854,-4.519916,-3.603694,1
3,20,3,1,16,-4.322641,-7.138765,2.78531,7.352866,-23.127705,3.468762,...,7.242841,14.721356,-5.952227,-4.674254,-2.79513,-4.940003,12.700518,-5.517182,-2.99683,1
4,8,5,1,7,3.140241,11.670064,11.100301,2.774834,0.150274,-11.49054,...,-4.888,-3.621434,4.478216,3.600383,-11.334051,-2.805697,14.555254,-2.763186,3.916138,1


In [16]:
df_samples[target].value_counts()

1    50050
0    49950
Name: target, dtype: int64

### Pipeline for Models that does not support categorical variables

Need to use OHE, only when you know the data is not ordinal, otherwise you can use ordinal encoding

XGBoost, RandomForest (CART can handle categorical, but RF does not have this implemented)

In [17]:
pipe_ohe = make_pipeline(
    make_pandas_union(
        DropColumn(columns=categorical_features, no_drops=None),
        make_pipeline(
            ChangeColumnType(types={i: str for i in categorical_features}),
            OneHotEncoderPandas(columns=categorical_features)
        )
    )
)

In [18]:
pipe_ohe.fit_transform(df_samples).head(5)

Unnamed: 0,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,...,feature_3_x0_40,feature_3_x0_41,feature_3_x0_42,feature_3_x0_43,feature_3_x0_44,feature_3_x0_5,feature_3_x0_6,feature_3_x0_7,feature_3_x0_8,feature_3_x0_9
0,0.645832,-23.556619,14.920084,-4.116929,-9.131104,2.125107,-5.604669,2.361506,-0.189242,5.739019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.972505,6.832732,-8.335915,8.297369,7.750135,18.976983,-0.398073,23.377719,-18.228802,-2.224554,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20.891787,-4.344791,2.434333,-15.868354,7.16975,1.856554,13.228155,-8.959649,-0.403657,3.37873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-4.322641,-7.138765,2.78531,7.352866,-23.127705,3.468762,-10.794777,7.242841,14.721356,-5.952227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.140241,11.670064,11.100301,2.774834,0.150274,-11.49054,6.220968,-4.888,-3.621434,4.478216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [19]:
ohe_columns = list(set(pipe_ohe.fit_transform(df_samples).columns) - set([target]))

### Pipeline for models that support categorical variables

LightGBM, CatBoost can handle categorical variables

In [20]:
pipe_cat = make_pipeline(
    make_pandas_union(
        DropColumn(columns=categorical_features, no_drops=None),
        OrdinalEncoderPandas(columns=categorical_features)
    ),
    ChangeColumnType(types={i: "category" for i in categorical_features}),
)

In [21]:
pipe_cat.fit_transform(df_samples).head(5)

Unnamed: 0,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,...,feature_15,feature_16,feature_17,feature_18,feature_19,target,feature_0,feature_1,feature_2,feature_3
0,0.645832,-23.556619,14.920084,-4.116929,-9.131104,2.125107,-5.604669,2.361506,-0.189242,5.739019,...,16.778215,9.035596,-7.726635,-3.518649,16.847575,0,1,1,1,1
1,-0.972505,6.832732,-8.335915,8.297369,7.750135,18.976983,-0.398073,23.377719,-18.228802,-2.224554,...,-7.388667,0.141325,-4.956575,1.528967,10.877658,1,2,1,2,2
2,20.891787,-4.344791,2.434333,-15.868354,7.16975,1.856554,13.228155,-8.959649,-0.403657,3.37873,...,-10.070729,1.227475,-14.536854,-4.519916,-3.603694,1,3,2,3,1
3,-4.322641,-7.138765,2.78531,7.352866,-23.127705,3.468762,-10.794777,7.242841,14.721356,-5.952227,...,-2.79513,-4.940003,12.700518,-5.517182,-2.99683,1,4,3,3,3
4,3.140241,11.670064,11.100301,2.774834,0.150274,-11.49054,6.220968,-4.888,-3.621434,4.478216,...,-11.334051,-2.805697,14.555254,-2.763186,3.916138,1,2,4,3,4


In [22]:
cat_columns = list(pipe_cat.fit_transform(df_samples).select_dtypes(include="category").columns)

## Training/Testing/Validation

In [23]:
df_samples_ohe = pipe_ohe.fit_transform(df_samples)
train_ohe, test_ohe = train_test_split(df_samples_ohe, test_size=0.2, stratify=df_samples_ohe[target], random_state=0)
train_ohe, valid_ohe = train_test_split(train_ohe, test_size=0.2, stratify=train_ohe[target], random_state=0)

In [24]:
df_samples_cat = pipe_cat.fit_transform(df_samples)
train_cat, test_cat = train_test_split(df_samples_cat, test_size=0.2, stratify=df_samples_cat[target], random_state=0)
train_cat, valid_cat = train_test_split(train_cat, test_size=0.2, stratify=train_cat[target], random_state=0)

### Without ML-Flow

#### Random Forest

In [None]:
rf_classifier = RandomForestClassifier(
    criterion='entropy',
    max_features=None,
    n_estimators=20,
    max_depth=4,
    random_state=0,
    n_jobs=4)

In [None]:
rf_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], rf_classifier.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], rf_classifier.predict(test_ohe[ohe_columns])))

#### XGBoost

In [None]:
xgb_classifier = xgboost.XGBClassifier(
    max_depth=4,
    learning_rate=0.008,
    n_estimators=200
)

In [None]:
xgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], xgb_classifier.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], xgb_classifier.predict(test_ohe[ohe_columns])))

#### LightGBM

In [None]:
lgb_classifier = lightgbm.LGBMClassifier(
    objective="binary",
    categorical_features="auto",
    max_depth=4,
    learning_rate=0.01,
    n_estimators=200
)

In [None]:
lgb_classifier.fit(X=train_cat, y=train_cat[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_cat[target], lgb_classifier.predict(valid_cat)))
# Use testing set to evaluate final performance
print(accuracy_score(test_cat[target], lgb_classifier.predict(test_cat)))

#### CatBoost

In [None]:
cat_classifier = catboost.CatBoostClassifier(
    cat_features=cat_columns,
    max_depth=4,
    learning_rate=0.01,
    n_estimators=200,
    verbose=0
)

In [None]:
cat_classifier.fit(X=train_cat, y=train_cat[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_cat[target], cat_classifier.predict(valid_cat)))
# Use testing set to evaluate final performance
print(accuracy_score(test_cat[target], cat_classifier.predict(test_cat)))

### With ML-Flow

In [None]:
mlflow.set_experiment("Training/Testing/Validation")

#### Random Forest

In [None]:
with mlflow.start_run(run_name="Random Forest"):
    criterion = "entropy"
    max_features = None
    n_estimators = 20
    max_depth = 4
    
    rf_classifier = RandomForestClassifier(
        criterion=criterion,
        max_features=max_features,
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=0,
        n_jobs=4)
    rf_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
    
    valid_accuracy = accuracy_score(valid_ohe[target], rf_classifier.predict(valid_ohe[ohe_columns]))
    test_accuracy = accuracy_score(test_ohe[target], rf_classifier.predict(test_ohe[ohe_columns]))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(rf_classifier, "model")

#### XGBoost

In [None]:
with mlflow.start_run(run_name="XGBoost"):
    n_estimators = 200
    max_depth = 4
    learning_rate = 0.008
    
    xgb_classifier = xgboost.XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators
    )
    xgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
    
    valid_accuracy = accuracy_score(valid_ohe[target], xgb_classifier.predict(valid_ohe[ohe_columns]))
    test_accuracy = accuracy_score(test_ohe[target], xgb_classifier.predict(test_ohe[ohe_columns]))

    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)

    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(xgb_classifier, "model")

#### LightGBM

In [None]:
with mlflow.start_run(run_name="LightGBM"):
    objective = "binary"
    categorical_features = "auto"
    n_estimators = 200
    max_depth = 4
    learning_rate = 0.01
    
    lgb_classifier = lightgbm.LGBMClassifier(
        objective=objective,
        categorical_features=categorical_features,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators
    )
    lgb_classifier.fit(X=train_cat, y=train_cat[target])
    
    valid_accuracy = accuracy_score(valid_cat[target], lgb_classifier.predict(valid_cat))
    test_accuracy = accuracy_score(test_cat[target], lgb_classifier.predict(test_cat))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(lgb_classifier, "model")

#### CatBoost

In [None]:
with mlflow.start_run(run_name="CatBoost"):
    n_estimators = 200
    max_depth = 4
    learning_rate = 0.01
    
    cat_classifier = catboost.CatBoostClassifier(
        cat_features=cat_columns,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        verbose=0
    )
    cat_classifier.fit(X=train_cat, y=train_cat[target])
    
    valid_accuracy = accuracy_score(valid_cat[target], cat_classifier.predict(valid_cat))
    test_accuracy = accuracy_score(test_cat[target], cat_classifier.predict(test_cat))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(cat_classifier, "model")

## Training/Testing/Validation with Early Stopping

### Without ML-Flow

#### Random Forest

RF does not support early stopping

#### XGBoost

In [78]:
xgb_classifier = xgboost.XGBClassifier(
    max_depth=4,
    learning_rate=0.008,
    n_estimators=2000
)

In [79]:
xgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target], early_stopping_rounds=10, eval_metric="logloss", eval_set=[(valid_ohe[ohe_columns], valid_ohe[target])], verbose=True)
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], xgb_classifier.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], xgb_classifier.predict(test_ohe[ohe_columns])))

[0]	validation_0-logloss:0.687582
Will train until validation_0-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.682104
[2]	validation_0-logloss:0.676713
[3]	validation_0-logloss:0.671406
[4]	validation_0-logloss:0.666181
[5]	validation_0-logloss:0.661038
[6]	validation_0-logloss:0.655962
[7]	validation_0-logloss:0.650966
[8]	validation_0-logloss:0.646037
[9]	validation_0-logloss:0.641182
[10]	validation_0-logloss:0.636417
[11]	validation_0-logloss:0.631705
[12]	validation_0-logloss:0.627065
[13]	validation_0-logloss:0.622495
[14]	validation_0-logloss:0.617988
[15]	validation_0-logloss:0.61355
[16]	validation_0-logloss:0.609173
[17]	validation_0-logloss:0.604853
[18]	validation_0-logloss:0.600586
[19]	validation_0-logloss:0.596399
[20]	validation_0-logloss:0.592272
[21]	validation_0-logloss:0.588187
[22]	validation_0-logloss:0.584173
[23]	validation_0-logloss:0.580194
[24]	validation_0-logloss:0.576286
[25]	validation_0-logloss:0.572433
[26]	validation_0-logloss:0.56861

In [80]:
xgb_classifier.best_iteration

1999

#### LightGBM

In [81]:
lgb_classifier = lightgbm.LGBMClassifier(
    objective="binary",
    categorical_features="auto",
    max_depth=4,
    learning_rate=0.01,
    n_estimators=2000
)

In [82]:
lgb_classifier.fit(X=train_cat, y=train_cat[target], early_stopping_rounds=10, eval_metric="logloss", eval_set=[(valid_cat, valid_cat[target])], verbose=True)
# Use validation set to modify hyperparameters
print(accuracy_score(valid_cat[target], lgb_classifier.predict(valid_cat)))
# Use testing set to evaluate final performance
print(accuracy_score(test_cat[target], lgb_classifier.predict(test_cat)))

[1]	valid_0's binary_logloss: 0.683197	valid_0's binary_logloss: 0.683197
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.673444	valid_0's binary_logloss: 0.673444
[3]	valid_0's binary_logloss: 0.663882	valid_0's binary_logloss: 0.663882
[4]	valid_0's binary_logloss: 0.654506	valid_0's binary_logloss: 0.654506
[5]	valid_0's binary_logloss: 0.64531	valid_0's binary_logloss: 0.64531
[6]	valid_0's binary_logloss: 0.63629	valid_0's binary_logloss: 0.63629
[7]	valid_0's binary_logloss: 0.62744	valid_0's binary_logloss: 0.62744
[8]	valid_0's binary_logloss: 0.618755	valid_0's binary_logloss: 0.618755
[9]	valid_0's binary_logloss: 0.610232	valid_0's binary_logloss: 0.610232
[10]	valid_0's binary_logloss: 0.601865	valid_0's binary_logloss: 0.601865
[11]	valid_0's binary_logloss: 0.593651	valid_0's binary_logloss: 0.593651
[12]	valid_0's binary_logloss: 0.585586	valid_0's binary_logloss: 0.585586
[13]	valid_0's binary_logloss: 0.577665	valid_0's bin

In [83]:
lgb_classifier.best_iteration_

1681

#### CatBoost

In [84]:
cat_classifier = catboost.CatBoostClassifier(
    cat_features=cat_columns,
    max_depth=4,
    learning_rate=0.01,
    n_estimators=2000,
    verbose=0
)

In [85]:
cat_classifier.fit(X=train_cat, y=train_cat[target], early_stopping_rounds=10, eval_set=[(valid_cat, valid_cat[target])], verbose=True)
# Use validation set to modify hyperparameters
print(accuracy_score(valid_cat[target], cat_classifier.predict(valid_cat)))
# Use testing set to evaluate final performance
print(accuracy_score(test_cat[target], cat_classifier.predict(test_cat)))

0:	learn: 0.6485530	test: 0.6485395	best: 0.6485395 (0)	total: 131ms	remaining: 4m 22s
1:	learn: 0.6052926	test: 0.6052993	best: 0.6052993 (1)	total: 195ms	remaining: 3m 14s
2:	learn: 0.5652932	test: 0.5652609	best: 0.5652609 (2)	total: 253ms	remaining: 2m 48s
3:	learn: 0.5270464	test: 0.5270381	best: 0.5270381 (3)	total: 313ms	remaining: 2m 36s
4:	learn: 0.4885589	test: 0.4885510	best: 0.4885510 (4)	total: 359ms	remaining: 2m 23s
5:	learn: 0.4556445	test: 0.4556285	best: 0.4556285 (5)	total: 421ms	remaining: 2m 19s
6:	learn: 0.4243398	test: 0.4243039	best: 0.4243039 (6)	total: 479ms	remaining: 2m 16s
7:	learn: 0.3952628	test: 0.3952247	best: 0.3952247 (7)	total: 534ms	remaining: 2m 12s
8:	learn: 0.3699319	test: 0.3699032	best: 0.3699032 (8)	total: 589ms	remaining: 2m 10s
9:	learn: 0.3445195	test: 0.3444843	best: 0.3444843 (9)	total: 646ms	remaining: 2m 8s
10:	learn: 0.3213396	test: 0.3213186	best: 0.3213186 (10)	total: 704ms	remaining: 2m 7s
11:	learn: 0.2991933	test: 0.2991621	best: 

In [86]:
cat_classifier.get_best_iteration()

1999

### With ML-Flow

In [None]:
mlflow.set_experiment("Training/Testing/Validation with Early Stopping")

#### XGBoost

In [None]:
with mlflow.start_run(run_name="XGBoost"):
    n_estimators = 2000
    max_depth = 4
    learning_rate = 0.008
    
    xgb_classifier = xgboost.XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators
    )
    xgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target], early_stopping_rounds=10, eval_metric="logloss", eval_set=[(valid_ohe[ohe_columns], valid_ohe[target])], verbose=True)
    
    valid_accuracy = accuracy_score(valid_ohe[target], xgb_classifier.predict(valid_ohe[ohe_columns]))
    test_accuracy = accuracy_score(test_ohe[target], xgb_classifier.predict(test_ohe[ohe_columns]))

    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("best_iteration", xgb_classifier.best_iteration)

    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(xgb_classifier, "model")

#### LightGBM

In [None]:
with mlflow.start_run(run_name="LightGBM"):
    objective = "binary"
    categorical_features = "auto"
    n_estimators = 2000
    max_depth = 4
    learning_rate = 0.01
    
    lgb_classifier = lightgbm.LGBMClassifier(
        objective=objective,
        categorical_features=categorical_features,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators
    )
    lgb_classifier.fit(X=train_cat, y=train_cat[target], early_stopping_rounds=10, eval_metric="logloss", eval_set=[(valid_cat, valid_cat[target])], verbose=True)
    
    valid_accuracy = accuracy_score(valid_cat[target], lgb_classifier.predict(valid_cat))
    test_accuracy = accuracy_score(test_cat[target], lgb_classifier.predict(test_cat))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("best_iteration", lgb_classifier.best_iteration_)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(lgb_classifier, "model")

#### CatBoost

In [None]:
with mlflow.start_run(run_name="CatBoost"):
    n_estimators = 2000
    max_depth = 4
    learning_rate = 0.01
    
    cat_classifier = catboost.CatBoostClassifier(
        cat_features=cat_columns,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        verbose=0
    )
    cat_classifier.fit(X=train_cat, y=train_cat[target], early_stopping_rounds=10, eval_set=[(valid_cat, valid_cat[target])], verbose=True)
    
    valid_accuracy = accuracy_score(valid_cat[target], cat_classifier.predict(valid_cat))
    test_accuracy = accuracy_score(test_cat[target], cat_classifier.predict(test_cat))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("best_iteration", lgb_classifier.best_iteration_)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(cat_classifier, "model")

## Training/Testing with CV

In [25]:
df_samples_ohe = pipe_ohe.fit_transform(df_samples)
train_ohe, test_ohe = train_test_split(df_samples_ohe, test_size=0.2, stratify=df_samples_ohe[target], random_state=0)

In [26]:
df_samples_cat = pipe_cat.fit_transform(df_samples)
train_cat, test_cat = train_test_split(df_samples_cat, test_size=0.2, stratify=df_samples_cat[target], random_state=0)

### Without ML-Flow

#### Random Forest

In [88]:
rf_random_search = RandomizedSearchCV(
    RandomForestClassifier(
        criterion='entropy',
        max_features=None,
        random_state=0
    ),
    param_distributions={
        "max_depth": [i+1 for i in range(5)],
        "n_estimators": [i+5 for i in range(200)]
    },
    n_iter=10,
    random_state=0,
    n_jobs=-1,
    cv=5,
    verbose=5
)
rf_random_search.fit(X=train_ohe[ohe_columns], y=train_ohe[target])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  50 | elapsed:   51.4s remaining:  9.8min
[Parallel(n_jobs=-1)]: Done  15 out of  50 | elapsed:  1.2min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done  26 out of  50 | elapsed:  1.9min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  37 out of  50 | elapsed:  2.5min remaining:   51.7s
[Parallel(n_jobs=-1)]: Done  48 out of  50 | elapsed:  3.0min remaining:    7.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.1min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'max_depth': [1, 2, 3, 4, 5], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204]},
          pre_dispatch='2*n_job

In [90]:
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], rf_random_search.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], rf_random_search.predict(test_ohe[ohe_columns])))

0.9079375
0.9023


In [91]:
rf_grid_search = GridSearchCV(
    RandomForestClassifier(
        criterion='entropy',
        max_features=None,
        random_state=0
    ),
    param_grid={
        "max_depth": [i+1 for i in range(2)],
        "n_estimators": [i+200 for i in range(2)]
    },
    n_jobs=-1,
    cv=5,
    verbose=5
)
rf_grid_search.fit(X=train_ohe[ohe_columns],y=train_ohe[target])

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  20 | elapsed:   45.0s remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  1.4min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:  1.4min remaining:   20.7s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [1, 2], 'n_estimators': [200, 201]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [92]:
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], rf_grid_search.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], rf_grid_search.predict(test_ohe[ohe_columns])))

0.8815625
0.8792


#### XGBoost

In [94]:
xgb_random_search = RandomizedSearchCV(
    xgboost.XGBClassifier(
        nthread=1
    ),
    param_distributions={
        "learning_rate": [0.01, 0.008, 0.005, 0.001],
        "max_depth": [i+1 for i in range(5)],
        "n_estimators": [i+5 for i in range(200)]
    },
    n_iter=10,
    random_state=0,
    n_jobs=10,
    cv=5,
    verbose=5
)
xgb_random_search.fit(X=train_ohe[ohe_columns],y=train_ohe[target])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  42 out of  50 | elapsed:  1.9min remaining:   22.0s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:  2.9min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=10, n_jobs=10,
          param_distributions={'learning_rate': [0.01, 0.008, 0.005, 0.001], 'max_depth': [1, 2, 3, 4, 5], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,...185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204]},
          pre_dispatch='2*n_jobs', random_state=0, refit=Tru

In [95]:
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], xgb_random_search.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], xgb_random_search.predict(test_ohe[ohe_columns])))

0.9105625
0.90535


In [None]:
xgb_grid_search = GridSearchCV(
    xgboost.XGBClassifier(
        learning_rate=0.008,
        nthread=1
    ),
    param_grid={
        "max_depth": [i+1 for i in range(2)],
        "n_estimators": [i+200 for i in range(2)]
    },
    n_jobs=10,
    cv=5,
    verbose=5
)
xgb_grid_search.fit(X=train_ohe[ohe_columns],y=train_ohe[target])

In [None]:
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], xgb_grid_search.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], xgb_grid_search.predict(test_ohe[ohe_columns])))

#### LightGBM

In [None]:
lgb_classifier = lightgbm.LGBMClassifier(
    objective="binary",
    categorical_features="auto",
    max_depth=4,
    learning_rate=0.01,
    n_estimators=200
)

In [None]:
lgb_classifier.fit(X=train_cat, y=train_cat[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_cat[target], lgb_classifier.predict(valid_cat)))
# Use testing set to evaluate final performance
print(accuracy_score(test_cat[target], lgb_classifier.predict(test_cat)))

#### CatBoost

In [None]:
cat_classifier = catboost.CatBoostClassifier(
    cat_features=cat_columns,
    max_depth=4,
    learning_rate=0.01,
    n_estimators=200,
    verbose=0
)

In [None]:
cat_classifier.fit(X=train_cat, y=train_cat[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_cat[target], cat_classifier.predict(valid_cat)))
# Use testing set to evaluate final performance
print(accuracy_score(test_cat[target], cat_classifier.predict(test_cat)))

### With ML-Flow

In [69]:
mlflow.set_experiment("Training/Testing with CV")

INFO: 'Training/Testing with CV' does not exist. Creating a new experiment


#### Random Forest

In [77]:
with mlflow.start_run(run_name="Random Forest"):
    criterion = "entropy"
    max_features = None
    max_depth = [i+1 for i in range(5)]
    n_estimators = [i+5 for i in range(200)]
    
    rf_random_search = RandomizedSearchCV(
        RandomForestClassifier(
            criterion=criterion,
            max_features=max_features,
            random_state=0
        ),
        param_distributions={
            "max_depth": max_depth,
            "n_estimators": n_estimators
        },
        n_iter=10,
        random_state=0,
        n_jobs=-1,
        cv=5,
        verbose=5
    )
    rf_random_search.fit(X=train_ohe[ohe_columns],y=train_ohe[target])

    test_accuracy = accuracy_score(test_ohe[target], rf_random_search.predict(test_ohe[ohe_columns]))
    
    mlflow.log_param("search_max_depth", max_depth)
    mlflow.log_param("search_n_estimators", n_estimators)
    
    for param_key, param_value in rf_random_search.best_params_.items():
        mlflow.log_param(param_key, param_value)
      
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("search_cv_score", rf_random_search.best_score_)
    
    mlflow_sklearn.log_model(rf_random_search, "model")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  50 | elapsed:   57.6s remaining: 11.0min
[Parallel(n_jobs=-1)]: Done  15 out of  50 | elapsed:  1.5min remaining:  3.5min
[Parallel(n_jobs=-1)]: Done  26 out of  50 | elapsed:  2.0min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  37 out of  50 | elapsed:  2.3min remaining:   49.3s
[Parallel(n_jobs=-1)]: Done  48 out of  50 | elapsed:  3.3min remaining:    8.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.3min finished


In [None]:
!mlflow ui --host 0.0.0.0

## Training/Testing with CV with Early Stopping

### Without ML-Flow

#### Random Forest

RF does not support early stopping

### With ML-Flow

## Nested CV

### Without ML-Flow

### With ML-Flow

## Nested CV with Early Stopping

### Without ML-Flow

### With ML-Flow