In [5]:
import pandas as pd

import mlflow
from mlflow import sklearn as mlflow_sklearn

import sklearn.datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
import catboost
import lightgbm as lgb
import xgboost

In [7]:
import numpy as np
import six
from collections import defaultdict
from scipy import sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline, FeatureUnion, _fit_transform_one, _transform_one

import category_encoders as ce

## Import Data
### Data Set Exploration

In [51]:
df = pd.read_csv('heart.csv')
print(df.head(3))
print('')
# Target bias
print('Data bias')
target_sum = df['target'].count()
target_count = df['target'].value_counts()
percent_pos = round(target_count[0] / target_sum *100,1)
percent_neg = 100 - percent_pos
print('Total number targets is {}, with {}% positve and {}% negitive'.
      format(target_sum,str(percent_pos), str(percent_neg) ))

# Gender bias
gender_sum = df['sex'].count()
gender_count = df['sex'].value_counts()
percent_male = round(gender_count[1] / gender_sum *100,1)
percent_female = round(100 - percent_male,1)
print('Of the {} participents, {}% are Male and {}% are Female'.
      format(gender_sum,str(percent_male), str(percent_female) ))

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  

Data bias
Total number targets is 303, with 45.5% positve and 54.5% negitive
Of the 303 participents, 68.3% are Male and 31.7% are Female


In [52]:
age_data = df['age']


In [5]:
class PandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, X, y, weight,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [6]:
def _name_estimators(estimators):
    """Generate names for estimators."""

    names = [type(estimator).__name__.lower() for estimator in estimators]
    namecount = defaultdict(int)
    for est, name in zip(estimators, names):
        namecount[name] += 1

    for k, v in list(six.iteritems(namecount)):
        if v == 1:
            del namecount[k]

    for i in reversed(range(len(estimators))):
        name = names[i]
        if name in namecount:
            names[i] += "-%d" % namecount[name]
            namecount[name] -= 1

    return list(zip(names, estimators))

In [7]:
def make_pandas_union(*transformers, **kwargs):
    n_jobs = kwargs.pop('n_jobs', None)
    if kwargs:
        raise TypeError('Unknown keyword arguments: "{}"'
                        .format(list(kwargs.keys())[0]))
    return PandasFeatureUnion(_name_estimators(transformers), n_jobs=n_jobs)

In [8]:
class OrdinalEncoderPandas(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        self.transformers = {}
    
    def fit(self, X, y=None):
        for column in self.columns:
            self.transformers[column] = ce.OrdinalEncoder(return_df=False, handle_unknown="impute").fit(X[[column]])
        return self
    
    def transform(self, X, y=None):
        X = X.drop(list(set(X.columns) - set(self.columns)), axis=1)
        for column in self.columns:
            X[column] = self.transformers[column].transform(X[[column]])
            X[column] = X[column].apply(lambda x: x if x else -1)
        return X

In [9]:
class OneHotEncoderPandas(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        self.transformers = {}
        self.feature_names = {}
        self.feature_names_all = []

    def fit(self, X, y=None):
        for column in self.columns:
            self.transformers[column] = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X[[column]])
            features = [f"{column}_{i}" for i in self.transformers[column].get_feature_names()]
            self.feature_names[column] = features
            for feature in features:
                self.feature_names_all.append(feature)
        return self

    def transform(self, X, y=None):
        ohe_df_list = []
        for column in self.columns:
            ohe_df = pd.DataFrame(self.transformers[column].transform(X[[column]]))
            feature_names = [f"{column}_{i}" for i in self.transformers[column].get_feature_names()]
            ohe_df.columns = self.feature_names[column]
            ohe_df_list.append(ohe_df)
        ohe_df_concat = pd.concat(ohe_df_list, axis=1)
        return ohe_df_concat

In [10]:
class DropColumn(TransformerMixin, BaseEstimator):
    def __init__(self, columns, no_drops):
        self.columns = columns
        self.no_drops = no_drops

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for column in self.columns:
            if column in X.columns:
                drop_together = False
                if self.no_drops:
                    for no_drop in self.no_drops:
                        if column == no_drop and self.no_drops[no_drop] not in X.columns:
                            drop_together = True
                if not drop_together:
                    X = X.drop(columns=column)
            else:
                print(f"Drop Warning: Column {column} not in X")
        return X

In [11]:
class ChangeColumnType(TransformerMixin, BaseEstimator):
    def __init__(self, types):
        self.types = types

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for column in self.types.keys():
            if column in X.columns:
                X[column] = X[column].astype(self.types[column])
            else:
                print(f"Change Warning: Column {column} not in X")
        return X

## Notes:
- XGBoost/Random Forest requires OHE of categorical variables unless categorial variable is ordinal
- XGBoost/LightGBM/Catboost supports missing variables, but RandomForests do not (most models in sklearn does not support missing values)

### Synthetic dataset with features 0 to 3 (4 feature in total) categorical type

In [12]:
samples = sklearn.datasets.make_classification(n_samples=100000, scale=10, random_state=0)

In [13]:
feature_names = [f"feature_{i}" for i in range(samples[0].shape[1])]
target = "target"

In [14]:
df_samples = pd.concat([pd.DataFrame(samples[0], columns=feature_names), pd.DataFrame(samples[1], columns=[target])], axis=1)

In [15]:
categorical_features = list(df_samples.columns[0:4])

In [16]:
for i in categorical_features:
    df_samples[i] = abs(df_samples[i]).astype(int).astype(str)

In [17]:
df_samples.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,12,4,10,2,0.645832,-23.556619,14.920084,-4.116929,-9.131104,2.125107,...,2.361506,-0.189242,5.739019,-2.016198,16.778215,9.035596,-7.726635,-3.518649,16.847575,0
1,8,4,5,8,-0.972505,6.832732,-8.335915,8.297369,7.750135,18.976983,...,23.377719,-18.228802,-2.224554,-0.25129,-7.388667,0.141325,-4.956575,1.528967,10.877658,1
2,4,2,1,2,20.891787,-4.344791,2.434333,-15.868354,7.16975,1.856554,...,-8.959649,-0.403657,3.37873,4.083821,-10.070729,1.227475,-14.536854,-4.519916,-3.603694,1
3,20,3,1,16,-4.322641,-7.138765,2.78531,7.352866,-23.127705,3.468762,...,7.242841,14.721356,-5.952227,-4.674254,-2.79513,-4.940003,12.700518,-5.517182,-2.99683,1
4,8,5,1,7,3.140241,11.670064,11.100301,2.774834,0.150274,-11.49054,...,-4.888,-3.621434,4.478216,3.600383,-11.334051,-2.805697,14.555254,-2.763186,3.916138,1


In [18]:
df_samples[target].value_counts()

1    50050
0    49950
Name: target, dtype: int64

### Pipeline for Models that does not support categorical variables

Need to use OHE, only when you know the data is not ordinal, otherwise you can use ordinal encoding

XGBoost, RandomForest (CART can handle categorical, but RF does not have this implemented)

In [19]:
pipe_ohe = make_pipeline(
    make_pandas_union(
        DropColumn(columns=categorical_features, no_drops=None),
        make_pipeline(
            ChangeColumnType(types={i: str for i in categorical_features}),
            OneHotEncoderPandas(columns=categorical_features)
        )
    )
)

In [20]:
pipe_ohe.fit_transform(df_samples).head(5)

Unnamed: 0,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,...,feature_3_x0_40,feature_3_x0_41,feature_3_x0_42,feature_3_x0_43,feature_3_x0_44,feature_3_x0_5,feature_3_x0_6,feature_3_x0_7,feature_3_x0_8,feature_3_x0_9
0,0.645832,-23.556619,14.920084,-4.116929,-9.131104,2.125107,-5.604669,2.361506,-0.189242,5.739019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.972505,6.832732,-8.335915,8.297369,7.750135,18.976983,-0.398073,23.377719,-18.228802,-2.224554,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20.891787,-4.344791,2.434333,-15.868354,7.16975,1.856554,13.228155,-8.959649,-0.403657,3.37873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-4.322641,-7.138765,2.78531,7.352866,-23.127705,3.468762,-10.794777,7.242841,14.721356,-5.952227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.140241,11.670064,11.100301,2.774834,0.150274,-11.49054,6.220968,-4.888,-3.621434,4.478216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [22]:
ohe_columns = list(set(pipe_ohe.fit_transform(df_samples).columns) - set([target]))

### Pipeline for models that support categorical variables

LightGBM, CatBoost can handle categorical variables

In [24]:
pipe_cat = make_pipeline(
    make_pandas_union(
        DropColumn(columns=categorical_features, no_drops=None),
        OrdinalEncoderPandas(columns=categorical_features)
    ),
    ChangeColumnType(types={i: "category" for i in categorical_features}),
)

In [25]:
pipe_cat.fit_transform(df_samples).head(5)

Unnamed: 0,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,...,feature_15,feature_16,feature_17,feature_18,feature_19,target,feature_0,feature_1,feature_2,feature_3
0,0.645832,-23.556619,14.920084,-4.116929,-9.131104,2.125107,-5.604669,2.361506,-0.189242,5.739019,...,16.778215,9.035596,-7.726635,-3.518649,16.847575,0,1,1,1,1
1,-0.972505,6.832732,-8.335915,8.297369,7.750135,18.976983,-0.398073,23.377719,-18.228802,-2.224554,...,-7.388667,0.141325,-4.956575,1.528967,10.877658,1,2,1,2,2
2,20.891787,-4.344791,2.434333,-15.868354,7.16975,1.856554,13.228155,-8.959649,-0.403657,3.37873,...,-10.070729,1.227475,-14.536854,-4.519916,-3.603694,1,3,2,3,1
3,-4.322641,-7.138765,2.78531,7.352866,-23.127705,3.468762,-10.794777,7.242841,14.721356,-5.952227,...,-2.79513,-4.940003,12.700518,-5.517182,-2.99683,1,4,3,3,3
4,3.140241,11.670064,11.100301,2.774834,0.150274,-11.49054,6.220968,-4.888,-3.621434,4.478216,...,-11.334051,-2.805697,14.555254,-2.763186,3.916138,1,2,4,3,4


## Training/Testing/Validation

In [26]:
df_samples_ohe = pipe_ohe.fit_transform(df_samples)
train_ohe, test_ohe = train_test_split(df_samples_ohe, test_size=0.2, stratify=df_samples_ohe[target], random_state=0)
train_ohe, valid_ohe = train_test_split(train_ohe, test_size=0.2, stratify=train_ohe[target], random_state=0)

In [30]:
df_samples_cat = pipe_cat.fit_transform(df_samples)
train_cat, test_cat = train_test_split(df_samples_cat, test_size=0.2, stratify=df_samples_cat[target], random_state=0)
train_cat, valid_cat = train_test_split(train_cat, test_size=0.2, stratify=train_cat[target], random_state=0)

### Without ML-Flow

In [31]:
rf_classifier = RandomForestClassifier(
    criterion='entropy',
    max_features=None,
    n_estimators=20,
    max_depth=4,
    random_state=0,
    n_jobs=4)

In [32]:
rf_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], rf_classifier.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], rf_classifier.predict(test_ohe[ohe_columns])))

0.903625
0.89655


In [33]:
xgb_classifier = xgboost.XGBClassifier(
    max_depth=4,
    learning_rate=0.008,
    n_estimators=200
)

In [34]:
xgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], xgb_classifier.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], xgb_classifier.predict(test_ohe[ohe_columns])))

0.908625
0.90415


In [35]:
lgb_classifier = lightgbm.LGBMClassifier(
    objective="binary",
    categorical_features="auto",
    max_depth=4,
    learning_rate=0.01,
    n_estimators=200
)

In [36]:
lgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], lgb_classifier.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], lgb_classifier.predict(test_ohe[ohe_columns])))

0.9096875
0.90585


In [37]:
cat_classifier = catboost.CatBoostClassifier(
    max_depth=4,
    learning_rate=0.01,
    n_estimators=200,
    verbose=0
)

In [38]:
cat_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
# Use validation set to modify hyperparameters
print(accuracy_score(valid_ohe[target], cat_classifier.predict(valid_ohe[ohe_columns])))
# Use testing set to evaluate final performance
print(accuracy_score(test_ohe[target], cat_classifier.predict(test_ohe[ohe_columns])))

0.902125
0.90025


### With ML-Flow

In [39]:
mlflow.set_experiment("Training/Testing/Validation")

In [40]:
with mlflow.start_run(run_name="Random Forest"):
    criterion = "entropy"
    max_features = None
    n_estimators = 20
    max_depth = 4
    
    rf_classifier = RandomForestClassifier(
        criterion=criterion,
        max_features=max_features,
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=0,
        n_jobs=4)
    rf_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
    
    valid_accuracy = accuracy_score(valid_ohe[target], rf_classifier.predict(valid_ohe[ohe_columns]))
    test_accuracy = accuracy_score(test_ohe[target], rf_classifier.predict(test_ohe[ohe_columns]))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(rf_classifier, "model")

In [41]:
with mlflow.start_run(run_name="XGBoost"):
    n_estimators = 200
    max_depth = 4
    learning_rate = 0.008
    
    xgb_classifier = xgboost.XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators
    )
    xgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
    
    valid_accuracy = accuracy_score(valid_ohe[target], xgb_classifier.predict(valid_ohe[ohe_columns]))
    test_accuracy = accuracy_score(test_ohe[target], xgb_classifier.predict(test_ohe[ohe_columns]))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(xgb_classifier, "model")

In [42]:
with mlflow.start_run(run_name="LightGBM"):
    objective = "binary"
    categorical_features = "auto"
    n_estimators = 200
    max_depth = 4
    learning_rate = 0.01
    
    lgb_classifier = lightgbm.LGBMClassifier(
        objective=objective,
        categorical_features=categorical_features,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators
    )
    lgb_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
    
    valid_accuracy = accuracy_score(valid_ohe[target], lgb_classifier.predict(valid_ohe[ohe_columns]))
    test_accuracy = accuracy_score(test_ohe[target], lgb_classifier.predict(test_ohe[ohe_columns]))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(lgb_classifier, "model")

In [43]:
with mlflow.start_run(run_name="CatBoost"):
    n_estimators = 200
    max_depth = 4
    learning_rate = 0.01
    
    cat_classifier = catboost.CatBoostClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        verbose=0
    )
    cat_classifier.fit(X=train_ohe[ohe_columns], y=train_ohe[target])
    
    valid_accuracy = accuracy_score(valid_ohe[target], cat_classifier.predict(valid_ohe[ohe_columns]))
    test_accuracy = accuracy_score(test_ohe[target], cat_classifier.predict(test_ohe[ohe_columns]))
    
    mlflow.log_param("criterion", criterion)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("test_accuracy", test_accuracy)

    mlflow_sklearn.log_model(cat_classifier, "model")

In [None]:
!mlflow ui --host 0.0.0.0

[2019-03-28 20:06:01 -0600] [4664] [INFO] Starting gunicorn 19.9.0
[2019-03-28 20:06:01 -0600] [4664] [INFO] Listening at: http://0.0.0.0:5000 (4664)
[2019-03-28 20:06:01 -0600] [4664] [INFO] Using worker: sync
[2019-03-28 20:06:01 -0600] [4667] [INFO] Booting worker with pid: 4667


## Training/Testing/Validation with Early Stopping

### Without ML-Flow

### With ML-Flow

## Training/Testing with CV

### Without ML-Flow

### With ML-Flow

## Training/Testing with CV with Early Stopping

### Without ML-Flow

### With ML-Flow

## Nested CV

### Without ML-Flow

### With ML-Flow

## Nested CV with Early Stopping

### Without ML-Flow

### With ML-Flow