# Model creation, fitting and saving

In [1]:
import time
from contextlib import contextmanager

import numpy as np

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label printed in front of the elapsed time.

    Notes
    -----
    On normal exit prints ``"<title> - done in <N>s"``. If the wrapped block
    raises an ``Exception``, prints ``"<title> - failed after <N>s"`` and
    re-raises the exception unchanged, so a failing step still reports how
    long it ran before breaking.
    """
    # perf_counter is monotonic: the measured duration cannot be distorted by
    # system clock adjustments, unlike time.time().
    t0 = time.perf_counter()
    try:
        yield
    except Exception:
        print("{} - failed after {:.0f}s".format(title, time.perf_counter() - t0))
        raise
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - t0))

In [2]:
from imblearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import lightgbm as lgb
from sklearn.metrics import confusion_matrix

def evalerror(_y_valid, _y_pred):
    """Score binary predictions as sensitivity multiplied by overall accuracy.

    Parameters
    ----------
    _y_valid : iterable
        True class labels; only the labels 0 and 1 are counted.
    _y_pred : iterable
        Predicted class labels (0 or 1), same length as ``_y_valid``.

    Returns
    -------
    tuple
        ``("error", value, True)`` where ``value`` is
        ``sensitivity * accuracy``. The three-element layout is the one this
        notebook hands to LightGBM as ``eval_metric``; despite the name
        "error", a higher value is better (third element is ``True``).
    """
    TN, FP, FN, TP = confusion_matrix(
        list(_y_valid), list(_y_pred), labels=[0, 1]).ravel()

    positives = TP + FN
    total = TP + FP + FN + TN

    # Sensitivity, hit rate, recall, or true positive rate.
    # Without any positive sample the ratio is undefined: score it 0 instead
    # of dividing by zero (which produced NaN and a RuntimeWarning).
    sensitivity = TP / positives if positives else 0.0
    # Overall accuracy (0 for empty input, for the same reason)
    accuracy = (TP + TN) / total if total else 0.0

    return "error", sensitivity * accuracy, True


def fit_classifier(_X_train, _y_train, random_state=1):
    """Build and fit the imputation / scaling / SMOTE / LightGBM pipeline.

    Parameters
    ----------
    _X_train : array-like or DataFrame
        Training features.
    _y_train : array-like
        Binary training target.
    random_state : int or None, default 1
        Seed given to both SMOTE and the LightGBM classifier so that two fits
        on the same data give the same model. Pass ``None`` to get the
        previous unseeded behaviour.

    Returns
    -------
    imblearn.pipeline.Pipeline
        The fitted pipeline; its steps are, in order, the median imputer, the
        standard scaler, SMOTE and the LightGBM classifier.
    """
    # Initializing the classifier
    classifier = lgb.LGBMClassifier(
        boosting_type='gbdt', objective='binary', max_depth=18,
        n_jobs=-1, num_leaves=30, learning_rate=0.02, n_estimators=1600,
        max_bin=512, subsample_for_bin=200, subsample=0.8,
        subsample_freq=1, colsample_bytree=0.8,
        reg_alpha=80, reg_lambda=20,
        min_split_gain=0.5, min_child_weight=1,
        min_child_samples=10, scale_pos_weight=11.5, num_class=1,
        random_state=random_state)

    # Create pipeline to predict classification from data.
    # SMOTE draws synthetic samples at random, hence the explicit seed.
    pipeline = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        SMOTE(sampling_strategy=0.2, k_neighbors=10,
              random_state=random_state),
        classifier
    )

    # Fit the pipeline; the "lgbmclassifier__" prefix routes the keyword to
    # the classifier step's fit().
    # NOTE(review): no eval_set is passed, so LightGBM presumably never
    # evaluates this metric during training - confirm whether one is intended.
    pipeline.fit(_X_train, _y_train, lgbmclassifier__eval_metric=evalerror)

    return pipeline

# Feature importance analysis (LightGBM and LIME) for the Streamlit dashboard

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

# number of x most important features to show
top_x = 10


def get_features_importances(_pipeline, _X_train, _X_test):
    """Plot the classifier's most important features and return their names.

    Parameters
    ----------
    _pipeline : fitted pipeline
        Its last step must expose ``feature_importances_``.
    _X_train : pandas.DataFrame
        Training features; only its column names are used.
    _X_test : pandas.DataFrame
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.Index
        Names of the ``top_x`` most important features, ordered from the
        least to the most important of that selection.
    """
    feature_names = _X_train.columns
    # One colour per feature (not per rank), so a given feature always gets
    # the same colour whatever its position in the chart.
    color_list = sns.color_palette("dark", len(feature_names))

    # The classifier is the final step of the pipeline
    feature_importance = _pipeline[-1].feature_importances_
    # argsort is ascending: the last top_x entries are the largest ones
    indices = np.argsort(feature_importance)[-top_x:]

    fig, axs = plt.subplots(1, 1, figsize=(
        24, 8), facecolor='w', edgecolor='k')

    positions = list(range(len(indices)))
    bars = axs.barh(positions, feature_importance[indices], align='center')
    axs.set_title('LGBM', fontweight="normal", fontsize=16)

    # Use the Axes API directly rather than the pyplot "current axes" state
    axs.set_yticks(positions)
    axs.set_yticklabels([feature_names[j] for j in indices],
                        fontweight="normal", fontsize=16)

    # Bar and tick label of a feature share that feature's colour
    for feat_idx, bar, ticklabel in zip(indices, bars, axs.get_yticklabels()):
        bar.set_color(color_list[feat_idx])
        ticklabel.set_color(color_list[feat_idx])

    # Remove the frame once (the original did it on every loop iteration)
    axs.set_frame_on(False)

    return feature_names[indices]

In [4]:
def get_lime_features_importances(_pipeline, _X_train, _X_test):
    """Explain the first test sample with LIME and return its feature weights.

    The original body ignored its three parameters and read names that are
    not defined in this notebook (``Pipeline``, ``X``, ``set_y``, ``idx``,
    ``classifier``). It now works from its arguments only.

    Parameters
    ----------
    _pipeline : fitted pipeline
        Steps 0 and 1 must be the fitted imputer and scaler, and the last
        step the fitted classifier (exposing ``classes_`` and
        ``predict_proba``), as built by ``fit_classifier``.
    _X_train : pandas.DataFrame
        Training features, used as LIME background data.
    _X_test : pandas.DataFrame
        Test features; its first row is the explained sample.

    Returns
    -------
    dict
        Feature name -> LIME weight for the positive class (label 1), for the
        15 features LIME selected.

    Side effects
    ------------
    Writes the explanation as HTML to ``MLflow/lime_explainer.html``.
    """
    # Local import: no cell of this notebook imports LIME.
    from lime.lime_tabular import LimeTabularExplainer

    top_x = 15

    # Reuse the preprocessing fitted together with the classifier, so LIME
    # perturbs data in the same space the classifier was trained on.
    imputer, scaler, classifier = _pipeline[0], _pipeline[1], _pipeline[-1]
    feature_names = list(_X_train.columns)
    X_train_pre = scaler.transform(imputer.transform(_X_train))
    X_sample_pre = scaler.transform(imputer.transform(_X_test.iloc[[0]]))

    # Create lime explainer based on the preprocessed training data
    lime_explainer = LimeTabularExplainer(
        X_train_pre,
        feature_names=feature_names,
        class_names=[str(label) for label in classifier.classes_],
        discretize_continuous=False,
        kernel_width=np.sqrt(len(feature_names)) * 0.75,
        mode="classification")

    # explain first sample from test data
    explanation = lime_explainer.explain_instance(
        X_sample_pre[0], classifier.predict_proba, num_features=top_x)

    # save_to_file is a method of the explanation, not of the explainer
    explanation.save_to_file('MLflow/lime_explainer.html')

    # Get features importance for top_x most important features
    return {feature_names[feat_index]: weight
            for feat_index, weight in explanation.as_map()[1]}

# MAIN

In [5]:
# Pickled dataset read by main(); main() expects the TARGET and SK_ID_CURR columns in it
data_file = 'data/global_train_data.pkl'
# File to which main() dumps the fitted pipeline with joblib
pipeline_file = 'pipeline_home_credit.joblib'

In [16]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split

def main(features=True, debug=False):
    """Train the classifier pipeline, save it and print its hold-out score.

    Parameters
    ----------
    features : bool, default True
        Unused; kept so existing calls keep working.
    debug : bool, default False
        When True, work on a sample of at most 50000 rows to shorten the run.

    Side effects
    ------------
    Reads ``data_file``, writes the fitted pipeline to ``pipeline_file`` and
    prints the ``evalerror`` score obtained on the 25% test split.
    """
    with timer("Load and split data"):
        data = pd.read_pickle(data_file)
        if debug:
            # Cap the sample size so datasets smaller than 50000 rows do not
            # raise, and seed it so debug runs are repeatable.
            data = data.sample(min(len(data), 50000), random_state=1)

        # Get X and y value from data
        X = data.drop(['TARGET', 'SK_ID_CURR'], axis=1)
        y = data['TARGET']

        # Create train, test data (stratified to keep the class ratio)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=0.25, random_state=1)

    with timer("Process and save classifier pipeline"):
        # Process and save pipeline
        pipeline = fit_classifier(X_train, y_train)
        joblib.dump(pipeline, pipeline_file)
        # Get score for predicted value
        print('score:', evalerror(y_test, pipeline.predict(X_test))[1])


if __name__ == "__main__":
    with timer("Full model run"):
        main(debug=True)

Load and split data - done in 4s
score: 0.4301348118811882
Process and save classifier pipeline - done in 35s


 # SHAP

In [5]:
import pandas as pd
import joblib

data_file = 'data/global_train_data.pkl'
pipeline_file = 'pipeline_home_credit.joblib'

with timer("Load and preprocess"):
    # Full dataset
    data = pd.read_pickle(data_file)

    # Features only: target and client identifier are removed
    X = data.drop(columns=['TARGET', 'SK_ID_CURR'])

    # Fitted pipeline saved earlier with joblib
    pipeline = joblib.load(pipeline_file)

    # First two steps of the pipeline: SimpleImputer, then StandardScaler
    imputer, scaler = pipeline[0], pipeline[1]

    # Apply both steps so the data matches what the classifier receives
    impute_data = imputer.transform(X)
    scaled_data = scaler.transform(impute_data)
    scaled_data_df = pd.DataFrame(
        data=scaled_data, index=data['SK_ID_CURR'], columns=X.columns)

    # Fourth step of the pipeline: the LightGBM classifier
    clf = pipeline[3]

Load and preprocess - done in 14s


In [9]:
import shap

# Index label (SK_ID_CURR) of the row to explain
curr_idx = 100002

# Tree explainer built on the classifier, with the scaled data as background
explainer = shap.TreeExplainer(clf, scaled_data_df)
# SHAP values of the selected row only
shap_values = explainer.shap_values(
    scaled_data_df.loc[curr_idx, :],
    check_additivity=False,
)

# One SHAP value per feature (the single column is named 'feature'),
# then the feature names ranked by decreasing absolute SHAP value
shap_values_df = pd.DataFrame(
    shap_values, index=scaled_data_df.columns, columns=['feature'])
shap_sorted_features = (
    shap_values_df.abs()
    .sort_values(by='feature', ascending=False)
    .index.tolist()
)

# Persist the explainer, bz2-compressed at level 9
ex_filename = 'data/explainer.bz2'
joblib.dump(explainer, filename=ex_filename, compress=('bz2', 9))

# Force plot of the SHAP values computed for the selected row
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values,
    scaled_data_df.loc[curr_idx, :],
)

In [None]:
# The waterfall plot shows how the individual SHAP contributions lead from
# the explainer's expected value to the model output for one sample.
# The feature values shown must belong to the row the SHAP values were
# computed for (curr_idx); the previous code passed row 10 instead, pairing
# the contributions with another row's values.
shap.plots._waterfall.waterfall_legacy(explainer.expected_value,
                                       shap_values,
                                       scaled_data_df.loc[curr_idx, :])

In [None]:
# Short table key -> path of the corresponding CSV file
files_dict = dict(
    AT='data/application_train.csv',
    BU='data/bureau.csv',
    BB='data/bureau_balance.csv',
    PA='data/previous_application.csv',
    PCB='data/POS_CASH_balance.csv',
    IP='data/installments_payments.csv',
    CCB='data/credit_card_balance.csv',
)

# Short table key -> DataFrame loaded from that file
dfs_dict = {key: pd.read_csv(path) for key, path in files_dict.items()}

In [None]:
# Column descriptions, looked up below through their 'TableKey', 'Row' and
# 'State' columns
columns_description = pd.read_csv(
    'data/HomeCredit_columns_description_improved.csv',
    delimiter=';', index_col=[0])

# One line per column of `data`: the source table key, the source column
# name and the 'State' found in the description file
features_df = pd.DataFrame(index=data.columns, columns=['TableKey', 'Row', 'State'])

# A source column is attached to a feature when its name is contained in the
# feature name. When several source columns match, the last one met wins
# (same behaviour as before).
for col in data.columns:
    for table_key, table_df in dfs_dict.items():
        for source_col in table_df.columns:
            if source_col in col:
                features_df.loc[col, 'TableKey'] = table_key
                features_df.loc[col, 'Row'] = source_col

# Read the row by label: positional access such as row[1][0] on a
# string-labelled Series is deprecated in pandas 2.x.
for feature, info in features_df.iterrows():
    if pd.isnull(info['TableKey']):
        continue
    matching_states = columns_description.loc[
        (columns_description['TableKey'] == info['TableKey'])
        & (columns_description['Row'] == info['Row']), 'State']
    if len(matching_states):
        features_df.loc[feature, 'State'] = matching_states.iloc[0]

features_df

# Compare dataframe

## Load global df data and data from dashboard and compare them

In [None]:
import pandas as pd
import joblib

_DROPPED_COLUMNS = ['TARGET', 'SK_ID_CURR']

# Global training data without target and identifier, then its row labelled 0
# rebuilt as a one-row DataFrame
global_train_data = pd.read_pickle(data_file).drop(columns=_DROPPED_COLUMNS)
data_from_global = pd.DataFrame(
    [global_train_data.loc[0, :]], columns=global_train_data.columns)

# Row exported by the dashboard, with the same two columns removed
data_from_dashboard = pd.read_csv(
    'data/data_for_classifier.csv').drop(columns=_DROPPED_COLUMNS)

# Write the cell-by-cell differences between both rows to CSV
differences = data_from_global.compare(data_from_dashboard)
differences.to_csv('data/compare_dataframe.csv')

# Create model with MLflow

In [None]:
from mlflow.models.signature import infer_signature
import mlflow.sklearn

# Dataset and fitted pipeline
data = pd.read_pickle(data_file)
pipeline = joblib.load(pipeline_file)

# Features (target and identifier removed) and target
X = data.drop(columns=['TARGET', 'SK_ID_CURR'])
y = data['TARGET']

# Model signature inferred from the inputs and the target, both cast to float64
signature = infer_signature(X.astype('float64'), y.astype('float64'))

# Save the pipeline as an MLflow sklearn model, together with its signature
mlflow.sklearn.save_model(
    sk_model=pipeline,
    path='MLflow/mlflow_model',
    signature=signature,
)

In [None]:
import mlflow.pyfunc
from mlflow.models.signature import infer_signature
import mlflow.sklearn

class Pipeline_custom(mlflow.pyfunc.PythonModel):
    """Wrapper whose ``predict`` returns the wrapped model's ``predict_proba``.

    NOTE(review): ``predict`` takes only ``model_input`` (no ``context``
    argument). Elsewhere in this notebook the wrapper is saved and loaded
    through ``mlflow.sklearn`` and ``predict`` is called directly with one
    argument - confirm the signature before using it through ``mlflow.pyfunc``.
    """
    
    def __init__(self, model):
        """Store ``model``, the object whose ``predict_proba`` is exposed."""
        self.model = model
        
    def predict(self, model_input):
        """Return ``self.model.predict_proba(model_input)``."""
        return self.model.predict_proba(model_input)

# Dataset, split into features (target and identifier removed) and target
data = pd.read_pickle(data_file)
X = data.drop(columns=['TARGET', 'SK_ID_CURR'])
y = data['TARGET']

# Fitted pipeline loaded from the joblib file
# (alternative kept for reference: mlflow.sklearn.load_model('MLflow/mlflow_model/'))
pipeline = joblib.load(pipeline_file)
# Wrap it so that predict() returns class probabilities
pipeline_custom = Pipeline_custom(pipeline)

# Signature inferred from the float64 inputs and the wrapper's output
predicted = pipeline_custom.predict(X)
signature = infer_signature(X.astype('float64'), predicted)

# Save the wrapped pipeline together with its signature
mlflow.sklearn.save_model(
    pipeline_custom, 'MLflow/mlflow_model_custom', signature=signature)

In [None]:
import mlflow

# Row exported by the dashboard, without target and identifier
data_from_dashboard = pd.read_csv(
    'data/data_for_classifier.csv').drop(columns=['TARGET', 'SK_ID_CURR'])

# Row labelled 0 of the global training data, as a one-row DataFrame
global_train_data = pd.read_pickle(
    'data/global_train_data.pkl').drop(columns=['TARGET', 'SK_ID_CURR'])
data_from_global = pd.DataFrame(
    [global_train_data.loc[0, :]], columns=global_train_data.columns)

# Load the saved custom model and print its prediction for each row
mlflow_model_custom = mlflow.sklearn.load_model('MLflow/mlflow_model_custom/')
dashboard_prediction = mlflow_model_custom.predict(data_from_dashboard)
print(dashboard_prediction)
global_prediction = mlflow_model_custom.predict(data_from_global)
print(global_prediction)

In [None]:
# Command to run in a terminal to launch the REST API server
#mlflow models serve -m mlflow_model/

# Some tests

In [None]:
import pandas as pd
import re
import datetime

data_file = '../data/global_train_data.pkl'

def get_random_row():
    """Return one random feature row from the pickled dataset.

    Loads ``data_file``, keeps 500 random rows, strips every character other
    than letters, digits and underscores from the column names, replaces
    infinite values with NaN, drops the TARGET and SK_ID_CURR columns and
    returns a single randomly drawn row as a one-row DataFrame.
    """
    sampled = (
        pd.read_pickle(data_file)
        .sample(500)
        .rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
        .replace([np.inf, -np.inf], np.nan)
    )
    # Features only
    features_only = sampled.drop(columns=['TARGET', 'SK_ID_CURR'])

    return features_only.sample()

def main():
    """Print DAYS_BIRTH of a random row, negated, then as a calendar date.

    The second line printed is today's date shifted by DAYS_BIRTH days.
    """
    days_birth = get_random_row()['DAYS_BIRTH'].values[0]
    print(-days_birth)
    shifted_date = datetime.date.today() + datetime.timedelta(days=int(days_birth))
    print(shifted_date)


if __name__ == '__main__':
    main()