In [5]:
import sys
# sys.path.append('/Users/bernardoloureiro/template-lib')

from utils.notebookhelpers.helpers import Helpers
from utils.dtos.templateOutputCollection import TemplateOutputCollection
from utils.dtos.variable import Metadata
from utils.dtos.templateOutput import TemplateOutput
from utils.dtos.templateOutput import OutputType
from utils.dtos.templateOutput import ChartType
import datetime
import logging
import pandas as pd
import numpy as np
import os
from dateutil import parser
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
import math
import scipy as scipy
import matplotlib.pyplot as plt
import seaborn as sns
import copy
%matplotlib inline
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier

from collections import Counter
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
# from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Declare the notebook's input dataset parameter through the project Helpers API.
# The current namespace is handed over via locals(); presumably the helper uses it
# to look up or register the parameter -- confirm against Helpers.
inputDatasetParameter=Helpers.get_or_create_input_dataset(
    name="inputDataset6",
    metadata=Metadata(input_name='Model_Input_Data', is_required=True
                      , tooltip='Model Input data for Classification'),
    local_context=locals()
)


# Declare the output dataset parameter in the same way. Its .value is later used
# as the output name when the results table is written.
outputDatasetParameter=Helpers.get_or_create_output_dataset(
 name="Model_result",
    metadata=Metadata(input_name='Model Accuracy Scores', is_required=True, tooltip='Dataset name to be created after the transformation'),
    local_context=locals()
)

In [None]:
# Get (or create) the template execution context for this notebook.
# NOTE(review): 'HealtCare' in the id looks like a typo of 'HealthCare'. The id is a
# runtime string, so confirm nothing else references it before renaming.
contextId = 'HealtCare_Fraud_data_ML_Classification'
context = Helpers.getOrCreateContext(contextId=contextId, localVars=locals())

In [None]:
# Read the resolved values of the two dataset parameters.
# At this point train_iobp_df_final holds the input parameter's value (presumably a
# dataset reference, not the data itself -- it is passed to Helpers.getEntityData next).
train_iobp_df_final=inputDatasetParameter.value



# Value of the output parameter; used as outputName when the results are saved.
outputDataset=outputDatasetParameter.value

In [None]:
# Replace the parameter value with whatever Helpers.getEntityData returns for it
# (presumably a pandas DataFrame -- TODO confirm).
# NOTE(review): the name is rebound in place, so re-running this cell alone passes
# the previous result back into getEntityData instead of the original parameter value.
train_iobp_df_final = Helpers.getEntityData(context, train_iobp_df_final)

In [2]:
def pred_prob(clf, data):
    """
    Description :: Return the second column (index 1) of the classifier's predict_proba output.

    Input :: It accepts below input parameters :
      - clf : Trained model classifier exposing predict_proba
      - data : Dataset for which we want to generate the predictions

    Output :: One value per row of `data`, taken from column 1 of the probability matrix.
    """
    class_probabilities = clf.predict_proba(data)
    # Column 1 is the score used everywhere else in this notebook.
    return class_probabilities[:, 1]

def draw_roc(train_fpr, train_tpr, test_fpr, test_tpr):
    """
    Description :: Plot the train and test ROC curves on one figure, with the AUC of each curve in the legend.

    Input :: It accepts below input parameters :
      - train_fpr : Train False +ve rate
      - train_tpr : Train True +ve rate
      - test_fpr : Test False +ve rate
      - test_tpr : Test True +ve rate

    Output :: None. The figure is rendered with plt.show().
    """
    # calculate auc for train and test
    train_auc = auc(train_fpr, train_tpr)
    test_auc = auc(test_fpr, test_tpr)

    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
    # old names were removed afterwards, so pick whichever name this install knows.
    poster_style = 'seaborn-poster' if 'seaborn-poster' in plt.style.available else 'seaborn-v0_8-poster'

    # The font dicts are notebook-level settings that are not defined next to this
    # function. Use them when present, otherwise fall back to matplotlib's defaults
    # (fontdict=None) instead of failing with a NameError on a fresh kernel.
    label_font = globals().get('label_font_dict')
    title_font = globals().get('title_font_dict')

    with plt.style.context(poster_style):
        plt.plot(train_fpr, train_tpr, label="Train AUC ="+"{:.4f}".format(train_auc), color='blue')
        plt.plot(test_fpr, test_tpr, label="Test AUC ="+"{:.4f}".format(test_auc), color='red')
        plt.legend()
        plt.xlabel("False Positive Rate(FPR)", fontdict=label_font)
        plt.ylabel("True Positive Rate(TPR)", fontdict=label_font)
        plt.title("Area Under Curve", fontdict=title_font)
        # Pass the on/off flag positionally: the keyword was renamed from `b` to
        # `visible` in Matplotlib 3.5, so the positional form works on both sides.
        plt.grid(True, which='major', color='lightgrey', linestyle='--')
        plt.minorticks_on()
        plt.show()
    
def find_best_threshold(threshold, fpr, tpr):
    """
    Description :: Pick the threshold whose ROC point maximises tpr * (1 - fpr).

    Input :: It accepts below input parameters :
      - threshold : Candidate thresholds, aligned element-wise with fpr and tpr
      - fpr : False +ve rate at each threshold
      - tpr : True +ve rate at each threshold

    Output :: The element of `threshold` at the position of the highest score
              (the first such position when several tie).
    """
    score_per_threshold = tpr * (1 - fpr)
    best_position = np.argmax(score_per_threshold)
    return threshold[best_position]

def predict_with_best_t(proba, threshold):
    """
    Description :: Turn probability scores into 0/1 labels using the given cut-off.

    Input :: It accepts below input parameters :
      - proba : Iterable of probability scores
      - threshold : Cut-off value; scores greater than or equal to it become 1

    Output :: A list with one 0/1 label per score, in the same order.
    """
    return [1 if score >= threshold else 0 for score in proba]

def draw_confusion_matrix(best_t, x_train, x_test, y_train, y_test, y_train_pred, y_test_pred):
    """
    Description :: Plot the TRAIN and TEST confusion matrices side by side, using labels obtained by
                   thresholding the predicted probabilities at best_t.

    Input :: It accepts below input parameters :
      - best_t : Cut-off applied to the probabilities (scores >= best_t become 1)
      - x_train, x_test : Not used by this function; kept so existing callers keep working
      - y_train, y_test : Actual labels of the train and test sets
      - y_train_pred, y_test_pred : Predicted probabilities for the train and test sets

    Output :: (train_prediction, test_prediction) -- the thresholded 0/1 label lists.
    """
    # Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and the old
    # name was removed afterwards, so pick whichever name this install knows.
    heatmap_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'

    # The font dicts are notebook-level settings that are not defined next to this
    # function. Use them when present, otherwise fall back to matplotlib's defaults
    # (fontdict=None) instead of failing with a NameError on a fresh kernel.
    title_font = globals().get('title_font_dict')
    label_font = globals().get('label_font_dict')

    fig, ax = plt.subplots(1, 2, figsize=(20, 6))

    # One entry per panel: (title, actual labels, predicted probabilities, colour map).
    panels = (
        ('Train Dataset Confusion Matrix', y_train, y_train_pred, 'viridis'),
        ('Test Dataset Confusion Matrix', y_test, y_test_pred, 'summer'),
    )

    thresholded = []
    for axis, (panel_title, actual, probabilities, colour_map) in zip(ax, panels):
        predicted = predict_with_best_t(probabilities, best_t)
        cm = confusion_matrix(actual, predicted)
        with plt.style.context(heatmap_style):
            sns.heatmap(cm, annot=True, fmt='d', ax=axis, cmap=colour_map)
            axis.set_title(panel_title, fontdict=title_font)
            axis.set_xlabel("Predicted Label", fontdict=label_font)
            axis.set_ylabel("Actual Label", fontdict=label_font)
        thresholded.append(predicted)

    plt.show()

    train_prediction, test_prediction = thresholded
    return train_prediction, test_prediction

In [3]:
def validate_model(clf, x_train, x_test, y_train, y_test):
    """
    Description :: Evaluate a fitted classifier on the train and test sets: print both AUCs, plot the
                   ROC curves and confusion matrices, and compute F1 scores at the chosen threshold.

    Input :: It accepts below input parameters :
      - clf : Trained model classifier
      - x_train, x_test : Feature sets used for scoring
      - y_train, y_test : Actual labels of the train and test sets

    Output :: (test_auc, train_f1_score, test_f1_score, best_t)
    """
    # Probability scores for both sets
    train_scores = pred_prob(clf, x_train)
    test_scores = pred_prob(clf, x_test)

    # ROC points for both sets; only the train thresholds are used below
    fpr_train, tpr_train, train_thresholds = roc_curve(y_train, train_scores)
    fpr_test, tpr_test, _ = roc_curve(y_test, test_scores)

    # Area under each curve, reported as soon as it is known
    train_auc = auc(fpr_train, tpr_train)
    print("### Train AUC = {}".format(train_auc))
    test_auc = auc(fpr_test, tpr_test)
    print("### Test AUC = {}".format(test_auc))

    draw_roc(fpr_train, tpr_train, fpr_test, tpr_test)

    # The threshold is selected on the train ROC and then applied to both sets
    best_t = find_best_threshold(train_thresholds, fpr_train, tpr_train)

    train_labels, test_labels = draw_confusion_matrix(
        best_t, x_train, x_test, y_train, y_test, train_scores, test_scores
    )

    # F1 on the thresholded labels
    train_f1_score = f1_score(y_train, train_labels)
    test_f1_score = f1_score(y_test, test_labels)

    return test_auc, train_f1_score, test_f1_score, best_t

In [None]:
def RF_classifier(train_iobp_df_final, random_state=39):
    """
    Description :: Prepare the train / test data for the Random Forest model. Despite its name, this
                   function does not train a classifier; it only builds the model inputs.

    Steps :
      1. Sum every remaining column per ('Provider', 'PotentialFraud') pair.
      2. Use 'PotentialFraud' as the target and all columns except 'Provider' and 'PotentialFraud' as features.
      3. Split 80/20, stratified on the target.
      4. Fit a RobustScaler on the train features only and apply it to both train and test.
      5. Oversample the scaled train set with ADASYN (sampling_strategy=0.45, n_neighbors=8).
         The test set is scaled but never oversampled.

    Input :: It accepts below input parameters :
      - train_iobp_df_final : DataFrame containing 'Provider', 'PotentialFraud' and the feature columns
      - random_state : Seed used for both the train/test split and ADASYN. The default of 39 keeps
                       the split identical to the previously hard-coded one.

    Output :: (X_train_ovsamp, y_train_ovsamp, X_test_std, y_test)
    """
    provider_level_df = train_iobp_df_final.groupby(['Provider','PotentialFraud'], as_index=False).agg('sum')
    X = provider_level_df.drop(columns=['Provider','PotentialFraud'])
    y = provider_level_df['PotentialFraud']
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20, stratify=y, random_state=random_state)

    # Scale the data (train and test). The scaler is fitted on the train features only
    # so that no test-set statistics leak into training.
    robust_scaler = RobustScaler()
    X_train_std = robust_scaler.fit_transform(X_train)
    X_test_std = robust_scaler.transform(X_test)

    # Performing minority oversampling. Seeded so that a re-run reproduces the same
    # synthetic samples, and therefore the same model and scores.
    oversample = ADASYN(sampling_strategy=0.45, n_neighbors=8, random_state=random_state)
    X_train_ovsamp, y_train_ovsamp = oversample.fit_resample(X_train_std, y_train)

    return X_train_ovsamp, y_train_ovsamp, X_test_std, y_test


# Build the model inputs from the data loaded earlier in the notebook.
# (The loaded frame is train_iobp_df_final; no variable named train_iobp_df exists.)
X_train_ovsamp, y_train_ovsamp, X_test_std, y_test = RF_classifier(train_iobp_df_final)


# counter = Counter(y_train_ovsamp)

# fraud_percentage = (counter[1]*100 / (counter[0]+counter[1]))
# non_fraud_percentage = (counter[0]*100 / (counter[0]+counter[1]))
# print("Fraud Percentage = {:.2f}% and Non-Fraud Percentage = {:.2f}%".format(fraud_percentage, non_fraud_percentage))


# Training the model with all features and hyper-parameterized values.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is no longer
# accepted by recent scikit-learn releases.
rfc = RandomForestClassifier(n_estimators=70,
                             criterion='gini',
                             max_depth=4,
                             max_features='sqrt',
                             min_samples_leaf=30,
                             min_samples_split=30,
                             random_state=47,
                             min_weight_fraction_leaf=0.0,
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.0,
                             ccp_alpha=0.0)

rfc.fit(X_train_ovsamp, y_train_ovsamp)


# Validate model
test_auc, train_f1_score, test_f1_score, best_t = validate_model(rfc, X_train_ovsamp, X_test_std, y_train_ovsamp, y_test)

print("\n")
print("### Best Threshold = {:.4f}".format(best_t))
print("### Model AUC is : {:.4f}".format(test_auc))
print("### Model Train F1 Score is : {:.4f}".format(train_f1_score))
print("### Model Test F1 Score is : {:.4f}".format(test_f1_score))

# One-column summary table written to the output dataset. The values are listed in
# the same order as the row labels so that each metric sits under its own label.
results = pd.DataFrame(
    {'Results': [best_t, test_auc, train_f1_score, test_f1_score]},
    index=['Best Threshold', 'Best model AUC', 'Model Train_F1 Score', 'Model Test_F1 Score'],
)

# feats_imps = pd.DataFrame({'Features': X_train.columns, 'Importance_Model_1': rfc.feature_importances_})
# feats_imps = feats_imps[feats_imps['Importance_Model_1'] != 0]
# feats_imps.reset_index(drop=True, inplace=True)
# feats_imps.head()

# top_20_pos_feats = feats_imps.sort_values(by='Importance_Model_1',axis=0,ascending=False)['Features'].iloc[0:20]
# top_20_pos_feats_scores = feats_imps.sort_values(by='Importance_Model_1',axis=0,ascending=False)['Importance_Model_1'].iloc[0:20]


# with plt.style.context('seaborn-poster'):
#     sns.barplot(y=top_20_pos_feats, x=top_20_pos_feats_scores, orient='h', palette='coolwarm')
#     plt.xlabel("\nFeatures Importance", fontdict=label_font_dict)
#     plt.ylabel("Features\n", fontdict=label_font_dict)
#     plt.title("Top 15 Importance Positive Features\n", fontdict=title_font_dict)


In [None]:
# Write the results table through the project Helpers API:
# create an output collection for the context, wrap `results` as a template output
# dataset named by `outputDataset`, add it to the collection, then save the context.
# NOTE(review): what Helpers.save persists (and where) is not visible here -- confirm.
outputCollection = Helpers.createOutputCollection(context)
out = Helpers.createTemplateOutputDataset(context=context, outputName=outputDataset, dataFrame=results)
outputCollection.addTemplateOutput(out)
Helpers.save(context)