In [1]:
import numpy as np
import pandas as pd

from numpy import mean, std, cov, sqrt, log, sum
from numpy.random import seed

from datetime import date, timedelta, datetime

from scipy.stats import pearsonr,spearmanr, boxcox
from scipy import stats

from random import sample

from apyori import apriori
from mlxtend.frequent_patterns import apriori

from matplotlib import rcParams
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
import seaborn as sns
from matplotlib_venn import venn2
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D

import matplotlib.style as style

from pmdarima.arima.utils import ndiffs
from statsmodels.tsa.arima_model import ARIMA
import pmdarima as pm

from statsmodels.tsa.stattools import acf, pacf
import statsmodels.api as sm

from statsmodels.formula.api import ols


import warnings
warnings.filterwarnings('ignore')

import scripts as src

from IPython.display import HTML, display

# style.use('fivethirtyeight')

# Widen pandas' display limits so wide engineered frames render in full.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
pd.options.display.width = 1000

# Small default figure size; individual plots override it where needed.
rcParams.update({'figure.figsize': (6, 3)})

# Seed reused by the train/test split and by the models below.
rand_state = 1000

In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import catboost as ctb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, auc

import tensorflow.keras
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

## Import Data

In [None]:
## import data
raw_beneficiary = pd.read_csv('data/Train_Beneficiarydata-1542865627584.csv')
raw_inpatient = pd.read_csv('data/Train_Inpatientdata-1542865627584.csv')
raw_outpatient = pd.read_csv('data/Train_Outpatientdata-1542865627584.csv')
raw_train = pd.read_csv('data/Train-1542865627584.csv')

## merge data
# Attach the beneficiary attributes to every claim (inner join on BeneID).
inpatient_beneficiary = raw_inpatient.merge(raw_beneficiary, on='BeneID')
outpatient_beneficiary = raw_outpatient.merge(raw_beneficiary, on='BeneID')

# Record which claim file each row came from before stacking them.
outpatient_beneficiary['Category'] = 'Outpatient'
inpatient_beneficiary['Category'] = 'Inpatient'

# Stack outpatient on top of inpatient, then add the per-provider fraud label.
raw = pd.concat([outpatient_beneficiary, inpatient_beneficiary])
raw = raw.merge(raw_train, on='Provider', how='left')

In [None]:
# Separate the label from the features.
y = raw['PotentialFraud']
X = raw.drop(['PotentialFraud'], axis=1)  # returns a new frame; `raw` itself is untouched

# NOTE(review): the split is by claim row while the label is merged in per
# Provider, so claims of one provider can land in both sets - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_state)
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)
# The second label used to read "train data size" although it reports df_test.
print('train data size:', df_train.shape,
      '\ntest data size:', df_test.shape,
      '\ntotal:', len(df_train) + len(df_test))

## Feature Engineering

In [None]:
def preprocessing(df_raw):
    """Engineer model features from merged claim/beneficiary rows.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        One row per claim with the claim dates, diagnosis/procedure codes,
        chronic-condition flags, payment amounts, physician ids, ``Provider``,
        ``Category``, ``State``/``Race``/``County``/``Gender`` and
        ``PotentialFraud`` columns used below.

    Returns
    -------
    pandas.DataFrame
        The engineered frame: per-claim indicators, per-provider aggregates,
        weekly time-series terms and per-provider category counts, with
        identifiers, dates and raw codes dropped and remaining NaN set to 0.

    Notes
    -----
    The first steps (date parsing, ``IsDead``, ``AdmissionDt``, ``Age``,
    ``Age_group``) assign into the frame that was passed in, so the caller's
    object gains/changes those columns.
    """
    def iso_week(dates):
        # Series.dt.week is gone in pandas 2.x; use isocalendar() where it
        # exists and cast back to the dtype .dt.week used to return.
        if hasattr(dates.dt, 'isocalendar'):
            week = dates.dt.isocalendar().week
            return week.astype('float64') if week.isna().any() else week.astype('int64')
        return dates.dt.week

    ## Ages
    date_cols = ['ClaimStartDt', 'ClaimEndDt', 'DischargeDt', 'AdmissionDt', 'DOB', 'DOD']
    df_raw[date_cols] = df_raw[date_cols].apply(pd.to_datetime, format='%Y-%m-%d')

    df_raw['IsDead']=np.where(df_raw['DOD'].isna(), 'No', 'Yes')

    # Claims without an admission date fall back to the claim start date.
    df_raw['AdmissionDt']=df_raw['AdmissionDt'].fillna(df_raw['ClaimStartDt'])

    # Age in (365-day) years at admission, rounded to one decimal.
    df_raw['Age']=round((pd.to_datetime(df_raw['AdmissionDt'])-pd.to_datetime(df_raw['DOB'])).dt.total_seconds() / (24 * 60 * 60 * 365),1)

    # The [90, 100) bin was previously mislabelled '95-100'.
    df_raw['Age_group'] = pd.cut(df_raw['Age'],
                         bins=[0,25, 35, 45,55,65,75,85,90,100,120],
                         labels=['<25','25-35','35-45','45-55','55-65','65-75','75-85','85-90','90-100','>100'],
                         right=False)

    df=df_raw[['Provider','Age']].groupby(['Provider'])['Age'].mean().reset_index().rename(columns={'Age':'AverageAge'})
    df_raw=pd.merge(df_raw, df, on='Provider', how='left')

    ## RenalDiseaseIndicator
    # Anything other than the string '0' counts as "has renal disease".
    df_raw['RenalDiseaseIndicator']=np.where((df_raw['RenalDiseaseIndicator'])=='0', 0, 1)

    ## Gender
    df_raw = df_raw.replace({'Gender': 2}, 'Female')
    df_raw = df_raw.replace({'Gender': 1}, 'Male')

    ## Chronic Features
    # Recode 2 -> 0 so every chronic flag is a 0/1 indicator.
    df_raw = df_raw.replace({'ChronicCond_Alzheimer': 2, 'ChronicCond_Heartfailure': 2, 'ChronicCond_KidneyDisease': 2,
                       'ChronicCond_Cancer': 2, 'ChronicCond_ObstrPulmonary': 2, 'ChronicCond_Depression': 2,
                       'ChronicCond_Diabetes': 2, 'ChronicCond_IschemicHeart': 2, 'ChronicCond_Osteoporasis': 2,
                       'ChronicCond_rheumatoidarthritis': 2, 'ChronicCond_stroke': 2 }, 0)

    chronic_list=['ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
                  'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
                  'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
                  'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke','RenalDiseaseIndicator']

    # add mean of chronic condition numbers in claims by providers
    df_raw['Chronic_number']=df_raw[chronic_list].sum(axis=1)

    df=df_raw[chronic_list+['Provider','Chronic_number']].groupby(['Provider']).mean().reset_index()

    # Replace the per-claim flags by their per-provider means.
    df_raw.drop(chronic_list+['Chronic_number'], axis=1, inplace=True)
    df_raw=pd.merge(df_raw, df, on='Provider', how='left')

    ## Treatment Duration
    df_raw['ClaimStartWeek']=iso_week(df_raw['ClaimStartDt'])
    df_raw['ClaimStartYear']=df_raw['ClaimStartDt'].dt.year

    df_raw['DischargeDt']=df_raw['DischargeDt'].fillna(df_raw['AdmissionDt'])

    # Whole days between admission and discharge (.dt.days works on every
    # pandas version, unlike astype('timedelta64[D]')).
    df_raw['TreatmentDuration']=(df_raw['DischargeDt']-df_raw['AdmissionDt']).dt.days

    ## Diagnosis and Procedure codes
    # fill in missing values
    df_raw.dropna(how='all', inplace=True)

    codelist=['ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
           'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5',
           'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
           'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10', 'ClmProcedureCode_1',
           'ClmProcedureCode_2', 'ClmProcedureCode_3', 'ClmProcedureCode_4',
           'ClmProcedureCode_5', 'ClmProcedureCode_6','ClmAdmitDiagnosisCode','DiagnosisGroupCode']

    amountlist=['DeductibleAmtPaid']

    # Missing codes become the sentinel string '0'; missing amounts become 0.
    for col in codelist:
        df_raw[col]=df_raw[col].fillna('0')

    for col in amountlist:
        df_raw[col]=df_raw[col].fillna(0)

    def duplicate_claims(df, index, category, newcolname, threshold=0):
        """Add `newcolname` = number of distinct `category` values per `index`.

        When `threshold` is non-zero, also add 'IsTop<index>By<category>'
        ('Yes'/'No') for rows whose count is at least `threshold` times the
        number of distinct `category` values in the whole frame.
        """
        df_dup=df[[category,index]].drop_duplicates() \
                .groupby([index]).count().reset_index() \
                .rename( columns={category:newcolname}).sort_values(by=newcolname, ascending=False)

        df_dup=pd.merge(df, df_dup, on=index, how='left')
        df_dup[newcolname]=df_dup[newcolname].fillna(0)
        if threshold!=0:
            # Scalar nunique() instead of positional [0] on a labelled Series.
            counts_total=df_dup[category].nunique()
            df_dup['IsTop'+index+'By'+category]=np.where((df_dup[newcolname]/counts_total) \
                    >=threshold, 'Yes', 'No')
        return df_dup

    # Duplicate Codes
    # Rows sharing beneficiary, start date, provider and label with another row.
    basic_info=['BeneID', 'ClaimStartDt', 'Provider','PotentialFraud']
    df=df_raw[basic_info]
    duplicated = df.duplicated(keep=False)
    some_duplicates = df[duplicated].sort_values(by=df.columns.to_list())

    some_duplicates=pd.merge(some_duplicates.drop_duplicates(),
                             df_raw,
                             on=['BeneID', 'ClaimStartDt', 'Provider', 'PotentialFraud'],
                             how='left')


    # Add DuplicateClaimCounts
    some_duplicates=duplicate_claims(some_duplicates, 'Provider', 'ClaimID', 'DuplicateClaimCounts', 0)
    df_raw=pd.merge(df_raw, some_duplicates[['Provider','DuplicateClaimCounts']].drop_duplicates(), on='Provider', how='left')
    df_raw['DuplicateClaimCounts']=df_raw['DuplicateClaimCounts'].fillna(0)

    def IsDuplicateCode(df, index, colcode, newcolname):
        """Return the `index` values having some real `colcode` on more than one claim."""
        df_dup_codes=df.groupby([index, colcode]).count()['ClaimID']
        df_dup_codes=df_dup_codes.sort_values(ascending=False).reset_index()
        # '0' is the missing-code sentinel; do not count it as a repeated code.
        df_dup_codes=df_dup_codes.replace('0',np.nan)
        df_dup_codes.dropna(how='any', inplace=True)
        df_dup_codes.rename( columns={'ClaimID':newcolname}, inplace=True)
        ProviderList=df_dup_codes[df_dup_codes[newcolname]>1][index].unique()
        return ProviderList

    # Add IsDuplicateClmAdmDiagCode
    ProviderList=IsDuplicateCode(some_duplicates, 'Provider', 'ClmAdmitDiagnosisCode', 'DuplicateAdmitDiagCode')
    df_raw['IsDuplicateClmAdmDiagCode']=np.where(df_raw['Provider'].isin(ProviderList),'Yes','No')

    # Add IsDuplicateDiagGrpCode
    ProviderList=IsDuplicateCode(some_duplicates, 'Provider', 'DiagnosisGroupCode', 'DuplicateDiagnosisGroupCode')
    df_raw['IsDuplicateDiagGrpCode']=np.where(df_raw['Provider'].isin(ProviderList),'Yes','No')

    # Most frequently used codes
    def most_codes(df):
        """Rank codes by total occurrences across the columns of `df`.

        Only codes that occur in the first column are ranked, because later
        columns are aligned to the index created by the first assignment.
        """
        df_codes=pd.DataFrame()
        for col in df.columns:
            df_codes[col]=df.groupby([col]).count().iloc[:,1].sort_values(ascending=False)
        if df_codes.index.isin(['nan']).any():
            df_codes.drop(['nan'], inplace=True)
        if df_codes.index.isin(['0']).any():
            df_codes.drop(['0'], inplace=True)
        df_codes=df_codes.sum(axis=1).sort_values(ascending=False)
        return df_codes

    Diagnosiscodelist=['ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
           'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5',
           'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
           'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10']
    Procedurecodelist=[
           'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3', 'ClmProcedureCode_4',
           'ClmProcedureCode_5', 'ClmProcedureCode_6']

    codelist=Diagnosiscodelist+Procedurecodelist

    df_diagnosis=df_raw[Diagnosiscodelist]
    df_procedure=df_raw[Procedurecodelist]
    diagnosiscodelist=most_codes(df_diagnosis).head(30).index.tolist()
    procedurecodelist=most_codes(df_procedure).head(15).index.tolist()

    # Per-claim count of each top diagnosis code.
    df=pd.DataFrame()
    for c in diagnosiscodelist:
        l=df_diagnosis.isin([c]).sum(axis=1)
        df['Diag_'+c]=l
    df_raw=pd.concat([df_raw, df], axis=1)

    # NOTE(review): the raw code columns are normally non-numeric after the
    # '0' fill, so this per-provider mean usually yields no columns and the
    # merge below adds nothing - confirm what was intended here.
    # numeric_only=True spells out the implicit dropping older pandas did.
    df1=df_raw[Diagnosiscodelist+['Provider']].groupby(['Provider']).mean(numeric_only=True).reset_index()

    # Per-claim count of each top procedure code.  These were previously
    # looked up in the diagnosis columns, a copy/paste slip.
    df=pd.DataFrame()
    for c in procedurecodelist:
        l=df_procedure.isin([c]).sum(axis=1)
        df['Proc_'+str(c)]=l
    df_raw=pd.concat([df_raw, df], axis=1)
    df2=df_raw[Procedurecodelist+['Provider']].groupby(['Provider']).mean(numeric_only=True).reset_index()

    df_raw.drop(codelist, axis=1, inplace=True)
    df_raw=pd.merge(df_raw, df1, on='Provider', how='left')
    df_raw=pd.merge(df_raw, df2, on='Provider', how='left')

    ## Summary
    df_raw['AnnualReimbursementAmt']=df_raw['IPAnnualReimbursementAmt'] + df_raw['OPAnnualReimbursementAmt']
    df_raw['AnnualDeductibleAmt']=df_raw['IPAnnualDeductibleAmt'] + df_raw['OPAnnualDeductibleAmt']
    df_raw['TotalPayment']=df_raw['DeductibleAmtPaid']+df_raw['InscClaimAmtReimbursed']

    ## Provider Features
    # Long format: one row per (claim, physician role) with a physician present.
    physicians=pd.melt(df_raw[['BeneID', 'ClaimID', 'Provider', 'Category',
           'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'PotentialFraud']],
            id_vars=['BeneID','ClaimID', 'Provider','PotentialFraud','Category'],
            value_vars=['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician'],
            var_name='Type',
            value_name='Physician').sort_values(by=['ClaimID','Type']).dropna(subset=['Physician'])

    # Add Patient Counts For Providers
    # the providers with patient counts over 900 or 0.006 are mostly suspected of fraud
    # So add IsTopProviderByBeneID (0.005 used to allow for more errors)
    df_raw=duplicate_claims(df_raw, 'Provider', 'BeneID', 'PatientCountsByProviders', 0.005)

    # Add Physician Counts For Providers
    df=duplicate_claims(physicians, 'Provider', 'Physician', 'PhysicianCountsByProviders', 0)
    df=df[['Provider','PhysicianCountsByProviders']].drop_duplicates()
    df_raw=pd.merge(df_raw, df, on='Provider', how='left')

    # Add Claim Counts For Providers
    # the providers with claim counts over 1300 or 0.002 of all claims are mostly suspected of fraud
    # so add IsTopProviderByClaimID (0.001 used to allow for more errors)
    df_raw=duplicate_claims(df_raw, 'Provider', 'ClaimID', 'ClaimCountsByProviders', 0.001)

    ## Time Series Terms
    df_raw['ClaimDayOfWeek']=df_raw['ClaimStartDt'].dt.weekday

    # Monday and Sunday of the week the claim started in.
    df_raw['Week_start']=df_raw['ClaimStartDt'] - pd.to_timedelta(df_raw['ClaimDayOfWeek'], unit='D')
    df_raw['Week_end']= df_raw['Week_start'] + timedelta(days=6)

    # Claims per (year, week, category), attached to every claim of that week.
    week_keys=['ClaimStartYear','ClaimStartWeek','Category']
    weekly_counts=df_raw.groupby(week_keys)['ClaimID'].count() \
            .rename('WeeklyClaimCountsByCategory').reset_index()
    df_raw=pd.merge(df_raw, weekly_counts, on=week_keys, how='left')

    df_raw['LogWeeklyCounts']=np.log(df_raw['WeeklyClaimCountsByCategory'])

    def ts_terms(df, category):
        """Lag, rolling-mean and differencing terms of the weekly log counts of one category."""

        ts_category=df[df['Category']==category]

        ts_features_stat=ts_category[['LogWeeklyCounts','ClaimStartYear','ClaimStartWeek']].drop_duplicates() \
            .sort_values(['ClaimStartYear','ClaimStartWeek'])

        ts_features_stat=ts_features_stat.set_index(['ClaimStartYear','ClaimStartWeek'])

        ## Add AutoRegressive Terms
        ts_features_stat['lag_1']=ts_features_stat['LogWeeklyCounts'].shift(1)
        ts_features_stat['lag_2']=ts_features_stat['LogWeeklyCounts'].shift(2)

        ## Add Moving Average Terms
        ts_features_stat['rolling_mean_1'] = ts_features_stat['LogWeeklyCounts'].rolling(window=1).mean()
        ts_features_stat['rolling_mean_2'] = ts_features_stat['LogWeeklyCounts'].rolling(window=2).mean()

        ## Add Rolling term for 12 weeks
        ts_features_stat['rolling_mean_12'] = ts_features_stat['LogWeeklyCounts'].rolling(window=12).mean()

        ## Add differencing Terms
        ts_features_stat['diff_1'] = ts_features_stat['LogWeeklyCounts'].diff()
        ts_features_stat['diff_2'] = ts_features_stat['LogWeeklyCounts'].diff().diff()

        ts_features_stat['Category']=category
        ts_features_stat=ts_features_stat.reset_index()
        # Fill the warm-up gaps with column means; numeric_only skips the
        # string 'Category' column explicitly.
        ts_features_stat.fillna(ts_features_stat.mean(numeric_only=True), inplace=True)

        return ts_features_stat

    ts_features=df_raw[['ClaimStartDt','Category','LogWeeklyCounts','ClaimStartYear','ClaimStartWeek']]

    # Add ts features for outpatient
    ts_features_outpatient=ts_terms(ts_features, 'Outpatient')
    # Add ts features for inpatient
    ts_features_inpatient=ts_terms(ts_features, 'Inpatient')

    ts_features_allpatients=pd.concat([ts_features_inpatient,ts_features_outpatient])
    ts_features=ts_features[['ClaimStartDt','Category','ClaimStartYear','ClaimStartWeek']]
    ts_features=pd.merge(ts_features, ts_features_allpatients, on=['ClaimStartYear', 'ClaimStartWeek', 'Category'], how='left')
    # The left merge keeps row order, so the terms line up with df_raw by index.
    for ts_col in ['lag_1', 'lag_2',
                   'rolling_mean_1', 'rolling_mean_2', 'rolling_mean_12',
                   'diff_1', 'diff_2']:
        df_raw[ts_col]=ts_features[ts_col]

    ## calculate counts of each state, race, gender, age_group and county of claims for each provider
    def pivot_category(df, index, category):
        """Add one '<category>_<value>' column per value holding the claim count per `index`."""
        index_list=df[index].sort_values().unique().tolist()
        category_list=df[category].unique().tolist()

        df_counts=df[[index,category,'ClaimID']]
        df_counts=df_counts.groupby([index,category]).count().reset_index()

        # Full index x category grid so absent combinations become 0.
        df_out = pd.DataFrame([(i, s) for i in index_list for s in category_list], columns=[index,category])
        df_out=pd.merge(df_out,df_counts,on=[index,category],how='left')
        df_out.fillna(0,inplace=True)
        df_out=df_out.pivot(index=index, columns=category, values='ClaimID'). \
                reset_index().add_prefix(category+'_').rename(columns={category+'_'+index:index})
        df_out=pd.merge(df, df_out, on=index, how='left' )
        return df_out

    df_raw=pivot_category(df_raw,'Provider','State')
    df_raw=pivot_category(df_raw,'Provider','Race')
    df_raw=pivot_category(df_raw,'Provider','Gender')
    df_raw=pivot_category(df_raw,'Provider','Age_group')
    df_raw=pivot_category(df_raw,'Provider','County')
    df_raw=pivot_category(df_raw,'Provider','Category')
    df_raw=pivot_category(df_raw,'Provider','IsDead')

    ## drop useless columns
    droplist1=['ClaimStartDt', 'ClaimEndDt', 'DOB', 'DOD',
     'Race', 'State', 'County', 'Category', 'AdmissionDt', 'IsDead',
     'DischargeDt', 'Age', 'Age_group', 'ClaimStartWeek', 'ClaimStartYear',
     'ClaimDayOfWeek', 'Week_start', 'Week_end', 'WeeklyClaimCountsByCategory']

    droplist2=['BeneID', 'Provider', 'ClaimID',
               'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
               'ClmAdmitDiagnosisCode', 'Gender', 'DiagnosisGroupCode']
    df_raw.drop(droplist1+droplist2,axis=1,inplace=True)

    df_raw.fillna(0, inplace=True)

    return df_raw

In [None]:
train = preprocessing(df_train)
test = preprocessing(df_test)

# make train and test the same columns
# reindex drops test-only columns, adds train-only columns filled with 0 and,
# unlike the previous drop/assign pair, also puts the columns in train's
# order, which the models rely on when they are fed positional arrays.
test = test.reindex(columns=train.columns, fill_value=0)

train.to_csv('data/train_after_processing.csv')
test.to_csv('data/test_after_processing.csv')

In [None]:
# train=pd.read_csv('data/train_after_processing.csv')
# test=pd.read_csv('data/test_after_processing.csv')

In [None]:
## useful functions
def numeric_columns(df):
    """Return the labels of the int, float and datetime64[ns] columns of ``df``."""
    wanted_dtypes = ('int16', 'int32', 'int64',
                     'float16', 'float32', 'float64',
                     'datetime64[ns]')
    return df.select_dtypes(include=list(wanted_dtypes)).columns

def object_columns(df):
    """Return the labels of the ``object``-dtype columns of ``df``."""
    return df.select_dtypes(include=['object']).columns

def cat_features(df, ls):
    """Cast each column named in ``ls`` to ``str`` in place and return ``df``."""
    for column in ls:
        as_text = df[column].astype(str)
        df[column] = as_text
    return df

<h1 style="background-color:powderblue;">Modelling</h1>

In [None]:
# Percentage of training rows per PotentialFraud value (shown on the diagonal).
pd.crosstab(index=train['PotentialFraud'], columns=train['PotentialFraud'], normalize='all') * 100

In [None]:
# Percentage of test rows per PotentialFraud value (shown on the diagonal).
pd.crosstab(index=test['PotentialFraud'], columns=test['PotentialFraud'], normalize='all') * 100

In [None]:
# Columns that still hold strings and therefore need encoding.
list(object_columns(train))

In [None]:
# String-valued columns that must be encoded before modelling.
category_columns=['PotentialFraud',
     'IsDuplicateClmAdmDiagCode',
     'IsDuplicateDiagGrpCode',
     'IsTopProviderByBeneID',
     'IsTopProviderByClaimID']
# Variant 1: one-hot encoding (first level dropped) for the scaled models.
train_dummies = pd.get_dummies(train, columns=category_columns, sparse=True, drop_first=True)
train_yd = train_dummies['PotentialFraud_Yes']
train_Xd = train_dummies.drop('PotentialFraud_Yes', axis=1) # becareful inplace= False

# Scale to unit variance without centering (with_mean=False).
sc = StandardScaler(with_mean=False)
train_Xd_sc = sc.fit_transform(train_Xd)

# Variant 2: ordinal encoding for the tree-based models.
# NOTE: this overwrites the string columns of `train` in place, so the cell
# cannot be re-run without rebuilding `train` first.
enc = OrdinalEncoder()
train[category_columns]= enc.fit_transform(train[category_columns])

train_y_ord = train['PotentialFraud']
train_X_ord = train.drop(['PotentialFraud'], axis=1) # becareful inplace= False

train_X_ord=train_X_ord.fillna(0)
train_Xd=train_Xd.fillna(0)

In [None]:
# Encode the test set with the transformers fitted on the training data
# (`sc` and `enc` from the training cell).  Re-fitting them on the test set,
# as this cell used to do, scales/encodes test rows differently from the rows
# the models were trained on.
test_dummies = pd.get_dummies(test, columns=category_columns, sparse=True, drop_first=True)
test_yd = test_dummies['PotentialFraud_Yes']
# Align to the training design matrix: a dummy level missing from the test
# rows becomes an all-zero column, and the column order matches train_Xd.
test_Xd = (test_dummies.drop('PotentialFraud_Yes', axis=1)  # becareful inplace= False
           .reindex(columns=train_Xd.columns, fill_value=0))

test_Xd_sc = sc.transform(test_Xd)

# NOTE: overwrites the string columns of `test` in place (same as before).
test[category_columns] = enc.transform(test[category_columns])

test_y_ord = test['PotentialFraud']
# Same column order as the frame the tree models were fitted on.
test_X_ord = test.drop(['PotentialFraud'], axis=1).reindex(columns=train_X_ord.columns)

test_X_ord = test_X_ord.fillna(0)
test_Xd = test_Xd.fillna(0)

# train_Xd[train_Xd.isna().any(axis=1)]

In [None]:
# Sanity check: train and test matrices must have the same number of columns.
print(f'train_X_ord shape: {train_X_ord.shape}')
print(f'test_X_ord shape: {test_X_ord.shape}')

print(f'train_Xd_sc shape: {train_Xd_sc.shape}')
print(f'test_Xd_sc shape: {test_Xd_sc.shape}')

In [None]:
# Random forest on the ordinal-encoded features.
RF_classifier = RandomForestClassifier(random_state=rand_state)
RF_classifier.fit(train_X_ord, train_y_ord)
# Hard labels and class probabilities for the test rows.
y_hat_RF = RF_classifier.predict(test_X_ord)
y_hat_RF_probs = RF_classifier.predict_proba(test_X_ord)
print(f'accuracy = {accuracy_score(test_y_ord, y_hat_RF)}')
print(f'f1 = {f1_score(test_y_ord, y_hat_RF)}')

In [None]:
# AdaBoost on the ordinal-encoded features.
AdB_classifier = AdaBoostClassifier(random_state=rand_state)
AdB_classifier.fit(train_X_ord, train_y_ord)
# Hard labels and class probabilities for the test rows.
y_hat_AdB = AdB_classifier.predict(test_X_ord)
y_hat_AdB_probs = AdB_classifier.predict_proba(test_X_ord)
print(f'accuracy = {accuracy_score(test_y_ord, y_hat_AdB)}')
print(f'f1 = {f1_score(test_y_ord, y_hat_AdB)}')

In [None]:
# Gradient boosting (depth-12 trees, learning rate 0.1) on the ordinal-encoded features.
GBM_classifier = GradientBoostingClassifier(random_state=rand_state, learning_rate=0.1, max_depth=12)
GBM_classifier.fit(train_X_ord, train_y_ord)
# Hard labels and class probabilities for the test rows.
y_hat_GBM = GBM_classifier.predict(test_X_ord)
y_hat_GBM_probs = GBM_classifier.predict_proba(test_X_ord)
print(f'accuracy = {accuracy_score(test_y_ord, y_hat_GBM)}')
print(f'f1 = {f1_score(test_y_ord, y_hat_GBM)}')

In [None]:
# XGBoost (depth-12 trees, eta 0.1) on the ordinal-encoded features.
XGB_classifier = XGBClassifier(random_state=rand_state, eta=0.1, max_depth=12)
XGB_classifier.fit(train_X_ord, train_y_ord)

# Hard labels and class probabilities for the test rows.
y_hat_XGB = XGB_classifier.predict(test_X_ord)
y_hat_XGB_probs = XGB_classifier.predict_proba(test_X_ord)
print(f'accuracy = {accuracy_score(test_y_ord, y_hat_XGB)}')
print(f'f1 = {f1_score(test_y_ord, y_hat_XGB)}')

In [None]:
# CatBoost (depth 10) on the ordinal-encoded features.
CBC_classifier = ctb.CatBoostClassifier(depth=10, random_state=rand_state)
CBC_classifier.fit(train_X_ord, train_y_ord)
# Hard labels and class probabilities for the test rows.
y_hat_CBC = CBC_classifier.predict(test_X_ord)
y_hat_CBC_probs = CBC_classifier.predict_proba(test_X_ord)
print(f'accuracy = {accuracy_score(test_y_ord, y_hat_CBC)}')
print(f'f1 = {f1_score(test_y_ord, y_hat_CBC)}')

In [None]:
# LightGBM on the ordinal-encoded features.  Seeded like the other models in
# this notebook so that a Restart & Run All reproduces the same scores.
LGB_classifier = lgb.LGBMClassifier(random_state=rand_state)
LGB_classifier.fit(train_X_ord, train_y_ord)
# Hard labels and class probabilities for the test rows.
y_hat_LGB = LGB_classifier.predict(test_X_ord)
y_hat_LGB_probs = LGB_classifier.predict_proba(test_X_ord)
print('accuracy = {}'.format(accuracy_score(test_y_ord, y_hat_LGB)))
print('f1 = {}'.format(f1_score(test_y_ord, y_hat_LGB)))

In [None]:
# Logistic regression (library defaults) on the scaled one-hot features.
Logistic_classifier = LogisticRegression(random_state=rand_state)
Logistic_classifier.fit(train_Xd_sc, train_yd)
# Hard labels and class probabilities for the test rows.
yd_hat_Logistic = Logistic_classifier.predict(test_Xd_sc)
yd_hat_Logistic_probs = Logistic_classifier.predict_proba(test_Xd_sc)
print(f'accuracy = {accuracy_score(test_yd, yd_hat_Logistic)}')
print(f'f1 = {f1_score(test_yd, yd_hat_Logistic)}')

In [None]:
# L1-penalised ("lasso") logistic regression on the scaled one-hot features.
Lasso_classifier = LogisticRegression(penalty="l1", solver="liblinear", random_state=rand_state)
Lasso_classifier.fit(train_Xd_sc, train_yd)
# Hard labels and class probabilities for the test rows.
yd_hat_Lasso = Lasso_classifier.predict(test_Xd_sc)
yd_hat_Lasso_probs = Lasso_classifier.predict_proba(test_Xd_sc)
print(f'accuracy = {accuracy_score(test_yd, yd_hat_Lasso)}')
print(f'f1 = {f1_score(test_yd, yd_hat_Lasso)}')

In [None]:
# L2-penalised ("ridge") logistic regression on the scaled one-hot features.
Ridge_classifier = LogisticRegression(penalty='l2', solver='liblinear', random_state=rand_state)
Ridge_classifier.fit(train_Xd_sc, train_yd)
# Hard labels and class probabilities for the test rows.
yd_hat_Ridge = Ridge_classifier.predict(test_Xd_sc)
yd_hat_Ridge_probs = Ridge_classifier.predict_proba(test_Xd_sc)
print(f'accuracy = {accuracy_score(test_yd, yd_hat_Ridge)}')
print(f'f1 = {f1_score(test_yd, yd_hat_Ridge)}')

In [None]:
# Elastic-net logistic regression (equal L1/L2 mix) on the scaled one-hot features.
ElasticNet_classifier = LogisticRegression(penalty='elasticnet', solver='saga',l1_ratio=0.5, random_state=rand_state)
ElasticNet_classifier.fit(train_Xd_sc, train_yd)
# Hard labels and class probabilities for the test rows.
yd_hat_ElasticNet = ElasticNet_classifier.predict(test_Xd_sc)
yd_hat_ElasticNet_probs = ElasticNet_classifier.predict_proba(test_Xd_sc)
print(f'accuracy = {accuracy_score(test_yd, yd_hat_ElasticNet)}')
print(f'f1 = {f1_score(test_yd, yd_hat_ElasticNet)}')

In [None]:
# 5-nearest-neighbours on the scaled one-hot features.
KNN_classifier = KNeighborsClassifier(n_neighbors=5)
KNN_classifier.fit(train_Xd_sc, train_yd)
# Hard labels and class probabilities for the test rows.
yd_hat_KNN = KNN_classifier.predict(test_Xd_sc)
yd_hat_KNN_probs = KNN_classifier.predict_proba(test_Xd_sc)
print(f'accuracy = {accuracy_score(test_yd, yd_hat_KNN)}')
print(f'f1 = {f1_score(test_yd, yd_hat_KNN)}')

In [None]:
# Classification neural network

# Seed numpy and TensorFlow so the weight initialisation is repeatable.
seed(rand_state)
tensorflow.random.set_seed(rand_state)
# 100-50-25 ReLU layers followed by a single sigmoid output unit.
Neural_classifier = Sequential()
Neural_classifier.add(Dense(100, input_dim=train_Xd_sc.shape[1], activation='relu',
                kernel_initializer='random_normal'))
Neural_classifier.add(Dense(50,activation='relu',kernel_initializer='random_normal'))
Neural_classifier.add(Dense(25,activation='relu',kernel_initializer='random_normal'))
Neural_classifier.add(Dense(1,activation='sigmoid',kernel_initializer='random_normal'))
Neural_classifier.compile(loss='binary_crossentropy',
              optimizer=tensorflow.keras.optimizers.Adam(),
              metrics =['accuracy'])
# Stop after 5 epochs without a val_loss improvement and restore the best weights.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-4,
    patience=5, verbose=1, mode='auto', restore_best_weights=True)

# NOTE(review): early stopping is monitored on the test set, so the reported
# test scores are optimistic - consider a validation split of the training data.
Neural_classifier.fit(train_Xd_sc,train_yd,validation_data=(test_Xd_sc,test_yd),
          callbacks=[monitor],verbose=2,epochs=30)

# One forward pass; the hard labels are the rounded probabilities
# (the test set used to be predicted twice).
yd_hat_Neural_probs = Neural_classifier.predict(test_Xd_sc)
yd_hat_Neural = np.round(yd_hat_Neural_probs)
print('accuracy = {}'.format(accuracy_score(test_yd, yd_hat_Neural)))
print('f1 = {}'.format(f1_score(test_yd, yd_hat_Neural)))

In [None]:
def blended_predict(train_or_test):
    """Weighted sum of the hard predictions of four fitted models.

    ``'train'`` selects the training matrices; any other value selects the
    test matrices.  Weights: 0.5 AdaBoost, 0.3 logistic regression,
    0.1 elastic net, 0.1 CatBoost.  The result is not rounded.
    """
    # An earlier blend (0.5 CatBoost, 0.3 XGBoost, 0.1 random forest, 0.1 GBM,
    # all on the ordinal features) was tried and replaced by the one below.
    if train_or_test == 'train':
        X_ord, X_scaled = train_X_ord, train_Xd_sc
    else:
        X_ord, X_scaled = test_X_ord, test_Xd_sc

    # Tree models take the ordinal matrix, linear models the scaled one-hot matrix.
    return ((0.5 * AdB_classifier.predict(X_ord)) +
            (0.3 * Logistic_classifier.predict(X_scaled)) +
            (0.1 * ElasticNet_classifier.predict(X_scaled)) +
            (0.1 * CBC_classifier.predict(X_ord)))

def score_list(df, regr,  y_test=None, y_hat_test=None):
    """Add a column ``regr`` holding [accuracy, f1, precision, recall] to ``df``.

    For ``regr == 'Blended Model'`` the predictions are the rounded output of
    ``blended_predict('test')`` and ``y_hat_test`` is ignored; otherwise
    ``y_hat_test`` is scored against ``y_test``.  Returns ``df``.
    """
    if regr == 'Blended Model':
        # Only the test blend is scored; the blend over the whole training
        # set that used to be computed here was never used.
        y_hat_test = np.round(blended_predict('test'))

    accuracy = accuracy_score(y_test, y_hat_test)
    f1 = f1_score(y_test, y_hat_test)
    precision = precision_score(y_test, y_hat_test, average='binary')
    recall = recall_score(y_test, y_hat_test, average='binary')

    df[regr] = [accuracy, f1, precision, recall]
    return df

df = pd.DataFrame()

# (display name, true labels, predicted labels) in the order the columns are added.
_scored_models = [
    ('Logistic Regression', test_yd, yd_hat_Logistic),
    ('Lasso', test_yd, yd_hat_Lasso),
    ('Ridge', test_yd, yd_hat_Ridge),
    ('ElasticNet', test_yd, yd_hat_ElasticNet),
    ('KNN', test_yd, yd_hat_KNN),
    ('Random Forest', test_y_ord, y_hat_RF),
    ('GBM', test_y_ord, y_hat_GBM),
    ('ADA', test_y_ord, y_hat_AdB),
    ('XGBoost', test_y_ord, y_hat_XGB),
    ('Cat Boost', test_y_ord, y_hat_CBC),
    ('LightGBM', test_y_ord, y_hat_LGB),
    ('Neural Network', test_yd, yd_hat_Neural),
]
for _label, _truth, _pred in _scored_models:
    df = score_list(df, _label, _truth, _pred)
# The blend computes its own predictions inside score_list.
df = score_list(df, regr='Blended Model', y_test=test_y_ord)

# One row per model, one column per metric, best accuracy first.
df = df.transpose()
df.columns = ['Accuracy', 'F1', 'Precision', 'Recall']
df.sort_values(by='Accuracy', ascending=False)

In [None]:
# NOTE(review): X_train holds the raw, pre-engineering columns, so this list
# does not match the columns the models were fitted on - confirm it is needed.
features = list(X_train.columns)

# Importances of the tree-based models (one value per fitted feature).
RF_importance  = RF_classifier.feature_importances_
AdB_importance = AdB_classifier.feature_importances_
GBM_importance = GBM_classifier.feature_importances_
XGB_importance = XGB_classifier.feature_importances_
CBC_importance = CBC_classifier.feature_importances_
LGB_importance = LGB_classifier.feature_importances_
# Coefficient matrices of the linear models.
Logistic_importance = Logistic_classifier.coef_
Lasso_importance= Lasso_classifier.coef_
Ridge_importance= Ridge_classifier.coef_
ElasticNet_importance= ElasticNet_classifier.coef_

In [None]:
# ROC curves of all fitted models on the test set, drawn on one figure.
fig, ax = plt.subplots(figsize=(7.5, 7.5))

# False/true positive rates per model; `thresholds` is overwritten by every
# call and not used afterwards.  Models trained on the ordinal features are
# scored against test_y_ord, those trained on the scaled one-hot features
# against test_yd.
fpr1, tpr1, thresholds = roc_curve(test_y_ord, y_hat_RF_probs[:, 1], pos_label=1)
fpr2, tpr2, thresholds = roc_curve(test_y_ord, y_hat_AdB_probs[:, 1], pos_label=1)
fpr3, tpr3, thresholds = roc_curve(test_y_ord, y_hat_GBM_probs[:, 1], pos_label=1)
fpr4, tpr4, thresholds = roc_curve(test_y_ord, y_hat_XGB_probs[:, 1], pos_label=1)
fpr5, tpr5, thresholds = roc_curve(test_y_ord, y_hat_CBC_probs[:, 1], pos_label=1)
fpr6, tpr6, thresholds = roc_curve(test_yd, yd_hat_Logistic_probs[:, 1], pos_label=1)
fpr7, tpr7, thresholds = roc_curve(test_yd, yd_hat_Lasso_probs[:, 1], pos_label=1)
fpr8, tpr8, thresholds = roc_curve(test_yd, yd_hat_Ridge_probs[:, 1], pos_label=1)
fpr9, tpr9, thresholds = roc_curve(test_yd, yd_hat_ElasticNet_probs[:, 1], pos_label=1)
fpr10, tpr10, thresholds = roc_curve(test_yd, yd_hat_KNN_probs[:, 1], pos_label=1)
fpr11, tpr11, thresholds = roc_curve(test_yd, yd_hat_Neural_probs, pos_label=1)
fpr12, tpr12, thresholds = roc_curve(test_y_ord, y_hat_LGB_probs[:, 1], pos_label=1)

# Area under each curve.
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
roc_auc3 = auc(fpr3, tpr3)
roc_auc4 = auc(fpr4, tpr4)
roc_auc5 = auc(fpr5, tpr5)
roc_auc6 = auc(fpr6, tpr6)
roc_auc7 = auc(fpr7, tpr7)
roc_auc8 = auc(fpr8, tpr8)
roc_auc9 = auc(fpr9, tpr9)
roc_auc10 = auc(fpr10, tpr10)
roc_auc11 = auc(fpr11, tpr11)
roc_auc12 = auc(fpr12, tpr12)

# auc = round(metrics.roc_auc_score(y_test, y_pred), 4)

# The plotting order below is also the legend order.
plt.plot(fpr5, tpr5, label='CatBoost  (AUC = %0.2f)' % (roc_auc5))
plt.plot(fpr1, tpr1, label='Random Forest (AUC = %0.2f)' % (roc_auc1))
plt.plot(fpr12, tpr12, label='LightGBM  (AUC = %0.2f)' % (roc_auc12))
plt.plot(fpr4, tpr4, label='XGBoost  (AUC = %0.2f)' % (roc_auc4))
plt.plot(fpr2, tpr2, label='AdaBoost (AUC = %0.2f)' % (roc_auc2))
plt.plot(fpr3, tpr3, label='Gradient Boosting (AUC = %0.2f)' % (roc_auc3))
plt.plot(fpr11, tpr11, label='Neural NetWork  (AUC = %0.2f)' % (roc_auc11))
plt.plot(fpr8, tpr8, label='Ridge  (AUC = %0.2f)' % (roc_auc8))
plt.plot(fpr9, tpr9, label='ElasticNet  (AUC = %0.2f)' % (roc_auc9))
plt.plot(fpr6, tpr6, label='Logistic  (AUC = %0.2f)' % (roc_auc6))
plt.plot(fpr7, tpr7, label='Lasso  (AUC = %0.2f)' % (roc_auc7))
plt.plot(fpr10, tpr10, label='KNN  (AUC = %0.2f)' % (roc_auc10))
# Reference lines: chance diagonal and the ideal top-left corner path.
plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random Classifier')   
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='green', label='Perfect Classifier')
# Small margins so curves hugging the axes stay visible.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Number of coefficients in the single row of the logistic coefficient matrix.
Logistic_importance[0].shape

In [None]:
def _coefs_by_feature(coef):
    """Return row 0 of a linear-model ``coef_`` ordered like ``train_X_ord.columns``.

    The linear models were fitted on the one-hot matrix (``train_Xd``), where
    each encoded categorical moves to the end and is renamed
    '<column>_<level>'.  Its coefficient is mapped back to the source column
    name; features with no match come out as NaN.
    """
    by_name = pd.Series(coef[0, :], index=train_Xd.columns)
    dummy_to_source = {dummy: source
                       for source in category_columns
                       for dummy in train_Xd.columns
                       if dummy not in train_X_ord.columns
                       and str(dummy).startswith(source + '_')}
    by_name = by_name.rename(index=dummy_to_source)
    # A categorical with several dummy levels would repeat its name; keep the first.
    by_name = by_name[~by_name.index.duplicated()]
    return by_name.reindex(train_X_ord.columns).to_numpy()


# Tree importances already follow train_X_ord's column order; the linear
# coefficients are re-ordered to match instead of being pasted positionally
# (which attached them to the wrong feature names).
FIM = pd.DataFrame({'Features': train_X_ord.columns
                   , 'CBC_Feature_importance':CBC_importance
                   , 'GBM_Feature_importance':GBM_importance
                   , 'XGB_Feature_importance':XGB_importance
                   , 'RF_Feature_importance':RF_importance
                   , 'LGB_Feature_importance':LGB_importance
                   , 'AdB_Feature_importance':AdB_importance
                   , 'Logistic_Feature_importance':_coefs_by_feature(Logistic_importance)
                   , 'Ridge_Feature_importance':_coefs_by_feature(Ridge_importance)
                   , 'Lasso_Feature_importance':_coefs_by_feature(Lasso_importance)
                   , 'ElasticNet_Feature_importance':_coefs_by_feature(ElasticNet_importance)})
FIM = FIM.sort_values(by=['AdB_Feature_importance'], ascending=False )
# Show the models used in the blend, top 50 features by AdaBoost importance.
display(FIM[['Features',
             'AdB_Feature_importance',
             'Logistic_Feature_importance',
             'ElasticNet_Feature_importance',
             'CBC_Feature_importance']].head(50))

In [None]:
# Top 30 features by AdaBoost importance.
FIM = FIM.sort_values('AdB_Feature_importance', ascending=False)
plt.figure(figsize=(10, 8))
plt.title('ADA Boost Feature Importance')
sns.barplot(data=FIM.head(30), x='AdB_Feature_importance', y='Features')
plt.show()

In [None]:
# Top 30 features by CatBoost importance.
FIM = FIM.sort_values('CBC_Feature_importance', ascending=False)
plt.figure(figsize=(10, 8))
plt.title('Cat Boost Feature Importance')
sns.barplot(data=FIM.head(30), x='CBC_Feature_importance', y='Features')
plt.show()

In [None]:
# Top 30 features by XGBoost importance.
FIM = FIM.sort_values('XGB_Feature_importance', ascending=False)
plt.figure(figsize=(10, 8))
plt.title('XGBoost Feature Importance')
sns.barplot(data=FIM.head(30), x='XGB_Feature_importance', y='Features')
plt.show()

In [None]:
# Top 30 features by LightGBM importance.
plt.figure(figsize=(10,8))
# Title typo fixed ("Graident" -> "Gradient").
plt.title('Light Gradient Boost Feature Importance')
FIM = FIM.sort_values(by=['LGB_Feature_importance'], ascending=False )
sns.barplot(y='Features', x='LGB_Feature_importance', data=FIM.head(30))
plt.show()

In [None]:
def confusionmatrix(y_actual, y_pred, modellabel=''):
    """Draw an annotated heatmap of actual-vs-predicted label counts.

    ``modellabel`` is inserted into the plot title.  Nothing is returned.
    """
    pairs = pd.DataFrame()
    pairs['Actual'] = y_actual
    pairs['Predicted'] = y_pred

    # Rows are the actual labels, columns the predicted ones.
    counts = pd.crosstab(pairs['Actual'], pairs['Predicted'],
                         rownames=['Actual'], colnames=['Predicted'])

    sns.heatmap(counts, annot=True, fmt='g')

    plt.title(' '.join(['Confusion Matrix for', modellabel, 'Classifier']))
    plt.show()
    
# Confusion matrices on the test set for a selection of the fitted models.
confusionmatrix(test_y_ord, y_hat_AdB, modellabel='ADA boost')
confusionmatrix(test_y_ord, y_hat_CBC, modellabel='Cat boost')
confusionmatrix(test_y_ord, y_hat_XGB, modellabel='XGBoost')
confusionmatrix(test_yd, yd_hat_Logistic, modellabel='Logistic Regression')
# The "Light Gradient Boost" matrix used to be drawn from the sklearn GBM
# predictions (y_hat_GBM); it now uses the LightGBM predictions it is labelled with.
confusionmatrix(test_y_ord, y_hat_LGB, modellabel='Light Gradient Boost')
confusionmatrix(test_yd, yd_hat_ElasticNet, modellabel='ElasticNet Regression')
