## Machine Learning Models


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, auc, f1_score, accuracy_score, precision_score, recall_score, roc_curve 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as Sampling_Pipeline
import warnings
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTETomek
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
from tensorflow_addons import losses
import tensorflow as tf
from tensorflow import keras

df = pd.read_csv("transformed_df.csv")
df.rename(columns = {'y':'target'}, inplace = True)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,target
0,58.0,4,1,2,0,2143.0,1,0,2,5.0,8,261.0,1.0,-1,0,3,0
1,44.0,9,2,1,0,29.0,1,0,2,5.0,8,151.0,1.0,-1,0,3,0
2,33.0,2,1,1,0,2.0,1,1,2,5.0,8,76.0,1.0,-1,0,3,0
3,47.0,1,1,3,0,1506.0,1,0,2,5.0,8,92.0,1.0,-1,0,3,0
4,33.0,11,2,3,0,1.0,0,0,2,5.0,8,198.0,1.0,-1,0,3,0


In [None]:
df.info()

In [2]:
indep = df.drop(['target'], axis=1).columns

indep

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

In [None]:
# Demonstrating over & undersampling
    
oversampler = SMOTE(sampling_strategy=0.5)
undersampler = RandomUnderSampler(sampling_strategy=0.8)

X = df[indep]
y = df.target

print("------------------ Original dataset -----------------")

counter = Counter(y)
for k,v, in counter.items():
    dist = v / len(y) * 100
    print(f"Class {k} has {v} samples with {dist:.2f}%")

print("---------------- With Oversampling --------")


X_ovs, y_ovs = oversampler.fit_resample(X,y)

counter1 = Counter(y_ovs)
for k,v, in counter1.items():
    dist = v / len(y_ovs) * 100
    print(f"Class {k} has {v} samples with {dist:.2f}%")


print("---------------- With Undersampling --------")


X_uns, y_uns = undersampler.fit_resample(X,y)

counter1 = Counter(y_uns)
for k,v, in counter1.items():
    dist = v / len(y_uns) * 100
    print(f"Class {k} has {v} samples with {dist:.2f}%")

print("--------- Combining Oversampling with Undersampling --------")


steps = [('o', oversampler), ('u', undersampler)]
pipeline = Sampling_Pipeline(steps=steps)
X_ovun, y_ovun = pipeline.fit_resample(X, y)
counter3 = Counter(y_ovun)

for k,v, in counter3.items():
    dist = v / len(y_ovun) * 100
    print(f"Class {k} has {v} samples with {dist:.2f}%")


## Logistic Regression

In [None]:
def LogRegOptimizer(X, y, standardize=False, standardize_params=None):
    print('**We will optimize the hyper-parameters of a Logistic Regression model using Randomized Search**\n')

    #function to help us display metrics in a percentage format
    def percentage(x):  
        x = round(x*100,2)
        return (str(x) + "%")

    if standardize:
        print("**Standardizing the data**\n")
        for var in standardize_params:
            X[var] = (X[var] - X[var].mean()) / X[var].std()
        
        print("**Data has been standardized**\n")


    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state=1)

    over = SMOTE()
    X_train, y_train = over.fit_resample(X_train,y_train)

    grid_params = {'C' : [0.001, 0.01, 1, 5, 10, 25, 50, 100], 
                    'penalty' : ["none", "l1", "l2", "elasticnet"],
                    'solver': ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
    }

    logreg=LogisticRegression()
    logreg_cv=RandomizedSearchCV(logreg, grid_params, cv = 10 , verbose = True, n_jobs= -1, scoring = "roc_auc")  #randomized search as opposed to gridsearch, to improve run time
    logreg_cv.fit(X_train,y_train)
    
    y_pred = logreg_cv.predict(X_test)

    print("----------------------------------- Confusion Matrix-----------------------------------")
    print(confusion_matrix(y_test, y_pred))

    print("--------------------------------- Classification Report---------------------------------")
    print(classification_report(y_test, y_pred))


    print("-----------------------------------------Metrics----------------------------------------\n")
    print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
    print("ROC AUC SCORE:" + str(roc_auc_score(y_test, y_pred)))
    print("Gini (Somer's D) coefficient:" + str((roc_auc_score(y_test, y_pred)*2-1)))
    print('Accuracy Score : ' + percentage(accuracy_score(y_test,y_pred)))
    print('Precision Score : ' + percentage(precision_score(y_test,y_pred)))
    print('Recall Score : ' + percentage(recall_score(y_test,y_pred)))
    print('F1 Score : ' + percentage(f1_score(y_test,y_pred)))

    # get the values required to plot a ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    # plot the ROC curve
    plt.plot(fpr, tpr)
    # plot a secondary diagonal line, to plot randomness of model
    plt.plot(fpr, fpr, linestyle = '--', color = 'k')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')

LogRegOptimizer(X = df[indep], y = df.target)

In [None]:
#Let's also try a model where we standardize the numerical variables (not the discrete ones)

standardize_params = ['age', 'balance','duration', 'campaign', 'pdays', 'previous']

LogRegOptimizer(X = df[indep], y = df.target, standardize = True, standardize_params = standardize_params)

## Random Forest


In [None]:

def RandomForestOptimizer(X, y):
    print('**We will optimize the hyper-parameters of a Random Forest model using Grid Search in Python**\n')

    #function to help us display metrics in a percentage format
    def percentage(x):  
        x = round(x*100,2)
        return (str(x) + "%")


    #test train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify = y)

    #class target frequencies after split
    (unique, counts) = np.unique(y_train, return_counts=True)
    frequencies = np.asarray((unique, counts)).T
    print("Class target frequencies\n" + str(frequencies))

    #combining oversampling and undersampling in a pipeline

    over = SMOTE(sampling_strategy = 0.5)
    under = RandomUnderSampler(sampling_strategy = 0.8)
    
    steps = [('o', over), ('u', under)]
    pipeline = Sampling_Pipeline(steps=steps)
    X_train, y_train = pipeline.fit_resample(X_train, y_train)

    # smt = SMOTETomek(random_state=42, n_jobs = -1)
    # X_train, y_train = smt.fit_resample(X_train, y_train)

    # std_slc = StandardScaler()
    rand_for = RandomForestClassifier()
    pipe = Pipeline(steps=[#('std_slc', std_slc),
                           ('rand_for', rand_for)])

    # Creating Parameter Space
    n_estimators = [100] # not a good hyperparameter to tune -> https://stats.stackexchange.com/questions/348245/do-we-have-to-tune-the-number-of-trees-in-a-random-forest/348246#348246
    max_depth = [5, 8, 15, 25]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 5] 
    criterion = ['gini', 'entropy']
    parameters = dict(rand_for__n_estimators = n_estimators,
                      rand_for__max_depth = max_depth,  
                      rand_for__min_samples_split = min_samples_split, 
                      rand_for__min_samples_leaf = min_samples_leaf,
                      rand_for__criterion = criterion)

    # Creating a grid search object
    randF_GS = RandomizedSearchCV(pipe, parameters, n_jobs=-1, cv=5, verbose = 1, scoring = "roc_auc")

    # Fitting the grid search
    randF_GS = randF_GS.fit(X_train, y_train)

    #Prediction and scores
    y_pred = randF_GS.predict(X_test)

    #Best parameters
    print("Best parameters: " + str(randF_GS.best_params_))

    #Outputs

    print("-------- Confusion Matrix------")
    print(confusion_matrix(y_test, y_pred))
    
    print("-------- Classification Report------")
    print(classification_report(y_test, y_pred))

    print("--------------- Metrics ---------------")
    print("ROC AUC SCORE:" + str(round(roc_auc_score(y_test, y_pred))),3)
    print("Gini (Somer's D) coefficient:" + str((round(roc_auc_score(y_test, y_pred)*2-1))),3)
    print('Accuracy Score : ' + percentage(accuracy_score(y_test,y_pred)))
    print('Precision Score : ' + percentage(precision_score(y_test,y_pred)))
    print('Recall Score : ' + percentage(recall_score(y_test,y_pred)))
    print('F1 Score : ' + percentage(f1_score(y_test,y_pred)))

    #Feature importances
    importances = list(randF_GS.best_estimator_._final_estimator.feature_importances_)
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X, importances)]  # List of tuples with variable and importance
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True) # Sort the feature importances by most important first
    print("------------Variable Importances ---------")
    [print("Variable: {:20} Importance: {}".format(*pair)) for pair in feature_importances] # Print out the feature and importances 

    # get the values required to plot a ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    # plot the ROC curve
    plt.plot(fpr, tpr)
    # plot a secondary diagonal line, to plot randomness of model
    plt.plot(fpr, fpr, linestyle = '--', color = 'k')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    
RandomForestOptimizer(X = df[indep], y = df.target)


## Neural Networks

In [13]:
X = df.drop('target', 1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
over = SMOTETomek()
X_train, y_train = over.fit_resample(X_train,y_train)

def ANN(X_train, y_train, X_test, y_test, loss, weights):
    model = keras.Sequential([
        keras.layers.Dense(50, input_dim=X_train.shape[1], kernel_initializer='normal',activation='relu'), #input layer 
        keras.layers.Dropout(0.25),
        keras.layers.Dense(50, kernel_initializer='normal', activation='relu'), #hidden layer
        keras.layers.Dropout(0.25),
        keras.layers.Dense(1, activation='sigmoid') # output layer with 1 node; sigmoid function will convert all outputs between 0 and 1
    
    ])

    model.compile(optimizer='adam', loss=loss, metrics=['AUC'])
    
    if weights == -1:
        model.fit(X_train, y_train, epochs=5)
    else:
        model.fit(X_train, y_train, epochs=5, class_weight = weights)
    

    print(model.evaluate(X_test, y_test))
    
    y_pred = model.predict(X_test)
    #rounding the values since the last Dense layers turns the predicted values into probabilities, threshold
 
    y_pred = np.around(y_pred)
    

    #Results

    def percentage(x):  
        x = round(x*100,2)
        return (str(x) + "%")
    
    print("-------- Confusion Matrix------")
    print(confusion_matrix(y_test, y_pred))
    
    print("-------- Classification Report------")
    print(classification_report(y_test, y_pred))

    print("--------------- Metrics ---------------")
    print("ROC AUC SCORE:" + str(round(roc_auc_score(y_test, y_pred),3)))
    print("Gini (Somer's D) coefficient:" + str(round(roc_auc_score(y_test, y_pred)*2-1,3)))
    print('Accuracy Score : ' + percentage(accuracy_score(y_test,y_pred)))
    print('Precision Score : ' + percentage(precision_score(y_test,y_pred)))
    print('Recall Score : ' + percentage(recall_score(y_test,y_pred)))
    print('F1 Score : ' + percentage(f1_score(y_test,y_pred)))


    return y_pred

y_preds = ANN(X_train, y_train, X_test, y_test, 'binary_crossentropy', -1)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[0.3667181730270386, 0.8569298982620239]
-------- Confusion Matrix------
[[8191 1775]
 [ 378  959]]
-------- Classification Report------
              precision    recall  f1-score   support

           0       0.96      0.82      0.88      9966
           1       0.35      0.72      0.47      1337

    accuracy                           0.81     11303
   macro avg       0.65      0.77      0.68     11303
weighted avg       0.88      0.81      0.84     11303

--------------- Metrics ---------------
ROC AUC SCORE:0.77
Gini (Somer's D) coefficient:0.539
Accuracy Score : 80.95%
Precision Score : 35.08%
Recall Score : 71.73%
F1 Score : 47.11%
