In [336]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

In [337]:
# Load the dataset used by every later cell. The file's schema is not visible
# here; later cells slice it with iloc, treating all columns but the last as
# features and the last column as the class label -- TODO confirm against the CSV.
data = pd.read_csv('bc_data.csv')


# Tool Functions

In [338]:
import plotly.graph_objs as go
import plotly.express as px


def plotConfusionMatrix(prediction, target, class_names=['0', '1'], title='Confusion matrix'):
    """Show the confusion matrix of `prediction` vs. `target` as a Plotly heatmap.

    Parameters
    ----------
    prediction : array-like
        Predicted class labels.
    target : array-like
        True class labels.
    class_names : sequence of str, optional
        Tick labels for both axes, in the order of the rows/columns returned
        by ``confusion_matrix`` (sorted label order). Defaults to ['0', '1'].
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    # Rows of `cm` are true classes, columns are predicted classes.
    cm = confusion_matrix(target, prediction)
    labels = {'x': 'Prediction', 'y': 'Target'}
    # Honour the caller's labels (they used to be overwritten with ['0', '1']);
    # copy so the shared default list can never be mutated downstream.
    class_names = list(class_names)
    fig = px.imshow(cm,
                    text_auto=True,
                    aspect="auto",
                    labels=labels,
                    title=title,
                    width=500,
                    height=500,
                    x=class_names,
                    y=class_names)
    # Centre the title horizontally.
    fig.update_layout(title_x=0.5)
    fig.show()


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def Metrics(prediction, target):
    """Score predicted labels against the true labels.

    Each sklearn scorer is called as ``scorer(target, prediction)``.

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall' and 'F1', in that order.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    return {label: scorer(target, prediction) for label, scorer in scorers}


# Part 1


In [339]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
train, test = train_test_split(data, test_size=0.25, random_state=0)
# Features are every column except the last; the last column is the target.
Xtrain, Xtest = train.iloc[:, :-1], test.iloc[:, :-1]
ytrain, ytest = train.iloc[:, -1], test.iloc[:, -1]

# Empty score tables indexed by model name; rows are added as models are evaluated.
score_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1']
trainScore_df = pd.DataFrame(columns=score_columns).set_index('Model')
testScore_df = pd.DataFrame(columns=score_columns).set_index('Model')

In [340]:
# Candidate classifiers, keyed by the name used in plot titles and score tables.
Models = {
    'SVM': SVC(),
    'LogisticRegression': LogisticRegression(max_iter=10000),
    # random_state pins the MLP's weight initialisation and batch shuffling so
    # the reported scores are reproducible across kernel restarts.
    'NeuralNetwork': MLPClassifier(hidden_layer_sizes=200, max_iter=500, alpha=0.005, random_state=0)
}

for name, model in Models.items():
    model.fit(Xtrain, ytrain)

    # Evaluate on the data the model was fitted on.
    ypred = model.predict(Xtrain)
    plotConfusionMatrix(ypred, ytrain, title=name + ' train set')
    trainScore_df.loc[name] = Metrics(ypred, ytrain)

    # Evaluate on the held-out test split.
    ypred = model.predict(Xtest)
    plotConfusionMatrix(ypred, ytest, title=name + ' test set')
    testScore_df.loc[name] = Metrics(ypred, ytest)

In [341]:
# Print the train and test score tables, each preceded by a separator line.
for heading, scores in (('Train metric scores:', trainScore_df),
                        ('Test metric scores:', testScore_df)):
    print('*' * 80)
    print(heading)
    print(scores)

********************************************************************************
Train metric scores:
                    Accuracy  Precision    Recall        F1
Model                                                      
SVM                 0.912322   0.890411  0.981132  0.933573
LogisticRegression  0.969194   0.970149  0.981132  0.975610
NeuralNetwork       0.919431   0.949416  0.920755  0.934866
********************************************************************************
Test metric scores:
                    Accuracy  Precision    Recall        F1
Model                                                      
SVM                 0.929078   0.900000  1.000000  0.947368
LogisticRegression  0.950355   0.946237  0.977778  0.961749
NeuralNetwork       0.936170   0.955056  0.944444  0.949721


In [342]:
# One line plot per metric, comparing train vs. test scores across the models.
import plotly.graph_objects as go
for column in trainScore_df.columns:
    trace1 = go.Scatter(x=trainScore_df.index, y=trainScore_df[column], name='train')
    trace2 = go.Scatter(x=testScore_df.index, y=testScore_df[column], name='test')
    fig = go.Figure(data=[trace1, trace2])
    # Title and x-axis label let each figure stand alone, as in the Part 2 plots.
    fig.update_layout(width=800, height=400, yaxis_title=column, xaxis_title='Model',
                      title=column + ' by model')
    fig.update_layout(title_x=0.5)
    # Show the plot
    fig.show()


**ANS**: There is slight overfitting in the Logistic Regression model: its train scores are a little higher than its test scores. The other two models' metrics on the train and test sets are almost the same.

# Part 2

In [343]:
# Fractions of the data used for training, from 0.5 up to (but excluding) 1 in steps of 0.1.
TrainSize = np.arange(0.5, 1, 0.1)

# Empty score tables indexed by train size.
score_columns = ['Train size', 'Accuracy', 'Precision', 'Recall', 'F1']
trainScore_df = pd.DataFrame(columns=score_columns).set_index('Train size')
testScore_df = pd.DataFrame(columns=score_columns).set_index('Train size')

for trainsize in TrainSize:
    # Re-split with the same seed for every train fraction.
    train, test = train_test_split(data, train_size=trainsize, random_state=0)
    Xtrain, Xtest = train.iloc[:, :-1], test.iloc[:, :-1]
    ytrain, ytest = train.iloc[:, -1], test.iloc[:, -1]

    model = SVC()
    model.fit(Xtrain, ytrain)

    # Score the fitted model on the train split first, then on the test split.
    for features, labels, table in ((Xtrain, ytrain, trainScore_df),
                                    (Xtest, ytest, testScore_df)):
        ypred = model.predict(features)
        table.loc[trainsize] = Metrics(ypred, labels)

for heading, scores in (('Train metric scores:', trainScore_df),
                        ('Test metric scores:', testScore_df)):
    print('*' * 80)
    print(heading)
    print(scores)

# One figure per metric: train and test score as a function of train size.
for column in trainScore_df.columns:
    fig = go.Figure(data=[
        go.Scatter(x=trainScore_df.index, y=trainScore_df[column], name='train'),
        go.Scatter(x=testScore_df.index, y=testScore_df[column], name='test'),
    ])
    fig.update_layout(width=800, height=400, yaxis_title=column, xaxis_title='Train size', title='SVM')
    fig.update_layout(title_x=0.5)
    fig.show()


********************************************************************************
Train metric scores:
            Accuracy  Precision    Recall        F1
Train size                                         
0.5         0.896797   0.873786  0.983607  0.925450
0.6         0.905045   0.879167  0.985981  0.929515
0.7         0.903553   0.874552  0.987854  0.927757
0.8         0.917778   0.896104  0.982206  0.937182
0.9         0.915020   0.890805  0.984127  0.935143
********************************************************************************
Test metric scores:
            Accuracy  Precision    Recall        F1
Train size                                         
0.5         0.897163   0.855721  1.000000  0.922252
0.6         0.902655   0.869565  0.992908  0.927152
0.7         0.917160   0.885246  1.000000  0.939130
0.8         0.920354   0.891566  1.000000  0.942675
0.9         0.929825   0.909091  1.000000  0.952381


In [344]:
# Fractions of the data used for training, from 0.5 up to (but excluding) 1 in steps of 0.1.
TrainSize = np.arange(0.5, 1, 0.1)

# Empty score tables indexed by train size.
score_columns = ['Train size', 'Accuracy', 'Precision', 'Recall', 'F1']
trainScore_df = pd.DataFrame(columns=score_columns).set_index('Train size')
testScore_df = pd.DataFrame(columns=score_columns).set_index('Train size')

for trainsize in TrainSize:
    # Re-split with the same seed for every train fraction.
    train, test = train_test_split(data, train_size=trainsize, random_state=0)
    Xtrain, Xtest = train.iloc[:, :-1], test.iloc[:, :-1]
    ytrain, ytest = train.iloc[:, -1], test.iloc[:, -1]

    model = LogisticRegression(max_iter=10000)
    model.fit(Xtrain, ytrain)

    # Score the fitted model on the train split first, then on the test split.
    for features, labels, table in ((Xtrain, ytrain, trainScore_df),
                                    (Xtest, ytest, testScore_df)):
        ypred = model.predict(features)
        table.loc[trainsize] = Metrics(ypred, labels)

for heading, scores in (('Train metric scores:', trainScore_df),
                        ('Test metric scores:', testScore_df)):
    print('*' * 80)
    print(heading)
    print(scores)


# One figure per metric: train and test score as a function of train size.
for column in trainScore_df.columns:
    fig = go.Figure(data=[
        go.Scatter(x=trainScore_df.index, y=trainScore_df[column], name='train'),
        go.Scatter(x=testScore_df.index, y=testScore_df[column], name='test'),
    ])
    fig.update_layout(width=800, height=400, yaxis_title=column, xaxis_title='Train size', title='LogisticRegression')
    fig.update_layout(title_x=0.5)
    fig.show()

********************************************************************************
Train metric scores:
            Accuracy  Precision    Recall        F1
Train size                                         
0.5         0.975089   0.978261  0.983607  0.980926
0.6         0.964392   0.967593  0.976636  0.972093
0.7         0.967005   0.968000  0.979757  0.973843
0.8         0.968889   0.968421  0.982206  0.975265
0.9         0.966403   0.965625  0.980952  0.973228
********************************************************************************
Test metric scores:
            Accuracy  Precision    Recall        F1
Train size                                         
0.5         0.943262   0.938202  0.970930  0.954286
0.6         0.946903   0.944828  0.971631  0.958042
0.7         0.958580   0.954955  0.981481  0.968037
0.8         0.946903   0.947368  0.972973  0.960000
0.9         0.964912   0.975000  0.975000  0.975000


In [345]:
from sklearn.metrics import roc_auc_score

# 70/30 train/test split, then hold out 20% of the training part for validation.
train, test = train_test_split(data, train_size=0.7, random_state=0)
train, valid = train_test_split(train, train_size=0.8, random_state=0)
Xtrain, ytrain = train.iloc[:, :-1], train.iloc[:, -1]
Xvalid, yvalid = valid.iloc[:, :-1], valid.iloc[:, -1]
Xtest, ytest = test.iloc[:, :-1], test.iloc[:, -1]

# Ten learning rates, log-spaced between 1e-6 and 1.
LearningRates = np.logspace(-6, 0, 10)
AUC = []
for lr in LearningRates:
    # The swept value is the optimiser's initial learning rate. It was previously
    # passed as `alpha`, which is the L2 regularisation strength, so the plot did
    # not show what its title claimed. alpha is now fixed at 0.005 to match the
    # other MLPs in this notebook, and random_state makes the curve reproducible.
    model = MLPClassifier(hidden_layer_sizes=200, max_iter=1000, alpha=0.005,
                          learning_rate_init=lr, solver='adam',
                          random_state=0).fit(Xtrain, ytrain)

    # Probability of the second class in model.classes_ (presumably label 1 -- verify).
    ypred = model.predict_proba(Xvalid)[:, 1]
    AUC.append(roc_auc_score(yvalid, ypred))

trace = go.Scatter(x=LearningRates, y=AUC)
fig = go.Figure(data=trace)
fig.update_layout(width=800, height=400, yaxis_title='AUC', xaxis_title='LearningRate', title='AUC vs Learning rate')
# Log x-axis because the learning rates span six orders of magnitude.
fig.update_layout(title_x=0.5, xaxis_type='log')
# Show the plot
fig.show()



# Part 3

In [346]:
# Same 75/25 split as Part 1 (same seed), so results are comparable.
train, test = train_test_split(data, test_size=0.25, random_state=0)
Xtrain, ytrain = train.iloc[:, :-1], train.iloc[:, -1]
Xtest, ytest = test.iloc[:, :-1], test.iloc[:, -1]

model_lr = LogisticRegression(max_iter=10000).fit(Xtrain, ytrain)
# random_state pins the MLP's weight initialisation and batch shuffling so the
# threshold sweep and ROC/PR curves below are reproducible.
model_nn = MLPClassifier(hidden_layer_sizes=200, max_iter=1000, alpha=0.005,
                         random_state=0).fit(Xtrain, ytrain)

# Decision thresholds from 0.05 up to (but excluding) 1 in steps of 0.05.
Thresholds = np.arange(0.05, 1, 0.05)



In [347]:
def _threshold_scores(probabilities, target, thresholds):
    """Score thresholded probabilities against `target` at each cut-off.

    Parameters
    ----------
    probabilities : numpy.ndarray
        Predicted probability of the positive class, one value per sample.
    target : array-like
        True labels; negatives are identified as ``target == 0``.
    thresholds : sequence of float
        Cut-offs; a sample is predicted positive when its probability is
        greater than or equal to the cut-off.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Threshold', with columns 'Accuracy', 'Precision',
        'Recall', 'F1', 'TPR' and 'FPR'.
    """
    # Positional mask of the true negatives, computed once for all thresholds.
    negatives = np.asarray(target) == 0
    rows = []
    for threshold in thresholds:
        labels = (probabilities >= threshold) * 1.0
        row = Metrics(labels, target)
        # True-positive rate is recall by definition.
        row['TPR'] = row['Recall']
        # False-positive rate: share of true negatives predicted positive.
        row['FPR'] = np.mean(labels[negatives])
        rows.append(row)
    # Build the table in one go instead of growing it row by row.
    return pd.DataFrame(rows,
                        index=pd.Index(thresholds, name='Threshold'),
                        columns=['Accuracy', 'Precision', 'Recall', 'F1', 'TPR', 'FPR'])


# Column 1 of predict_proba is the second class in model.classes_
# (presumably label 1 -- verify against the data).
ypred = model_lr.predict_proba(Xtest)[:, 1]
LR_df = _threshold_scores(ypred, ytest, Thresholds)

ypred = model_nn.predict_proba(Xtest)[:, 1]
NN_df = _threshold_scores(ypred, ytest, Thresholds)

In [348]:
from sklearn.metrics import auc
# Build one ROC trace and one area value per model from the threshold tables.
roc_traces = []
roc_areas = {}
for label, scores in (('LogisticRegression', LR_df), ('NeuralNetwork', NN_df)):
    # Pad the swept points with the (1, 1) and (0, 0) end points of the curve.
    fpr = np.hstack([1, scores['FPR'], 0])
    tpr = np.hstack([1, scores['TPR'], 0])
    roc_areas[label] = auc(fpr, tpr)
    roc_traces.append(go.Scatter(x=fpr, y=tpr, name=label))

area_LR = roc_areas['LogisticRegression']
area_NN = roc_areas['NeuralNetwork']

fig = go.Figure(data=roc_traces)
fig.update_layout(width=800, height=600, yaxis_title='TPR', xaxis_title='FPR', title='ROC curve')
fig.update_layout(title_x=0.5)
fig.show()


In [349]:
# Report the area under each ROC curve with six decimals.
print(f'AUC of Logistic Regression is {area_LR:f}')
print(f'AUC of Neural Network is {area_NN:f}')

AUC of Logistic Regression is 0.982789
AUC of Neural Network is 0.959477


In [350]:
# One figure per model: each metric plotted against the decision threshold.
for scores, model_title in ((LR_df, 'Logistic Regression'), (NN_df, 'Neural Network')):
    metric_traces = [
        go.Scatter(x=scores.index, y=scores[metric], name=metric)
        for metric in ('Accuracy', 'Precision', 'Recall', 'F1')
    ]
    fig = go.Figure(data=metric_traces)
    # Axis label spelling fixed (was 'Treshold').
    fig.update_layout(width=800, height=600, xaxis_title='Threshold', title=model_title)
    fig.update_layout(title_x=0.5)
    # Show the plot
    fig.show()

In [356]:
# Precision-recall curve for both models, one point per swept threshold.
pr_traces = [
    go.Scatter(x=scores['Recall'], y=scores['Precision'], name=label)
    for label, scores in (('Logistic Regression', LR_df), ('Neural Network', NN_df))
]

fig = go.Figure(data=pr_traces)
fig.update_layout(width=800, height=600, yaxis_title='Precision', xaxis_title='Recall', title='PR curve')
fig.update_layout(title_x=0.5)
fig.show()