In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 24 17:05:51 2019

@author: nikhilyadav
"""


import pandas as p
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix 


#Function used to map malignant and benign to '0' and '1' respectively
def process_csv(filename):
    
    
    #the .replace function is used to replace all the M's with 0's and all the B's with 1's
    filename.replace(['M', 'B'],[1, 0], inplace = True)

    return (filename)




#Funciton is used to split the data into train data and test data
def preprocess_data(x ,y):
    
    
    #xtrain contains the training data from the csv excluding the label feature, i.e. M or B
    #ytrain contains the training data from the csv with only the label feature, i.e. M or B
    #xtest and ytest are testing data sets
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 0)
    
    #data normalization
    x_normalize = preprocessing.normalize(xtrain)

    x_test = preprocessing.normalize(xtest)
    #addition of axis to reshape the array into desired layout
    ytrain = ytrain[:,np.newaxis]
  
    ytest = ytest[:,np.newaxis]


    return (x_normalize, ytrain, ytest,x_test)

    
epochs = 17000
learningrate = 0.3


def sigmoid(z):
    return 1 / (1 + np.exp(-z)) 

#cost calculation for training and validation data
def train_model(xtrain, ytrain):    
    
    w = np.random.randn(xtrain.shape[1], 1)*0.01
    b = 0
    
    # split validation data with 10% for validation
    xtrain, xvalid, ytrain, yvalid = train_test_split(xtrain, ytrain, test_size = 0.1, random_state = 0)
    m = xtrain.shape[0]

    mvalid = xvalid.shape[0]

    losstrack =[]
    losstrack_valid = []
    
    for epoch in range(epochs):
        #hypothesis
        z = np.dot( xtrain,w) + b
        p = sigmoid(z)
        #training cost
        cost = -(np.sum(np.multiply(np.log(p), ytrain) + np.multiply((1 - ytrain), np.log(1 - p))))/m
        losstrack.append(np.squeeze(cost))
        #training cost end
        
        # validation cost
        zvalid = np.dot(xvalid,w) + b
        pvalid = sigmoid(zvalid)
        
        cost = -(np.sum(np.multiply(np.log(pvalid), yvalid) + np.multiply((1 - yvalid), np.log(1 - pvalid))))/mvalid
        losstrack_valid.append(np.squeeze(cost))
        # validation cost end
        
        dz = p-ytrain
        
        #gradient
        dw = (1 / m) * np.dot(xtrain.T, dz)

        db = (1 / m) * np.sum(dz)
        
        #update weights
        w = w - learningrate * dw
        b = b - learningrate * db
        

        
                                          
    
    validation_prediction = predict(w,b,yvalid,xvalid) 
    accuracy, precision, recall,f1_score = evaluate_accuracy(validation_prediction, yvalid) 
    ''''''print("Validation Accuracy: ",accuracy)
    print("Validation Precision: ",precision)
    print("Validation Recall: ",recall)
    print("Validation f1_score :", f1_score)
    
    plt.plot(losstrack, 'b', label = "Training Set")
    plt.plot(losstrack_valid,'r', label = "Validation Set")
    plt.xlabel("epochs")
    plt.ylabel("loss")
    plt.title("Loss vs Epoch")
    plt.legend()
    
    plt.show()
    
    return(w, b)

#
def predict(w, b, ytest, xtest):

    z = np.dot(xtest,w) + b
    prediction = sigmoid(z)
    
    for i in range(len(ytest)):    
        if(prediction[i]>0.5):
            prediction[i] = 1
        else:
            prediction[i] = 0
        
    return prediction
        

def evaluate_accuracy(prediction, ytest):
    
    tn, fp, fn, tp = confusion_matrix(prediction, ytest).ravel()

    accuracy = (tp+tn)/(tp+tn+fp+fn)
    precision = tp/(tp+fp)
    recall = tp/(tp + fn)
    f1_score = (2*recall*precision)/(recall+precision)

    return accuracy, precision, recall ,f1_score


if __name__ == "__main__":
    df = p.read_csv("wdbc.csv", header = None)
    
    df = process_csv(df)
    
    y = df[df.columns[1]]
    x = df[df.columns[2:]]
    
    xtrain, ytrain, ytest, xtest = preprocess_data(x, y)
    w,b, = train_model(xtrain, ytrain)
    test_predictions = predict(w, b, ytest,xtest)
    accuracy, precision, recall,f1_score = evaluate_accuracy(test_predictions, ytest)
    
    print("Accuracy : ",accuracy)
    print("Precision :", precision)
    print("Recall : ", recall)
    print("f1_score :", f1_score)

Validation Accuracy:  0.9782608695652174
Validation Precision:  1.0
Validation Recall:  0.9473684210526315
Validation f1_score : 0.972972972972973


<Figure size 640x480 with 1 Axes>

Accuracy :  0.9035087719298246
Precision : 0.7872340425531915
Recall :  0.9736842105263158
f1_score : 0.8705882352941177
