In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

## Perceptron Algorithm
Perceptron class implementing the algorithm

In [27]:
def step(z):
    return np.where(z > 0 , 1, 0)

In [28]:
class Perceptron:
    #initialising the class perceptron with parameters learning rate and n_iters denoting the epochs or no of iterations
    def __init__(self, learning_rate=0.01 , n_iters = 100):
        self.lr = learning_rate 
        self.n_iters = n_iters
        self.activation_func = step
        self.weights = None #defining the weights and bias attributes assigning them as None 
        self.bias = None

    # Fitting function of the class perceptron
    def fit(self,X,y):
        n_samples= X.shape #defining number of samples equal to the number of rows in X(i.e no of training samples)
        n_features = X.shape[1] #defining number of features equal to the number of columns in X
        
        #initialising parameters(i.e W(weights) and w_0(bias)) and setting them equal to 0 or 0 vector
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        y_ = step(y)
        
        #learning weights
        for _ in range(self.n_iters): #A loop for no of iterations
                for idx,x_i in enumerate(X): 
                    linear_output = np.dot(x_i, self.weights) + self.bias #evaluating weighted sum 
                    y_predicted = self.activation_func(linear_output) #activating using step function
                
                    #Update Rule
                    update = self.lr * (y_[idx] - y_predicted) 
                    self.weights += update * x_i
                    self.bias += update #Updating bias 
    
    
    def predict(self,X):
        linear_output = np.dot(X, self.weights) + self.bias #predicting the trained dataset
        y_predicted = self.activation_func(linear_output)
        return y_predicted


In [29]:
#Confusion matrix calculation for the dataset
def confusion_matrix(y_test,y_pred):
    mat = np.zeros((2,2))
    for i in range(len(y_test)):
        if(y_test[i] == 0 and y_pred[i] == 0):
            mat[0][0] += 1
        if(y_test[i] == 0 and y_pred[i] == 1):
            mat[0][1] += 1
        if(y_test[i] == 1 and y_pred[i] == 0):
            mat[1][0] += 1
        if(y_test[i] == 1 and y_pred[i] == 1):
            mat[1][1] += 1
    return mat

## Importing Data and Data Analysis

In [30]:
df=pd.read_csv("Dsata Set for Assignment 1 (1).csv") #Importing the data
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [31]:
df.drop("id",axis=1,inplace=True) #Dropping id feature as it is meaningless in terms of training
#Replacing M and B with 1 and 0(numeric data) according to the activation function
df["diagnosis"].replace('M',1,inplace=True) 
df["diagnosis"].replace('B',0,inplace=True)
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [32]:
#Dropping tuples or rows containing nan values
columns = list(df)
df = df.dropna()
df 

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


## Splitting the Data into train set and test set

In [33]:
acc = 0
pre = 0
rec = 0
for i in range(10):
    df_train = df.sample(frac = 2/3) #Taking 2/3 rd of the data for training
    df_test = df.drop(df_train.index) # Taking 1/3rd of the data for testing 
    
    #Splitting into target and training values
    X_cols = df.loc[:,df.columns!="diagnosis"].columns
    X_train = df_train[X_cols]
    X_train = X_train.to_numpy()
    X_test = df_test[X_cols]
    X_test = X_test.to_numpy()
    y_train = df_train["diagnosis"]
    y_train = y_train.to_numpy()
    y_test = df_test["diagnosis"]
    y_test = y_test.to_numpy()
    
    #Implementing perceptron
    p = Perceptron(learning_rate = 0.01,n_iters = 1000)
    p.fit(X_train,y_train)
    predictions = p.predict(X_test)
    
    #Confusion Matrix calculations for accuracy,precision and recall
    c_mat = confusion_matrix(y_test,predictions)
    acc = acc + (c_mat[0][0] + c_mat[1][1])*100 /(c_mat[0][0] + c_mat[0][1] + c_mat[1][0] + c_mat[1][1])
    pre = pre + (c_mat[0][0]*100) / (c_mat[0][1] + c_mat[0][0])
    rec = rec + (c_mat[0][0])*100 / (c_mat[0][0] + c_mat[1][0])
    
print("Accuracy : ", acc/10)
print("Precision :", pre/10)
print("Recall :", rec/10)

Accuracy :  89.68085106382978
Precision : 90.55615986327074
Recall : 93.29803700752261


## Perceptron Classifier (PM2)
Differentiating by changing the order of the rows

In [17]:
acc1 = 0
pre1 = 0
rec1 = 0
for i in range(10):
    df_train = df.sample(frac = 0.67) #Taking 2/3 rd of the data for training
    df_train = df_train.sample(frac = 1) #Randomly arranging the rows
    df_test = df.drop(df_train.index) #Taking 1/3rd of the data for testing
    
    #Splitting into target and training values
    X_cols = df.loc[:,df.columns!="diagnosis"].columns
    X_train = df_train[X_cols]
    X_train = X_train.to_numpy()
    X_test = df_test[X_cols]
    X_test = X_test.to_numpy()
    y_train = df_train["diagnosis"]
    y_train = y_train.to_numpy()
    y_test = df_test["diagnosis"]
    y_test = y_test.to_numpy()
    
    #Implementing perceptron
    p = Perceptron(learning_rate = 0.01,n_iters = 1000)
    p.fit(X_train,y_train)
    predictions = p.predict(X_test)
    
    #Confusion Matrix calculations for accuracy,precision and recall
    c_mat = confusion_matrix(y_test,predictions)
    acc1 = acc1 + (c_mat[0][0] + c_mat[1][1])*100 /(c_mat[0][0] + c_mat[0][1] + c_mat[1][0] + c_mat[1][1])
    pre1 = pre1 + (c_mat[0][0]*100) / (c_mat[0][1] + c_mat[0][0])
    rec1 = rec1 + (c_mat[0][0])*100 / (c_mat[0][0] + c_mat[1][0])
    
print("Accuracy : ", acc/10)
print("Precision :", pre/10)
print("Recall :", rec/10)

Accuracy :  87.65957446808511
Precision : 89.49329002020644
Recall : 91.64417897695066


## Is the data linearly separable?
Yes, we can say that the data is linearly separable as for 100000 epochs it is giving an accuracy of around 96%

## Differences between PM1 and PM2

Both of the classifiers perform the same, since they differ only in training set and testing set