In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
def train_test_split(data,split,randomize=True):
    """
    Parameters:
        data: numpy array of the dataset
        split: percentage of the samples required for the training data
        randomize: boolean representing whether you want to randomize the dataset
    Returns:
        train_X: numpy array containing training data
        train_y: numpy array containing the training labels
        test_X: numpy array containing testing data
        test_y: numpy array containing the testing labels
    """
    
    split_index = int(split*len(data))
    
    # randomly shuffles rows of the dataset
    if randomize == True:
        np.random.shuffle(data)
    
    
    X = data[:,:-1]
    y = data[:,-1]
    
    train_X = X[:split_index]
    train_y = y[:split_index]
    
    test_X = X[split_index:]
    test_y = y[split_index:]
    
    return train_X,train_y,test_X,test_y
    

In [3]:
class Perceptron:
    
    def predict(self,X):
        """
        Parameters:
            X: numpy array containing examples from which predictions needs to be made
        Returns:
            predictions: numpy array of the predictions
        """

        activations = X.dot(self.weights[1:])
        activations += self.weights[0]

        predictions  = np.heaviside(activations,0)


        return predictions
    
    def fit(self,train_X,train_y,iterations = 5000,learning_rate = 0.001):
        """
        Updates the weights using gradient descent using a fixed number of iterations and a learning rate
        Parameters:
            train_X: numpy array containing training data
            train_y: numpy array containing the training labels
            iterations: number of iterations for gradient descent
            learning_rate: learning rate for gradient descent
        """
        
        self.weights = np.random.normal(loc=0,scale=0.01,size=train_X.shape[1]+1)
        
        
        for i in tqdm(range(iterations)):

            predictions = self.predict(train_X)

            loss = np.sum(-1*(train_X.dot(self.weights[1:])+self.weights[0])*(train_y-predictions))

            for j,prediction in enumerate(predictions):
                self.weights += learning_rate*(np.insert(train_X[j],0,1))*(train_y[j]-prediction)
                
                
        print("Final loss = {:.2f}".format(loss))

In [4]:
def evaluate(train_X,train_y,test_X,test_y,perceptron):
    """
    Evaluates the training and testing accuracy given a perceptron model
    
    Parameters:
        train_X: numpy array containing training data
        train_y: numpy array containing the training labels
        test_X: numpy array containing testing data
        test_y: numpy array containing the testing labels
    """
    train_predictions = perceptron.predict(train_X)
    test_predictions = perceptron.predict(test_X)

    testing_accuracy = 100*np.sum((test_predictions == test_y))/len(test_y)
    training_accuracy = 100*np.sum((train_predictions == train_y))/len(train_y)

    print("Training accuracy = {:.2f}%".format(training_accuracy))
    print("Testing accuracy = {:.2f}%".format(testing_accuracy))

In [5]:
df=pd.read_csv('Breast_Cancer_Dataset.csv')

In [6]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
df.shape

(569, 32)

In [8]:
df.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,568.0,569.0,568.0,569.0,569.0,569.0,568.0,569.0,569.0,...,569.0,569.0,569.0,567.0,569.0,569.0,568.0,569.0,569.0,569.0
mean,30371830.0,14.116125,19.289649,92.023468,654.889104,0.09636,0.104341,0.088925,0.048919,0.181162,...,16.26919,25.677223,107.261213,881.402469,0.132369,0.254265,0.272455,0.114606,0.290076,0.083946
std,125020600.0,3.517065,4.301036,24.28565,351.914129,0.014064,0.052813,0.079734,0.038803,0.027414,...,4.833242,6.146258,33.602542,570.185385,0.022832,0.157336,0.208711,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.6975,16.17,75.2,420.3,0.08637,0.06492,0.029575,0.02031,0.1619,...,13.01,21.08,84.11,514.65,0.1166,0.1472,0.114475,0.06493,0.2504,0.07146
50%,906024.0,13.355,18.84,86.29,551.1,0.09587,0.09263,0.061545,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.22745,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.15,782.7,0.1053,0.1304,0.131,0.074,0.1957,...,18.79,29.72,125.4,1086.0,0.146,0.3391,0.3835,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [9]:
df.isna().sum()

id                         0
diagnosis                  0
radius_mean                1
texture_mean               0
perimeter_mean             1
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             1
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             1
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 2
smoothness_worst           0
compactness_worst          0
concavity_worst            1
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [10]:
column_means = df. mean()
df = df. fillna(column_means)

  column_means = df. mean()


In [11]:
df.drop(["id"], axis=1,inplace=True)
df.diagnosis = [1 if each == "M" else 0 for each in df.diagnosis]

In [12]:
first_column = df.pop('diagnosis')
df.insert(30, 'diagnosis', first_column)

In [13]:
data = df.values

In [14]:
train_X,train_y,test_X,test_y = train_test_split(data,0.67)
perceptron = Perceptron()
perceptron.fit(train_X,train_y,learning_rate = 0.001,iterations=5000)

100%|██████████| 5000/5000 [01:33<00:00, 53.36it/s]

Final loss = 537195.78





In [15]:
evaluate(train_X,train_y,test_X,test_y,perceptron)

Training accuracy = 90.55%
Testing accuracy = 86.17%


In [16]:
train_X_norm = (train_X - train_X.mean(axis=0)) / train_X.std(axis=0)
test_X_norm= (test_X - test_X.mean(axis=0)) / test_X.std(axis=0) 

In [17]:
train_X_norm,train_y,test_X_norm,test_y = train_test_split(data,0.67)
perceptron = Perceptron()
perceptron.fit(train_X_norm,train_y,learning_rate = 0.001,iterations=5000)

100%|██████████| 5000/5000 [01:23<00:00, 59.73it/s]

Final loss = 546262.41





In [18]:
evaluate(train_X_norm,train_y,test_X_norm,test_y,perceptron)


Training accuracy = 89.24%
Testing accuracy = 89.36%


In [25]:
train_X_shuffle1 = train_X[:, np.random.permutation(train_X.shape[1])]
train_X_shuffle1,train_y,test_X,test_y = train_test_split(data,0.67)
perceptron = Perceptron()
perceptron.fit(train_X_shuffle1,train_y,learning_rate = 0.001,iterations=5000)
evaluate(train_X_shuffle1,train_y,test_X,test_y,perceptron)



100%|██████████| 5000/5000 [01:18<00:00, 64.08it/s]

Final loss = 489241.27
Training accuracy = 90.29%
Testing accuracy = 88.30%





In [26]:
n_splits=10
for i in range(n_splits):
  train_X_shuffle = train_X[:, np.random.permutation(train_X.shape[1])]
  test_X_shuffle = test_X[:, np.random.permutation(test_X.shape[1])]
  train_X_shuffle,train_y,test_X_shuffle,test_y = train_test_split(data,0.67)
  perceptron = Perceptron()
  perceptron.fit(train_X_shuffle,train_y,learning_rate = 0.001,iterations=5000)
  evaluate(train_X_shuffle,train_y,test_X_shuffle,test_y,perceptron)


100%|██████████| 5000/5000 [01:17<00:00, 64.12it/s]


Final loss = 375962.78
Training accuracy = 92.91%
Testing accuracy = 89.89%


100%|██████████| 5000/5000 [01:18<00:00, 63.79it/s]


Final loss = 1040419.93
Training accuracy = 87.93%
Testing accuracy = 89.89%


100%|██████████| 5000/5000 [01:17<00:00, 64.14it/s]


Final loss = 545217.17
Training accuracy = 91.86%
Testing accuracy = 91.49%


100%|██████████| 5000/5000 [01:18<00:00, 63.61it/s]


Final loss = 575213.70
Training accuracy = 91.34%
Testing accuracy = 92.55%


100%|██████████| 5000/5000 [01:17<00:00, 64.39it/s]


Final loss = 4516044.45
Training accuracy = 69.55%
Testing accuracy = 71.28%


100%|██████████| 5000/5000 [01:17<00:00, 64.38it/s]


Final loss = 422485.46
Training accuracy = 91.60%
Testing accuracy = 93.62%


100%|██████████| 5000/5000 [01:18<00:00, 64.00it/s]


Final loss = 605477.61
Training accuracy = 91.08%
Testing accuracy = 92.02%


100%|██████████| 5000/5000 [01:19<00:00, 63.24it/s]


Final loss = 309354.44
Training accuracy = 93.18%
Testing accuracy = 89.89%


100%|██████████| 5000/5000 [01:18<00:00, 63.83it/s]


Final loss = 491939.30
Training accuracy = 91.34%
Testing accuracy = 92.02%


100%|██████████| 5000/5000 [01:18<00:00, 63.69it/s]

Final loss = 486149.01
Training accuracy = 92.39%
Testing accuracy = 92.02%



