## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

## Data Preprocessing

In [2]:
#the file ships without a header row, so pandas labels the columns by position (0, 1, 2, ...)
data = pd.read_csv(
    filepath_or_buffer = 'breast-cancer-wisconsin.data',
    header = None,
)

In [3]:
#preview the first rows of the raw frame (columns are labelled by position because header = None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
#collecting the columns that pandas could not parse as numbers
#testing for "not numeric" rather than dtype == 'object' also catches text columns
#that pandas loads with a dedicated string dtype (e.g. with pd.options.future.infer_string enabled)
catCols = [col for col in data.columns if not pd.api.types.is_numeric_dtype(data[col])]

In [5]:
#dumping rows with invalid feature values
print("Shape before removing invalid data " + str(data.shape))
for col in catCols:
    idx = data[data[col] == "?"].index
    data.drop(idx, inplace = True)
print("Shape after removing invalid data " + str(data.shape))

Shape before removing invalid data (699, 11)
Shape after removing invalid data (683, 11)


In [6]:
#converting categorical columns into numericals cols
for col in catCols:
    data[col] = data[col].astype('float64')

In [7]:
#dumping the 0 column as it is just the ID of the patient
data.drop(0, axis = 1, inplace = True)

In [8]:
#preview the frame again after the clean-up steps above
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [9]:
#splitting the data into features and labels
X = data.drop(10, axis = 1)
y = data[10]

In [10]:
#converting labels into +1 for malignant and 0 for benign
y = np.where(y == 4, 1, 0)

In [11]:
#Splitting the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [12]:
#Normalizing the dataset
mu = X_train.mean(axis = 0)
std = X_train.std(axis = 0)
X_train = (X_train - mu) / std
X_val = (X_val - mu) / std

In [13]:
#Converting the dataset into numpy arrays
X_train = X_train.to_numpy()
X_val = X_val.to_numpy()

## Modelling

In [14]:
#Created a model using OOP
class PGM():
    """Two-class probabilistic generative classifier with a shared covariance matrix.

    ``fit`` estimates the class priors, the per-class mean vectors and a single
    covariance matrix pooled over both classes, and folds them into a linear
    decision function ``a(x) = x . w + w0`` with

        w  = pinv(cov) @ (mu1 - mu2)
        w0 = -0.5 * mu1' pinv(cov) mu1 + 0.5 * mu2' pinv(cov) mu2 + log(gamma1 / gamma2)

    ``predict`` passes ``a`` through the logistic sigmoid and assigns label 1
    when the result is at least 0.5, otherwise label 0.

    Labels must be 1 (class C1) and 0 (class C2).
    """

    #Initialization
    def __init__(self):
        """Create an unfitted model; every learned attribute starts as None."""
        self.w = None #weight vector of the linear decision function
        self.w0 = None #bias term of the linear decision function
        self.cov = None #covariance matrix pooled over both classes
        self.s1 = None #scatter matrix of class C1 (label 1)
        self.s2 = None #scatter matrix of class C2 (label 0)
        self.mu1 = None #mean vector of class C1
        self.mu2 = None #mean vector of class C2
        self.gamma1 = None #prior probability of class C1
        self.gamma2 = None #prior probability of class C2
        self.n = None #total number of training examples
        self.n1 = None #number of class C1 examples
        self.n2 = None #number of class C2 examples
        self.x1 = None #training rows belonging to class C1
        self.x2 = None #training rows belonging to class C2
        self.pred = None #labels produced by the most recent predict() call

    #Fitting the model
    def fit(self, X_train, y_train):
        """Estimate priors, class means, pooled covariance and the linear weights.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Training features.
        y_train : array-like of shape (n_samples,)
            Training labels: 1 for class C1, 0 for class C2.

        Raises
        ------
        ValueError
            If either class has no training examples. The class mean and the
            log prior ratio are undefined in that case, and the parameters
            would otherwise silently become NaN or infinite.
        """
        #accept lists / DataFrames / Series as well as ndarrays
        X_train = np.asarray(X_train, dtype = float)
        y_train = np.asarray(y_train).ravel()
        isC1 = y_train == 1
        isC2 = y_train == 0

        self.n = X_train.shape[0] #get number of examples
        self.n1 = np.count_nonzero(isC1) #get number of c1 examples
        self.n2 = np.count_nonzero(isC2) #get number of c2 examples
        if self.n1 == 0 or self.n2 == 0:
            raise ValueError(
                "PGM.fit needs at least one example of each class "
                "(found %d with label 1 and %d with label 0)" % (self.n1, self.n2)
            )

        self.gamma1 = self.n1 / self.n #get gamma1(prob of C1)
        self.gamma2 = self.n2 / self.n #get gamma2(prob of C2)
        self.x1 = X_train[isC1] #get all c1 examples
        self.x2 = X_train[isC2] #get all c2 examples
        self.mu1 = self.x1.sum(axis = 0) / self.n1 #get mean vector of class1
        self.mu2 = self.x2.sum(axis = 0) / self.n2 #get mean vector of class2

        centered1 = self.x1 - self.mu1
        centered2 = self.x2 - self.mu2
        self.s1 = np.dot(centered1.T, centered1) #get s1 (scatter of class1 around its mean)
        self.s2 = np.dot(centered2.T, centered2) #get s2 (scatter of class2 around its mean)
        self.cov = (self.s1 + self.s2) / self.n #calculate covariance matrix using s1 and s2

        #pseudo-inverse so a singular covariance matrix does not abort the fit
        covInv = np.linalg.pinv(self.cov)
        self.w = np.dot(covInv, (self.mu1 - self.mu2)) #get optimal weights
        firstTermW0 = -0.5 * np.dot(self.mu1.T, np.dot(covInv, self.mu1)) #get the first part of w0
        secondTermW0 = 0.5 * np.dot(self.mu2.T, np.dot(covInv, self.mu2)) #get the second part of w0
        thirdTermW0 = np.log(self.gamma1 / self.gamma2) #get the third part of w0 (log prior ratio)
        self.w0 = firstTermW0 + secondTermW0 + thirdTermW0 #get w0

    #Class-1 probability
    def predict_proba(self, X_val):
        """Return the sigmoid of the linear activation ``X_val . w + w0`` for each row.

        Parameters
        ----------
        X_val : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,) with values in [0, 1].
        """
        a = np.dot(np.asarray(X_val, dtype = float), self.w) + self.w0 #linear activation
        return self.__logit__(a)

    #Prediction
    def predict(self, X_val):
        """Predict a 0/1 label for every row of ``X_val``.

        A row gets label 1 when its sigmoid output is >= 0.5, otherwise 0.
        The result is also stored in ``self.pred``.

        Parameters
        ----------
        X_val : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,) containing 0s and 1s.
        """
        self.pred = np.where(self.predict_proba(X_val) >= 0.5, 1, 0) #assign classes based on probability
        return self.pred

    #Logit function
    def __logit__(self, a):
        """Logistic sigmoid ``1 / (1 + exp(-a))``, evaluated without overflow.

        The method keeps its original name for compatibility even though the
        function it computes is the sigmoid (the inverse of the logit).
        """
        a = np.asarray(a, dtype = float)
        #exp is only ever taken of a non-positive number, so it cannot overflow
        e = np.exp(-np.abs(a))
        result = np.where(a >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
        #[()] turns a 0-d result back into a scalar and leaves real arrays unchanged
        return result[()]

    #Return weights
    def getWeights(self):
        """Return the fitted ``(w, w0)`` pair; both are None before ``fit`` is called."""
        return self.w, self.w0

## Fitting the Model and Predicting

In [15]:
#create an unfitted model instance
pgm = PGM()

In [16]:
#estimate the model parameters from the training split
pgm.fit(X_train, y_train)

In [17]:
#predict 0/1 labels for the validation split
pred = pgm.predict(X_val)

## Checking the accuracy of the model

In [18]:
#compare the predicted labels with the true validation labels
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95        79
           1       0.98      0.86      0.92        58

    accuracy                           0.93       137
   macro avg       0.94      0.92      0.93       137
weighted avg       0.94      0.93      0.93       137

