## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Data Preprocessing

In [2]:
#loading the data file; it has no header row, so pandas labels the columns 0..13
data = pd.read_csv(
    'processed.cleveland.data',
    header=None,
)

In [3]:
#previewing the first five rows to sanity-check the load
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
#collecting the names of the non-numeric columns (presumably numeric features
#that were parsed as text because of "?" placeholders in the file)
#select_dtypes is used instead of comparing the dtype against 'object' so that
#columns inferred as pandas' dedicated string dtype are picked up as well
catCols = data.select_dtypes(exclude = 'number').columns.tolist()

In [5]:
#dropping every row that holds the "?" placeholder in any of the text-typed columns
print("Shape before removing invalid data " + str(data.shape))
#one boolean mask over all flagged columns replaces the per-column drop loop
has_placeholder = data[catCols].eq("?").any(axis=1)
data.drop(data.index[has_placeholder], inplace=True)
print("Shape after removing invalid data " + str(data.shape))

Shape before removing invalid data (303, 14)
Shape after removing invalid data (297, 14)


In [6]:
#casting the text-typed feature columns to float64 in a single astype call
data = data.astype({name: 'float64' for name in catCols})

In [7]:
#Separating the label column (13) from the remaining feature columns
y = data[13]
X = data.drop(columns=[13])

In [8]:
#Converting multiclass labels to binary labels: 0 -> -1, any other value -> +1
#The mapping reads the label column itself rather than y, so re-running this
#cell cannot remap the already-binarised -1 labels to +1
y = np.where(data[13] == 0, -1, 1)

In [9]:
#Holding out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
#Standardising both splits with the mean and standard deviation of the training split only
mu, std = X_train.mean(axis=0), X_train.std(axis=0)
#the right-hand side is evaluated in full before either name is rebound
X_train, X_val = (X_train - mu) / std, (X_val - mu) / std

In [11]:
#Appending a constant column of ones to the training and validation features
#(the result of each concatenation is a NumPy array)
X_train, X_val = (
    np.concatenate((split, np.ones((len(split), 1))), axis=1)
    for split in (X_train, X_val)
)

## Modelling

In [12]:
#Fisher linear discriminant for two classes, written as a small class
class lda():
    """Two-class Fisher linear discriminant.

    ``fit`` computes the projection ``w = pinv(S_w) @ (m_pos - m_neg)``, where
    ``m_pos`` / ``m_neg`` are the class means and ``S_w`` is the within-class
    scatter matrix. ``predict`` projects samples onto ``w`` and labels them
    ``1`` when the projection is ``>= threshold`` and ``-1`` otherwise.

    Labels are split by sign: ``y > 0`` is the positive class and ``y < 0`` the
    negative class. Rows labelled exactly ``0`` belong to neither class and are
    ignored by ``fit``.
    """

    #Initialization
    def __init__(self):
        """Create an unfitted model; every learned attribute starts as None."""
        self.w = None #projection weights, shape (n_features,)
        self.pos = None #training rows of the positive class
        self.neg = None #training rows of the negative class
        self.mp = None #positive class mean
        self.mn = None #negative class mean
        self.sp = None #positive class scatter matrix
        self.sn = None #negative class scatter matrix
        self.sw = None #within class scatter matrix (sp + sn)
        self.pred = None #labels returned by the most recent predict call

    #Fitting the model
    def fit(self, X_train, y_train):
        """Learn the projection weights from labelled training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y_train : array-like of length n_samples
            Labels; positive values mark one class, negative values the other.

        Raises
        ------
        ValueError
            If X_train is not 2-D, if the number of labels differs from the
            number of rows, or if either class has no samples.
        """
        #coerce to arrays so lists and DataFrames are handled the same way
        X_train = np.asarray(X_train, dtype = float)
        y_train = np.asarray(y_train).ravel()
        if X_train.ndim != 2:
            raise ValueError("X_train must be a 2-D array of shape (n_samples, n_features)")
        if y_train.shape[0] != X_train.shape[0]:
            raise ValueError("X_train and y_train must contain the same number of samples")

        pos_mask = y_train > 0
        neg_mask = y_train < 0
        #the mean of an empty class is NaN, which would silently poison the weights
        if not pos_mask.any() or not neg_mask.any():
            raise ValueError("y_train must contain at least one positive and one negative label")

        self.pos = X_train[pos_mask] #get all positive class features
        self.neg = X_train[neg_mask] #get all negative class features
        self.mp = self.pos.mean(axis = 0) #compute positive class mean
        self.mn = self.neg.mean(axis = 0) #compute negative class mean
        pos_centered = self.pos - self.mp
        neg_centered = self.neg - self.mn
        self.sp = np.dot(pos_centered.T, pos_centered) #positive class scatter matrix
        self.sn = np.dot(neg_centered.T, neg_centered) #negative class scatter matrix
        self.sw = self.sp + self.sn #within class scatter matrix
        #pseudo-inverse: still defined when sw is singular (e.g. a constant feature column)
        swInv = np.linalg.pinv(self.sw)
        self.w = np.dot(swInv, (self.mp - self.mn)) #compute weights

    #prediction
    def predict(self, X_val, threshold = 0):
        """Label samples by thresholding their projection onto the weights.

        Parameters
        ----------
        X_val : array-like of shape (n_samples, n_features)
            Samples to classify, with the same feature layout used in fit.
        threshold : float, default 0
            Cut-off applied to the projected values. It is used exactly as
            given; it is not derived from the class means.

        Returns
        -------
        numpy.ndarray
            1 where the projection is >= threshold, -1 elsewhere. The same
            array is also stored in ``self.pred``.

        Raises
        ------
        RuntimeError
            If called before fit.
        """
        if self.w is None:
            raise RuntimeError("lda.predict() called before fit()")
        scores = np.dot(np.asarray(X_val, dtype = float), self.w) #project onto w
        self.pred = np.where(scores >= threshold, 1, -1) #assign labels based on projections
        return self.pred

    #returns weight
    def getWeights(self):
        """Return the learned weight vector, or None if fit has not been called."""
        return self.w

## Training and Prediction

In [13]:
#instantiating the discriminant model; its weights stay None until fit is called
flda = lda()

In [14]:
#learning the projection weights from the training split
flda.fit(X_train, y_train)

In [15]:
#labelling the validation rows with the default threshold of 0
pred = flda.predict(X_val)
#trailing expression so the notebook displays the predicted labels
pred

array([-1, -1, -1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1, -1, -1,
        1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,
       -1, -1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1, -1, -1,
        1, -1, -1,  1,  1, -1, -1,  1,  1])

## Checking the accuracy of the model

In [16]:
#per-class precision, recall and F1 on the validation split
print(classification_report(y_true=y_val, y_pred=pred))

              precision    recall  f1-score   support

          -1       0.91      0.89      0.90        36
           1       0.84      0.88      0.86        24

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60

