## Importing necessary libraries and loading our data into a numpy array
As KNN, data-scaling and importing datasets has already been covered in previous assignments, I will be importing sklearn modules for those functionalities.

Linear Discriminant Analysis has been implemented from scratch.

In [1]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

data = datasets.load_iris()

### Implement custom LDA class

In [2]:
class LinearDiscriminantAnalysis:
    def __init__(self, num_classes):
        self.eigenvectors = None
        self.num_classes = None
        if (num_classes is not None):
            self.num_classes = num_classes
    
    def process(self, data, labels):
        cov_w = 0
        height, width = data.shape
        cov_t = np.cov(data.T)*(height-1)
        
        uc = np.unique(labels)
        n_unique = len(uc)
        
        for i in range(0, n_unique):
            cov_items = np.flatnonzero(uc[i] == labels)
            cov_w += (len(cov_items)-1) * np.cov(data[cov_items].T)
        
        cov_b = cov_t - cov_w
        pinv_dot = np.linalg.pinv(cov_w).dot(cov_b)
        dump, eigenvectors = np.linalg.eigh(pinv_dot)
        
        # print(eigenvectors.shape)
        
        principal = data.dot(eigenvectors[:,::-1][:,:self.num_classes])
        
        # print(principal.shape)
        
        if self.num_classes == 2:
            if labels is not None:
                unique_labels = np.unique(labels)
                zip_dict = zip(['m', 'y', 'c'], unique_labels)
                for clr, lbl in zip_dict:
                    flat_ids = np.flatnonzero(labels==lbl)
                    c_data = principal[flat_ids]
                    argX = c_data[:,0]
                    argY = c_data[:,1]
                    plt.scatter(argX, argY, c=clr)
            else:
                plt.scatter(principal[:,0], principal[:,1])
            plt.show()
        
        return principal

### Pre-processing our data
Here, I have split the data into a 70:30 train:test split. Two datasets have been prepared, one without LDA and one with LDA (X_train and X_train_modified). All data used has been normalised.

In [3]:
y, X = data.target, data.data[:, :]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 31)

scaler = preprocessing.StandardScaler().fit(X_train)
LDA = LinearDiscriminantAnalysis(num_classes=3)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
X_train_modified, X_test_modified = LDA.process(X_train, y_train), LDA.process(X_test, y_test)

## TESTING ACCURACY ##

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_modified, y_train)
y_predicted_lda = knn.predict(X_test_modified)
print("Accuracy score with LDA:   ", accuracy_score(y_test, y_predicted_lda))

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_predicted = knn.predict(X_test)
print("Accuracy score without LDA:", accuracy_score(y_test, y_predicted))

# Accuracy score comes out to be:
# 0.977 when using LDA and
# 0.955 when not using LDA

Accuracy score with LDA:    0.9777777777777777
Accuracy score without LDA: 0.9555555555555556
