### Import Libraries

In [6]:
import numpy as np
import pandas as pd
import sympy as sp

### Required Functions

In [2]:
def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

In [3]:
def mean_vector(x):
    return np.mean(x, axis=0)

In [4]:
def euclidean_distance(p1, p2):
    p1 = np.array(p1)
    p2 = np.array(p2)
    return np.sqrt(np.sum((p1 - p2)**2))

### Load Data

In [7]:
data = pd.read_csv("gender.csv")
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.rename(columns = {'Unnamed: 1': 'class'}, inplace = True)

In [11]:
data

Unnamed: 0,class,0,1,2,3,4,5,6,7,8,...,118,119,120,121,122,123,124,125,126,127
0,male,-0.066420,0.151611,0.027740,0.052771,-0.066105,-0.041232,-0.002637,-0.158467,0.130467,...,0.025989,-0.001087,0.027260,-0.046754,-0.118619,-0.163774,-0.000590,-0.076400,0.107497,0.001567
1,male,-0.030614,0.049667,0.008084,-0.050324,0.007649,-0.063818,-0.019530,-0.119905,0.186553,...,0.044229,-0.023900,-0.028108,0.040618,-0.146579,-0.141244,0.016162,0.017638,0.080610,-0.015930
2,male,-0.096178,0.061127,0.035326,-0.035388,-0.090728,-0.018634,-0.024315,-0.139786,0.052211,...,0.111141,0.059436,-0.029222,0.042115,-0.222173,-0.116908,0.093428,0.017391,0.057652,0.086116
3,male,-0.103057,0.085044,0.078333,-0.035873,-0.028163,0.004924,0.007829,-0.017016,0.114907,...,0.100793,-0.002644,-0.023388,0.029497,-0.139830,-0.119243,0.005306,-0.015100,0.161575,0.062462
4,male,-0.125815,0.120046,0.023131,-0.042901,0.038215,-0.049677,-0.054258,-0.130758,0.173457,...,0.090197,0.067527,0.039926,0.047469,-0.056852,-0.076700,0.004966,0.028171,0.026041,0.084135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,female,-0.164731,0.064301,0.058630,-0.017420,-0.157600,-0.022536,0.002864,-0.072739,0.030554,...,0.095115,0.007198,-0.004655,0.023957,-0.170753,-0.136630,0.041614,0.031600,0.019064,0.004384
796,female,-0.095308,0.051095,0.092913,-0.101745,-0.083153,-0.028159,0.009090,-0.114513,0.157421,...,0.056078,0.119846,0.087470,0.017481,-0.096594,-0.084553,0.037709,0.030732,-0.083713,0.064970
797,female,-0.202852,0.037039,0.079731,-0.047156,-0.140062,-0.080246,0.057668,-0.122083,0.165443,...,0.066954,0.035684,-0.023112,-0.030452,-0.154243,-0.188270,0.071086,0.037384,-0.006257,0.039977
798,female,-0.088300,0.063530,0.049627,-0.026011,-0.172773,0.086218,0.042710,-0.161852,0.185083,...,0.039460,0.067547,0.040426,0.028007,-0.154515,-0.127736,0.046967,0.009701,-0.016942,0.048071


### Train Test Split

In [13]:
def train_test_split(dataset):
    classes = dataset['class'].unique()
    test, train = pd.DataFrame(), pd.DataFrame()
    for c in classes:
        class_data = dataset[dataset['class'] == c]
        test = pd.concat([test, class_data.iloc[:10]], ignore_index=True)
        train = pd.concat([train, class_data.iloc[10:]], ignore_index=True)
    X_train, X_test = train.iloc[:, 1:].values, test.iloc[:, 1:].values
    y_train, y_test = train.iloc[:, 0].values, test.iloc[:, 0].values
    return X_train, X_test, y_train, y_test

### LDA Class

In [9]:
class LDA():
    def __init__(self, n_components = 2):
        self.n_components = n_components

    def fit(self, train: pd.DataFrame, test: pd.DataFrame):
        classes = test['class'].unique()
        self.wc_scatter_matrices = []
        self.mean_vectors = []

        for c in classes:
            class_data_set = train[test['class'] == c]
            self.mean_vectors.append(mean_vector(class_data_set))
        
        for i in range(len(classes)):
            si = np.zeros((len(classes), len(classes)))
            for j in range(len(classes[i])):
                required_vector = classes[i][j] - self.mean_vectors[i]
                si += np.dot(required_vector, required_vector.T)
            self.wc_scatter_matrices.append(si)
        
        self.sw = np.zeros((len(classes), len(classes)))
        for i in range(len(classes)):
            self.sw += self.wc_scatter_matrices[i]

        self.sb = np.zeros((len(classes), len(classes)))
        for i in range(len(classes)):
            mean_vector = mean_vector(train)
            required_vector = self.mean_vectors[i] - mean_vector
            self.sb += np.dot(required_vector, required_vector.T)
            
        self.linear_discriminants = np.dot(np.linalg.inv(self.sw), self.sb)
        self.eigen_values, self.eigen_vectors = np.linalg.eig(self.linear_discriminants)

        self.eig_pairs = [(np.abs(self.eigen_values[i]), self.eigen_vectors[:,i]) for i in range(len(self.eigen_values))]
        self.eig_pairs = sorted(self.eig_pairs, key=lambda k: k[0], reverse=True)

        self.selected_eigen_vectors = []
        for i in range(self.n_components):
            self.selected_eigen_vectors.append(self.eig_pairs[i][1].reshape(len(classes),1))
        
        self.final_matrix = np.hstack(self.selected_eigen_vectors)
    
    def transform(self, train: pd.DataFrame, test: pd.DataFrame):
        return np.dot(train, self.final_matrix)

### KNN Classifier Class

In [10]:
class KNN():
    def __init__(self, k):
        self.k = k
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def knn(self, test_point):
        distances = []
        for i in range(self.X_train.shape[0]):
            train_point = self.X_train[i, :]
            dist = euclidean_distance(test_point, train_point)
            distances.append((dist, self.y_train[i]))
        distances.sort()
        return distances[:self.k]
    
    def predict(self, X_test):
        y_pred = []
        for i in range(X_test.shape[0]):
            test_point = X_test[i, :]
            k_nearest_neighbours = self.knn(test_point)
            nearest_labels = pd.DataFrame([label for _,label in k_nearest_neighbours])
            y_pred.append(nearest_labels.mode()[0][0])
        return np.array(y_pred)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data)

### Linear Discriminant Analysis

In [16]:
lda = LDA(n_components=20)
lda.fit(X_train, X_test)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices