In [108]:
import numpy as np
import pandas as pd

In [109]:
def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

In [110]:
def covariance_matrix(data, mean_vector):
        z_matrix = data - mean_vector
        cov_matrix = np.dot(z_matrix.T, z_matrix) / (data.shape[0] - 1)
        return cov_matrix

In [111]:
def euclidean_distance(p1, p2):
    p1 = np.array(p1)
    p2 = np.array(p2)
    return np.sqrt(np.sum((p1 - p2)**2))

## Classes

### PCA

In [112]:
class PCA:
    def __init__(self, n_components=0):
        self.d = n_components

    def fit(self, X):
        self.mean_vector = np.mean(X, axis = 0)
        self.cov_mat = covariance_matrix(X, self.mean_vector)
        self.eigen_values, self.eigen_vectors = np.linalg.eig(self.cov_mat)
        index = np.argsort(self.eigen_values)[::-1]
        self.sorted_eigen_values = self.eigen_values[index]
        if self.d > 0 and self.d < 1:
            self.total_variance = np.sum(self.sorted_eigen_values)
            self.selected_eigen_values = []
            cum_variance = 0
            i = 0
            while cum_variance < self.d * self.total_variance:
                cum_variance += self.sorted_eigen_values[i]
                self.selected_eigen_values.append(self.sorted_eigen_values[i])
                i += 1
            self.selected_eigen_values = np.array(self.selected_eigen_values)
            self.d = len(self.selected_eigen_values)
        self.sorted_eigen_vectors = self.eigen_vectors[index]
        self.final_eigen_vectors = self.sorted_eigen_vectors[:, :self.d]

    def transform(self, X):
        X1 = X.dot(self.final_eigen_vectors)
        return X1

### KNN Classifier

In [113]:
class KNN():
    def __init__(self, k):
        self.k = k
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def knn(self, test_point):
        distances = []
        for i in range(self.X_train.shape[0]):
            train_point = self.X_train[i, :]
            dist = euclidean_distance(test_point, train_point)
            distances.append((dist, self.y_train[i]))
        distances.sort()
        return distances[:self.k]
    
    def predict(self, X_test):
        y_pred = []
        for i in range(X_test.shape[0]):
            test_point = X_test[i, :]
            k_nearest_neighbours = self.knn(test_point)
            nearest_labels = pd.DataFrame([label for _,label in k_nearest_neighbours])
            y_pred.append(nearest_labels.mode()[0][0])
        return np.array(y_pred)

### Load Data

In [114]:
dataset = pd.read_csv("gender.csv")
dataset

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,1,male,-0.066420,0.151611,0.027740,0.052771,-0.066105,-0.041232,-0.002637,-0.158467,...,0.025989,-0.001087,0.027260,-0.046754,-0.118619,-0.163774,-0.000590,-0.076400,0.107497,0.001567
1,2,male,-0.030614,0.049667,0.008084,-0.050324,0.007649,-0.063818,-0.019530,-0.119905,...,0.044229,-0.023900,-0.028108,0.040618,-0.146579,-0.141244,0.016162,0.017638,0.080610,-0.015930
2,3,male,-0.096178,0.061127,0.035326,-0.035388,-0.090728,-0.018634,-0.024315,-0.139786,...,0.111141,0.059436,-0.029222,0.042115,-0.222173,-0.116908,0.093428,0.017391,0.057652,0.086116
3,4,male,-0.103057,0.085044,0.078333,-0.035873,-0.028163,0.004924,0.007829,-0.017016,...,0.100793,-0.002644,-0.023388,0.029497,-0.139830,-0.119243,0.005306,-0.015100,0.161575,0.062462
4,5,male,-0.125815,0.120046,0.023131,-0.042901,0.038215,-0.049677,-0.054258,-0.130758,...,0.090197,0.067527,0.039926,0.047469,-0.056852,-0.076700,0.004966,0.028171,0.026041,0.084135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,796,female,-0.164731,0.064301,0.058630,-0.017420,-0.157600,-0.022536,0.002864,-0.072739,...,0.095115,0.007198,-0.004655,0.023957,-0.170753,-0.136630,0.041614,0.031600,0.019064,0.004384
796,797,female,-0.095308,0.051095,0.092913,-0.101745,-0.083153,-0.028159,0.009090,-0.114513,...,0.056078,0.119846,0.087470,0.017481,-0.096594,-0.084553,0.037709,0.030732,-0.083713,0.064970
797,798,female,-0.202852,0.037039,0.079731,-0.047156,-0.140062,-0.080246,0.057668,-0.122083,...,0.066954,0.035684,-0.023112,-0.030452,-0.154243,-0.188270,0.071086,0.037384,-0.006257,0.039977
798,799,female,-0.088300,0.063530,0.049627,-0.026011,-0.172773,0.086218,0.042710,-0.161852,...,0.039460,0.067547,0.040426,0.028007,-0.154515,-0.127736,0.046967,0.009701,-0.016942,0.048071


### Test Train Split

In [115]:
def train_test_split(dataset):
    classes = dataset.iloc[:, 1].unique()
    test = pd.DataFrame()
    train = pd.DataFrame()
    for c in classes:
        class_data = dataset[dataset.iloc[:, 1] == c]
        train = pd.concat([train, class_data.iloc[10:]])
        test = pd.concat([test, class_data.iloc[:10]])
    X_train, X_test = train.iloc[:, 2:].values, test.iloc[:, 2:].values
    y_train, y_test = train.iloc[:, 1].values, test.iloc[:, 1].values
    return X_train, X_test, y_train, y_test

In [116]:
X_train, X_test, y_train, y_test = train_test_split(dataset)

### Principal Component Analysis

In [117]:
pca = PCA(0.95)
pca.fit(X_train)

In [118]:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

### KNN Classifier and Accuracy

In [119]:
knn = KNN(5)
knn.fit(X_train_pca, y_train)

In [120]:
y_pred = knn.predict(X_test_pca)

In [121]:
final_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(final_df)

    Actual Predicted
0     male      male
1     male      male
2     male      male
3     male      male
4     male      male
5     male      male
6     male      male
7     male    female
8     male      male
9     male      male
10  female      male
11  female      male
12  female    female
13  female    female
14  female    female
15  female    female
16  female    female
17  female    female
18  female    female
19  female    female


### Accuracy

In [122]:
print("Accuracy: ", accuracy(y_test, y_pred) * 100, "%")

Accuracy:  85.0 %
