## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import sympy as sp

## Defined Classes and Functions

In [2]:
class PCA:
    def __init__(self, n_components=0):
        self.d1 = n_components

    def fit(self, X):
        if self.d1 == 0:
            self.d1 = round(0.95*len(X))
        self.mean_vector = np.mean(X, axis = 0)
        self.covariance_matrix = np.cov(X, rowvar=False)
        self.eigen_values, self.eigen_vectors = np.linalg.eig(self.covariance_matrix)
        ind = np.argsort(self.eigen_values)[::-1]
        self.sorted_eigen_values = self.eigen_values[ind]
        self.sorted_eigen_vectors = self.eigen_vectors[ind]
        self.final_eigen_vectors = self.sorted_eigen_vectors[:, :self.d1]

    def transform(self, X):
        X1 = np.dot(X, self.final_eigen_vectors)
        return X1

### The Classification I used for checking is Bayes Classification

In [3]:
class BayesClassifier:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        self.classes, self.class_counts = np.unique(y_train, return_counts=True)
        self.no_of_classes = len(self.classes)
        self.total_data_points = len(y_train)
        self.apriori_probabilities = self.class_counts/self.total_data_points
        self.class_split_training_data = {}
        for c in self.classes:
            self.class_split_training_data[c] = []
            for j in range(len(y_train)):
                if c == y_train[j]:
                    self.class_split_training_data[c].append(X_train[j])
            self.class_split_training_data[c] = np.array(self.class_split_training_data[c])
        self.mean_vectors = []
        for c in self.classes:
            self.mean_vectors.append(np.mean(self.class_split_training_data[c], axis = 0))
        self.covariance_matrices = []
        for c in self.classes:
            self.covariance_matrices.append(np.cov(self.class_split_training_data[c], rowvar=False))
        self.inverse_covariance_matrices = []
        for c in range(len(self.classes)):
            self.inverse_covariance_matrices.append(np.linalg.inv(self.covariance_matrices[c]))
        self.dimensions = len(X_train[0])
        self.det_covariance_matrices = []
        for c in range(len(self.classes)):
            self.det_covariance_matrices.append(sp.Matrix(self.covariance_matrices[c]).det())

    def predict(self, X_test):
        y_pred = []
        for X in X_test:
            probability_values = {}
            class_count = 0
            for c in self.classes:
                p_xw = np.exp(-0.5 * np.dot(np.dot((X - self.mean_vectors[class_count]).T, self.inverse_covariance_matrices[class_count]), (X - self.mean_vectors[class_count]))) / (((2 * np.pi) ** (self.dimensions / 2)) * np.power(self.det_covariance_matrices[c], 0.5))
                p_wx = p_xw * self.apriori_probabilities[class_count]
                probability_values[c] = p_wx
                class_count += 1
            y_pred.append(max(probability_values, key=probability_values.get))
        y_pred = np.array(y_pred)
        return y_pred

## Importing Dataset

In [14]:
dataset = pd.read_csv("gender.csv")
dataset

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,1,male,-0.066420,0.151611,0.027740,0.052771,-0.066105,-0.041232,-0.002637,-0.158467,...,0.025989,-0.001087,0.027260,-0.046754,-0.118619,-0.163774,-0.000590,-0.076400,0.107497,0.001567
1,2,male,-0.030614,0.049667,0.008084,-0.050324,0.007649,-0.063818,-0.019530,-0.119905,...,0.044229,-0.023900,-0.028108,0.040618,-0.146579,-0.141244,0.016162,0.017638,0.080610,-0.015930
2,3,male,-0.096178,0.061127,0.035326,-0.035388,-0.090728,-0.018634,-0.024315,-0.139786,...,0.111141,0.059436,-0.029222,0.042115,-0.222173,-0.116908,0.093428,0.017391,0.057652,0.086116
3,4,male,-0.103057,0.085044,0.078333,-0.035873,-0.028163,0.004924,0.007829,-0.017016,...,0.100793,-0.002644,-0.023388,0.029497,-0.139830,-0.119243,0.005306,-0.015100,0.161575,0.062462
4,5,male,-0.125815,0.120046,0.023131,-0.042901,0.038215,-0.049677,-0.054258,-0.130758,...,0.090197,0.067527,0.039926,0.047469,-0.056852,-0.076700,0.004966,0.028171,0.026041,0.084135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,796,female,-0.164731,0.064301,0.058630,-0.017420,-0.157600,-0.022536,0.002864,-0.072739,...,0.095115,0.007198,-0.004655,0.023957,-0.170753,-0.136630,0.041614,0.031600,0.019064,0.004384
796,797,female,-0.095308,0.051095,0.092913,-0.101745,-0.083153,-0.028159,0.009090,-0.114513,...,0.056078,0.119846,0.087470,0.017481,-0.096594,-0.084553,0.037709,0.030732,-0.083713,0.064970
797,798,female,-0.202852,0.037039,0.079731,-0.047156,-0.140062,-0.080246,0.057668,-0.122083,...,0.066954,0.035684,-0.023112,-0.030452,-0.154243,-0.188270,0.071086,0.037384,-0.006257,0.039977
798,799,female,-0.088300,0.063530,0.049627,-0.026011,-0.172773,0.086218,0.042710,-0.161852,...,0.039460,0.067547,0.040426,0.028007,-0.154515,-0.127736,0.046967,0.009701,-0.016942,0.048071


## Test-Train Split

In [15]:
types = dataset.iloc[:, 1].unique()
test_df = pd.DataFrame()
train_df = pd.DataFrame()
for t in types:
    type_df = dataset[dataset.iloc[:, 1] == t]
    train_df = pd.concat([train_df, type_df.iloc[10:]])
    test_df = pd.concat([test_df, type_df.iloc[:10]])

In [16]:
X_train = train_df.iloc[:, 2:].values
X_test = test_df.iloc[:, 2:].values
y_train = train_df.iloc[:, 1].values
y_test = test_df.iloc[:, 1].values

In [17]:
print(X_train.shape)

(780, 128)


In [18]:
print(X_test.shape)

(20, 128)


In [19]:
print(y_train.shape)

(780,)


In [20]:
print(y_test)

['male' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male'
 'female' 'female' 'female' 'female' 'female' 'female' 'female' 'female'
 'female' 'female']


## Encoding Dependent Variable

In [22]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)
print("y-test :",y_test)
print("y-train :",y_train)

y-test : [1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]
y-train : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## Before PCA

In [232]:
bc = BayesClassifier()
bc.fit(X_train, y_train)

In [233]:
y_pred_before = bc.predict(X_test)

In [234]:
final_df_before = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_before})
final_df_before

Unnamed: 0,Actual,Predicted
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,0
8,1,1
9,1,1


In [235]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred_before)
print(f"The Accuracy of this model is {accuracy*100}%")

The Accuracy of this model is 85.0%


## After PCA

In [236]:
pca = PCA()
pca.fit(X_train)

In [237]:
X_train = pca.transform(X_train)

In [238]:
X_train

array([[-1.38689026e-01,  1.14681215e-01,  1.97865498e-01, ...,
         1.23095188e-02, -9.89974993e-02,  1.57661843e-02],
       [-1.24415272e-01,  1.45949092e-01, -2.32155635e-04, ...,
         6.49741552e-02, -8.42331865e-02, -2.71605053e-02],
       [-1.18578741e-01,  2.89203706e-01, -1.19667837e-03, ...,
         1.09359351e-04, -6.56010804e-02, -3.29677962e-02],
       ...,
       [-4.18633075e-01,  2.35723714e-01,  4.32502059e-02, ...,
         1.08523556e-01, -5.66566182e-02, -9.52950966e-02],
       [-3.96593493e-01,  3.67760998e-01,  8.36882332e-02, ...,
         2.51239746e-02, -9.85750961e-02, -6.36760857e-02],
       [-4.09019305e-01,  7.41508877e-02,  1.63472845e-02, ...,
         6.04061814e-02, -5.63152133e-02, -1.10747364e-01]])

In [239]:
X_test = pca.transform(X_test)

In [240]:
X_test

array([[-0.10868886,  0.13282677,  0.04207608, ...,  0.01188005,
        -0.07689616, -0.06241607],
       [-0.10421986,  0.09721415, -0.07660516, ...,  0.05428965,
        -0.0839531 , -0.09280539],
       [-0.09098331,  0.10429021,  0.05737186, ...,  0.06249362,
        -0.00916363,  0.0373829 ],
       ...,
       [-0.46130381,  0.10282308,  0.16251835, ...,  0.02912312,
        -0.12385808,  0.00442813],
       [-0.37771255,  0.26660279,  0.00659197, ...,  0.07874259,
        -0.06290559, -0.11258269],
       [-0.39931161,  0.16671217,  0.15763826, ...,  0.0380619 ,
        -0.16741477, -0.06462711]])

In [241]:
bc = BayesClassifier()
bc.fit(X_train, y_train)

In [242]:
y_pred_after = bc.predict(X_test)

In [243]:
final_df_after = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_after})
final_df_after

Unnamed: 0,Actual,Predicted
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,0
8,1,1
9,1,1


In [244]:
accuracy = accuracy_score(y_test, y_pred_after)
print(f"The Accuracy of this model is {accuracy*100}%")

The Accuracy of this model is 85.0%
