In [1]:
import numpy as np
import pandas as pd
import sympy as sp

In [2]:
def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

In [3]:
def covariance_matrix(data, mean_vector):
        z_matrix = data - mean_vector
        cov_matrix = np.dot(z_matrix.T, z_matrix) / (data.shape[0] - 1)
        return cov_matrix

### PCA

In [4]:
class PCA:
    def __init__(self, n_components=0):
        self.d = n_components

    def fit(self, X):
        if self.d == 0:
            self.d = round(0.95*len(X))
        self.mean_vector = np.mean(X, axis = 0)
        self.cov_mat = covariance_matrix(X, self.mean_vector)
        self.eigen_values, self.eigen_vectors = np.linalg.eig(self.cov_mat)
        index = np.argsort(self.eigen_values)[::-1]
        self.sorted_eigen_values = self.eigen_values[index]
        self.sorted_eigen_vectors = self.eigen_vectors[index]
        self.final_eigen_vectors = self.sorted_eigen_vectors[:, :self.d]

    def transform(self, X):
        X1 = X.dot(self.final_eigen_vectors)
        return X1

### Bayes Classifier

In [5]:
class BayesClassifier:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        self.classes = y_train[0].unique()
        self.dimensions = len(X_train[0])
        self.apriori, self.mean_vector = {}, {}
        self.cov_mats, self.cov_dets = {}, {}
        self.inv_cov_mats = {}
        for c in self.classes:
            self.apriori[c] = len(y_train[y_train[0] == c]) / len(y_train)
            self.mean_vector[c] = np.mean(X_train[y_train[0] == c], axis=0)
            self.cov_mats[c] = covariance_matrix(np.array(X_train[y_train[0] == c]), self.mean_vector[c])
            self.cov_dets[c] = sp.Matrix(self.cov_mats[c]).det()
            self.inv_cov_mats[c] = np.linalg.inv(self.cov_mats[c])

    def predict(self, X_test):
        y_pred = []
        for X in X_test:
            probability_values = {}
            for c in self.classes:
                req_matrix = X - self.mean_vector[c]
                numerator = np.exp(-0.5 * np.dot(np.dot(req_matrix.T, self.inv_cov_mats[c]), req_matrix))
                denominator = np.power(2 * np.pi, self.dimensions / 2) * np.power(self.cov_dets[c], 0.5)
                probability_values[c] = self.apriori[c] * (numerator / denominator)
            y_pred.append(max(probability_values, key = probability_values.get))
        return np.array(y_pred)

### Load Data

In [6]:
dataset = pd.read_csv("gender.csv")
dataset

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,1,male,-0.066420,0.151611,0.027740,0.052771,-0.066105,-0.041232,-0.002637,-0.158467,...,0.025989,-0.001087,0.027260,-0.046754,-0.118619,-0.163774,-0.000590,-0.076400,0.107497,0.001567
1,2,male,-0.030614,0.049667,0.008084,-0.050324,0.007649,-0.063818,-0.019530,-0.119905,...,0.044229,-0.023900,-0.028108,0.040618,-0.146579,-0.141244,0.016162,0.017638,0.080610,-0.015930
2,3,male,-0.096178,0.061127,0.035326,-0.035388,-0.090728,-0.018634,-0.024315,-0.139786,...,0.111141,0.059436,-0.029222,0.042115,-0.222173,-0.116908,0.093428,0.017391,0.057652,0.086116
3,4,male,-0.103057,0.085044,0.078333,-0.035873,-0.028163,0.004924,0.007829,-0.017016,...,0.100793,-0.002644,-0.023388,0.029497,-0.139830,-0.119243,0.005306,-0.015100,0.161575,0.062462
4,5,male,-0.125815,0.120046,0.023131,-0.042901,0.038215,-0.049677,-0.054258,-0.130758,...,0.090197,0.067527,0.039926,0.047469,-0.056852,-0.076700,0.004966,0.028171,0.026041,0.084135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,796,female,-0.164731,0.064301,0.058630,-0.017420,-0.157600,-0.022536,0.002864,-0.072739,...,0.095115,0.007198,-0.004655,0.023957,-0.170753,-0.136630,0.041614,0.031600,0.019064,0.004384
796,797,female,-0.095308,0.051095,0.092913,-0.101745,-0.083153,-0.028159,0.009090,-0.114513,...,0.056078,0.119846,0.087470,0.017481,-0.096594,-0.084553,0.037709,0.030732,-0.083713,0.064970
797,798,female,-0.202852,0.037039,0.079731,-0.047156,-0.140062,-0.080246,0.057668,-0.122083,...,0.066954,0.035684,-0.023112,-0.030452,-0.154243,-0.188270,0.071086,0.037384,-0.006257,0.039977
798,799,female,-0.088300,0.063530,0.049627,-0.026011,-0.172773,0.086218,0.042710,-0.161852,...,0.039460,0.067547,0.040426,0.028007,-0.154515,-0.127736,0.046967,0.009701,-0.016942,0.048071


### Test Train Split

In [7]:
def train_test_split(dataset):
    classes = dataset.iloc[:, 1].unique()
    test = pd.DataFrame()
    train = pd.DataFrame()
    for c in classes:
        class_data = dataset[dataset.iloc[:, 1] == c]
        train = pd.concat([train, class_data.iloc[10:]])
        test = pd.concat([test, class_data.iloc[:10]])
    X_train, X_test = train.iloc[:, 2:].values, test.iloc[:, 2:].values
    y_train, y_test = train.iloc[:, 1].values, test.iloc[:, 1].values
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(dataset)

In [9]:
X_train

array([[-0.10175994,  0.09511936,  0.02239008, ...,  0.04522717,
         0.13483205,  0.0537758 ],
       [-0.12695727,  0.06544437, -0.01474994, ..., -0.02528606,
        -0.00342875,  0.05703329],
       [ 0.02178704,  0.0477692 ,  0.03115616, ..., -0.05274343,
         0.03425189,  0.04634342],
       ...,
       [-0.20285167,  0.0370395 ,  0.07973114, ...,  0.03738441,
        -0.00625749,  0.03997689],
       [-0.08829999,  0.06353012,  0.04962703, ...,  0.00970074,
        -0.01694169,  0.04807128],
       [-0.15620135,  0.05516458,  0.14271647, ..., -0.0102984 ,
        -0.02885648,  0.0753232 ]])

In [10]:
X_test

array([[-0.06641996,  0.15161145,  0.02773961, ..., -0.07640016,
         0.10749723,  0.00156654],
       [-0.03061386,  0.04966652,  0.00808374, ...,  0.0176384 ,
         0.08060966, -0.01592966],
       [-0.09617768,  0.06112669,  0.03532604, ...,  0.01739147,
         0.057652  ,  0.08611634],
       ...,
       [-0.1029727 ,  0.046464  ,  0.01968378, ..., -0.08885815,
         0.04931188,  0.01900873],
       [-0.13482405,  0.0933139 ,  0.10350525, ..., -0.1021332 ,
         0.01416106,  0.0113144 ],
       [-0.08694977,  0.1049448 ,  0.09312473, ..., -0.0812363 ,
         0.0733347 ,  0.05688613]])

In [11]:
y_train

array(['male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
      

In [12]:
y_test

array(['male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'female', 'female', 'female', 'female', 'female',
       'female', 'female', 'female', 'female', 'female'], dtype=object)

### Encoding Classes

In [13]:
y1_train = pd.DataFrame([1 if i == 'male' else 0 for i in y_train])
y1_test = pd.DataFrame([1 if i == 'male' else 0 for i in y_test])

In [14]:
y1_train

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
775,0
776,0
777,0
778,0


In [15]:
y1_test

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


### Before PCA

In [16]:
bc = BayesClassifier()
bc.fit(X_train, y1_train)

In [17]:
y_pred_1 = bc.predict(X_test)

In [18]:
final_df = pd.DataFrame({'Actual': y1_test[0], 'Predicted': y_pred_1})
final_df

Unnamed: 0,Actual,Predicted
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,0
8,1,1
9,1,1


In [19]:
print("Accuracy: ", accuracy(y1_test[0], y_pred) * 100, "%")

NameError: name 'y_pred' is not defined

### After PCA

In [None]:
pca = PCA()
pca.fit(X_train)

In [None]:
X_train_1 = pca.transform(X_train)

In [None]:
X_train_1

array([[-0.11812557,  0.11206056,  0.17354654, ...,  0.03422985,
        -0.12488713, -0.03848919],
       [-0.11831062,  0.10939732, -0.07867839, ...,  0.11657668,
        -0.04472894,  0.03018195],
       [-0.14082579,  0.29901296, -0.06524816, ...,  0.12879656,
        -0.05609639, -0.01706463],
       ...,
       [-0.44902883,  0.27109374,  0.01885488, ...,  0.17658329,
        -0.06398956,  0.0711844 ],
       [-0.41722372,  0.3484417 ,  0.01251727, ...,  0.15111654,
        -0.03854156,  0.00174527],
       [-0.44083786,  0.03686637, -0.01286635, ...,  0.10854336,
        -0.12479291,  0.06558614]])

In [None]:
X_test = pca.transform(X_test)
X_test_1

array([[-1.08610059e-01,  8.63233365e-02,  7.16832692e-02, ...,
        -8.20074347e-03, -1.60084183e-02, -1.50908739e-02],
       [-1.11054360e-01,  5.57420185e-02, -4.35212556e-02, ...,
        -6.64650349e-02,  3.06979245e-02, -8.30894943e-03],
       [-1.09418985e-01,  6.51051366e-02,  4.13778438e-02, ...,
        -5.51208648e-02, -3.88803532e-02, -1.15194261e-01],
       ...,
       [-4.73494828e-01,  3.72661775e-02,  1.62410315e-01, ...,
        -1.43670407e-02,  6.03796592e-02, -7.75930476e-02],
       [-3.90765189e-01,  2.31719551e-01,  4.72429367e-02, ...,
        -9.73414654e-02,  4.25723692e-02,  1.01343265e-02],
       [-3.76657635e-01,  1.19616704e-01,  1.55549463e-01, ...,
         2.83488103e-03,  1.01457812e-01, -8.91925310e-05]])

In [None]:
bc = BayesClassifier()
bc.fit(X_train_1, y1_train)

In [None]:
y_pred_2 = bc.predict(X_test)

In [None]:
final_df = pd.DataFrame({'Actual': y1_test[0], 'Predicted': y_pred_2})
final_df

Unnamed: 0,Actual,Predicted
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,0
8,1,1
9,1,1


In [None]:
print("Accuracy: ", accuracy(y1_test[0], y_pred_2) * 100, "%")

Accuracy:  85.0 %
