In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import scipy.io


In [63]:
# Read the three MATLAB files (training features, training labels, test features).
mat_train = scipy.io.loadmat('..//data//data_train.mat')
mat_label_train = scipy.io.loadmat('..//data//label_train.mat')
mat_test = scipy.io.loadmat('..//data//data_test.mat')

# Show the keys of each loaded dict so the payload variable names are visible.
for mat_contents in (mat_train, mat_test, mat_label_train):
    print(mat_contents.keys())

dict_keys(['__header__', '__version__', '__globals__', 'data_train'])
dict_keys(['__header__', '__version__', '__globals__', 'data_test'])
dict_keys(['__header__', '__version__', '__globals__', 'label_train'])


In [64]:
# Pull the payload arrays out of the loaded .mat dictionaries.
data_train, label_train = mat_train['data_train'], mat_label_train['label_train']
data_test = mat_test['data_test']

In [65]:
# Sanity check: report the container type and shape of each array, in the
# order training features, training labels, test features.
for array in (data_train, label_train, data_test):
    print(type(array))
    print(array.shape)

<class 'numpy.ndarray'>
(577, 5)
<class 'numpy.ndarray'>
(577, 1)
<class 'numpy.ndarray'>
(23, 5)


In [66]:
# Wrap each raw array in a pandas DataFrame (default integer column names).
df_data_train, df_data_test, df_label_train = (
    pd.DataFrame(array) for array in (data_train, data_test, label_train)
)

In [67]:
# Persist the training labels as CSV, without the row index.
df_label_train.to_csv(path_or_buf='..//data//label_train.csv', index=False)

In [68]:
# First rows and summary statistics of the label frame, then the distinct
# label values found in its only column.
print(df_label_train.head(), df_label_train.describe(), sep='\n')
label_column = df_label_train[0]
print(label_column.unique())

   0
0  1
1  1
2  1
3  1
4  1
                0
count  577.000000
mean     0.043328
std      0.999928
min     -1.000000
25%     -1.000000
50%      1.000000
75%      1.000000
max      1.000000
[ 1 -1]


In [69]:
# Persist the training features as CSV, without the row index.
df_data_train.to_csv(path_or_buf='..//data//data_train.csv', index=False)

In [70]:
# Training frame = feature columns plus a 'label' column taken from the
# single column of the label frame; df_data_train itself is left untouched.
df_train = df_data_train.assign(label=df_label_train[0])

In [72]:
# Distinct class labels, in order of first appearance in the training frame.
classes = pd.unique(df_train['label'])
print(classes)

[ 1 -1]


In [73]:
# Split the training samples by class and keep only the feature columns
# as plain NumPy arrays (X1 <-> classes[0], X2 <-> classes[1]).
is_first_class = df_train['label'] == classes[0]
is_second_class = df_train['label'] == classes[1]
X1 = df_train.loc[is_first_class].drop(columns='label').to_numpy()
X2 = df_train.loc[is_second_class].drop(columns='label').to_numpy()

In [74]:
# Shapes of both class arrays first, then their (NumPy-abbreviated) contents.
for class_samples in (X1, X2):
    print(class_samples.shape)
for class_samples in (X1, X2):
    print(class_samples)

(301, 5)
(276, 5)
[[ 0.37924377  0.98937812 -2.10994043 -0.22980496 -0.92707999]
 [-2.25289823 -0.68884118 -0.79939513 -0.57921684 -0.16987767]
 [-1.1683507  -0.85683761  0.56697509  0.48048118 -0.47167249]
 ...
 [ 0.50132753 -0.56032849  0.54934915  0.01788818  0.05493268]
 [ 0.54031375 -1.22646737  0.46763432  0.65453522  1.39855244]
 [ 0.99075099  0.79295036  0.191457    1.25769877 -1.77564838]]
[[ 1.09055692  1.04846924  0.28444994  2.22921146  1.97506608]
 [ 0.77668448  3.46785973  1.52931945 -0.24349783  0.77325587]
 [-0.26892863  2.91898541  1.03554031  1.87105592  0.97823819]
 ...
 [-0.0724326   2.00792589  1.67974785  1.25719446  3.13608303]
 [ 2.87489219  2.06900942  2.72637984  3.25644852  0.16496699]
 [ 0.87026556 -0.07002803  1.50605951  0.02779781  3.45284453]]


In [75]:
# Per-feature mean vector of each class (average over the sample axis).
mean1 = X1.mean(axis=0)
mean2 = X2.mean(axis=0)
for class_mean in (mean1, mean2):
    print(class_mean)

[-0.04179197  0.06709806 -0.05396359  0.09123305 -0.01469081]
[1.04278079 2.06229576 0.94075223 1.25511355 1.60467669]


In [76]:
# Centre every class around its own mean vector.
X1_centered = np.subtract(X1, mean1)
X2_centered = np.subtract(X2, mean2)

# Per-class scatter matrix: sum of outer products of the centred samples.
S1 = np.matmul(X1_centered.T, X1_centered)
S2 = np.matmul(X2_centered.T, X2_centered)

# Within-class scatter matrix is the sum of the per-class scatter matrices.
Sw = np.add(S1, S2)
print(Sw)

[[ 5.68106935e+02  1.69888788e+01 -1.25504726e-01  2.11181122e+01
  -9.16624869e+00]
 [ 1.69888788e+01  5.67736243e+02 -1.74383674e+01  2.87697449e+01
  -8.01098501e+00]
 [-1.25504726e-01 -1.74383674e+01  5.40029844e+02  1.58065228e+01
  -8.42147620e+00]
 [ 2.11181122e+01  2.87697449e+01  1.58065228e+01  6.04840878e+02
  -8.35601990e+00]
 [-9.16624869e+00 -8.01098501e+00 -8.42147620e+00 -8.35601990e+00
   6.37307785e+02]]


In [77]:
# Weight vector: the solution w of  Sw @ w = (mean1 - mean2).
# Solving the linear system directly is preferred over forming the explicit
# inverse of Sw and multiplying: it does less work and is numerically more
# accurate, while yielding the same vector up to floating-point rounding.
w = np.linalg.solve(Sw, mean1 - mean2)
print(w)

# Bias term: chosen so that w . x + w0 == 0 at the midpoint of the two class
# means, i.e. the decision boundary passes halfway between mean1 and mean2.
w0 = -0.5 * (mean1 + mean2) @ w
print(w0)

[-0.00178602 -0.0034729  -0.00194673 -0.00168257 -0.00265808]
0.008700458474367304


In [78]:
# Decision rule of the linear discriminant
def predict(x):
    """Classify one sample with the linear discriminant.

    Evaluates ``np.dot(x, w) + w0`` using the notebook-level weight vector
    ``w`` and bias ``w0``.

    Parameters
    ----------
    x : array-like
        Feature vector of a single sample.

    Returns
    -------
    ``classes[0]`` when the discriminant value is strictly positive,
    otherwise (including exactly zero) ``classes[1]``.
    """
    score = np.dot(x, w) + w0
    if score > 0:
        return classes[0]
    return classes[1]

In [79]:
# Classify every test row, attach the result as a 'prediction' column and
# write the augmented table to CSV (without the row index).
# Note: from here on `data_test` is a DataFrame, no longer the raw array
# extracted from the .mat file.
predictions = df_data_test.apply(lambda sample: predict(sample.to_numpy()), axis=1)
data_test = df_data_test.assign(prediction=predictions)
data_test.to_csv(path_or_buf='..//data//data_test.csv', index=False)

In [None]:
# Show the first 23 rows of the test table together with their predictions.
print(data_test.head(n=23))

           0         1         2         3         4  prediction
0   0.005780  1.562600  0.447382  2.090916  0.897334          -1
1   1.229060  1.439845  0.877565 -0.119081  1.468714          -1
2   2.339889  0.968273 -0.342265  2.123420  0.848204          -1
3   0.232466  1.968615  0.359025  1.583467  0.803537          -1
4   0.458142  2.232859 -0.622579  1.563475  0.724611          -1
5   0.696818  2.393233 -1.181912  0.832575  1.532150          -1
6  -0.173921  1.709731  0.616747  1.233261  1.334690          -1
7   1.171005  1.299816  0.184931  1.827980  0.785178          -1
8   2.064855  0.830328  0.279357  1.601429  1.032416          -1
9   0.835157  0.929826 -0.577110  0.649153  0.182625           1
10  0.494841  1.605039  0.415776  0.846501 -1.551331           1
11  1.533796  1.066631 -1.991196  1.037367  0.334273           1
12 -0.478865  0.371465  0.941137  0.397740  0.892793           1
13 -0.632688 -0.622585 -0.491818  1.441855  2.314403           1
14 -1.045710  1.920303 -0