In [1]:
import sklearn
import numpy as np
import numpy.linalg as linalg
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Load the digits dataset once and read both fields from the same bunch
# (the original called load_digits() twice, building the dataset twice).
digits = load_digits()
data = digits['images']      # per-sample image arrays
targets = digits['target']   # per-sample class labels

In [3]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# shuffled split reproducible across runs.
data = np.asarray(data)
img_train, img_test, class_train, class_test = train_test_split(
  data,
  targets,
  test_size=0.20,
  shuffle=True,
  random_state=4598,
)

In [4]:
# Bucket the training images by their label: grouped_by_target[d] collects
# every training image whose class is d.
grouped_by_target = [[] for _bucket in range(10)]
for train_img, train_label in zip(img_train, class_train):
  grouped_by_target[train_label].append(train_img)

In [5]:
# Show how many training images landed in each class bucket, one count per line.
print('\n'.join(str(len(class_images)) for class_images in grouped_by_target))

147
140
139
156
149
138
138
146
141
143


In [6]:
# For every class, keep the leading left singular vectors of the matrix whose
# columns are that class's flattened training images. svd_bases[d] is a list
# of num_of_basis_to_keep 1-D basis vectors for class d.
num_of_basis_to_keep = 5
svd_bases = []
for digit_images in grouped_by_target:
  # One flattened image per column.
  matrix = np.asarray(digit_images).reshape(len(digit_images), -1).T
  # Only U is used, so request the reduced SVD instead of also building the
  # full square V^T (one row/column per training image).
  u = linalg.svd(matrix, full_matrices=False)[0]
  if num_of_basis_to_keep > u.shape[1]:
    raise ValueError(
      'num_of_basis_to_keep (%d) exceeds the %d available singular vectors'
      % (num_of_basis_to_keep, u.shape[1])
    )
  # Columns of U are already unit-norm and mutually orthogonal, so they are
  # stored as-is; singular values come back in descending order, so the first
  # columns are the dominant directions.
  svd_bases.append(list(u[:, :num_of_basis_to_keep].T))

In [7]:
def determine_dist(vec, ortho_normal_basis):
  """Return the Euclidean distance from `vec` to the span of a basis.

  `ortho_normal_basis` is an iterable of 1-D vectors with the same length as
  `vec`. The projection of `vec` onto their span is accumulated one basis
  vector at a time as dot(b, vec) * b, which is the true orthogonal projection
  only when the vectors are orthonormal. The norm of the residual
  `vec - projection` is returned; an empty basis yields the norm of `vec`.
  """
  projection = np.zeros(vec.shape[0])
  for basis_vec in ortho_normal_basis:
    coefficient = np.dot(basis_vec, vec)
    projection += coefficient * basis_vec
  residual = vec - projection
  return linalg.norm(residual)

# Column store for the results table: one distance column per class, followed
# by the true label and the label chosen by the SVD classifier.
dict_for_df = {}
for digit in range(10):
  dict_for_df[str(digit) + ' distance'] = []
dict_for_df['true classification'] = []
dict_for_df['SVD classification'] = []

# Classify each test image as the class whose basis leaves the smallest residual.
for img, true_class in zip(img_test, class_test):
  dict_for_df['true classification'].append(true_class)
  flat_img = img.reshape(64)
  # Seeded with a sentinel so the winner is (inf, -1) if no distance beats it;
  # tuple ordering breaks distance ties in favour of the lower class index.
  candidates = [(np.inf, -1)]
  for digit in range(10):
    dist = determine_dist(flat_img, svd_bases[digit])
    dict_for_df[str(digit)+' distance'].append(dist)
    candidates.append((dist, digit))
  dict_for_df['SVD classification'].append(min(candidates)[1])
testing_df = pd.DataFrame(dict_for_df)
testing_df.head()

Unnamed: 0,0 distance,1 distance,2 distance,3 distance,4 distance,5 distance,6 distance,7 distance,8 distance,9 distance,true classification,SVD classification
0,41.633678,37.718529,14.336049,32.15352,40.100416,36.965263,43.892855,34.893925,31.73926,36.374252,2,2
1,39.583299,36.904353,40.867171,33.966195,37.058024,19.707969,30.109091,42.923413,33.481328,31.19575,5,5
2,40.347282,33.390447,35.322445,32.180857,32.542393,35.007614,41.479536,20.091501,31.100718,34.455833,7,7
3,25.557353,32.462693,38.573624,36.19916,31.920102,34.29955,18.141238,41.461843,35.063403,33.593941,6,6
4,11.963808,36.768529,37.107295,39.066143,38.816234,34.839706,33.233199,46.460011,36.648558,35.800706,0,0


In [8]:
# Count the test rows where the SVD classifier disagrees with the true label.
count = testing_df['true classification'].ne(testing_df['SVD classification']).sum()
print(f'{count} misclassifications out of {len(testing_df)}')

12 misclassifications out of 360
