## CIFA-10 Dataset Classification using Linear Discriminant Analysis (LDA)

### Summary

In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import Info
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

c:\Users\LAB\Anaconda3\envs\lab\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\LAB\Anaconda3\envs\lab\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
c:\Users\LAB\Anaconda3\envs\lab\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
class Config(Info):
    def __init__(self):
        super(Info, self).__init__()
        self.device = 'PC'
        self.dataset = 'CIFAR-10'
        self.test_size = 0.2
        self.feature_size = 3072
        self.method = 'LDA'
        self.reduction_method = [None, None] # method, n_components
        self.reduction_ratio = None
        self.iter = 10

In [3]:
cig = Config()
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: None
│    └────Component size: None
│    └────Feature Reduction Ratio: None
│
└──Iteration
    └────10


## Load CIFA-10 Dataset

In [4]:
transform = transforms.Compose([transforms.ToTensor()])

batch_size = 4

trainset = datasets.CIFAR10(root='../data', train=True, download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

testset = datasets.CIFAR10(root='../data', train=False, download=True, transform=transform)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

print(trainset.data.shape)
print(testset.data.shape)

Files already downloaded and verified
Files already downloaded and verified
(50000, 32, 32, 3)
(10000, 32, 32, 3)


In [5]:
features = trainset.data.reshape(-1, cig.feature_size)
target = trainset.targets

features = features / 255.

print(np.array(features).shape)
print(np.array(target).shape)

(50000, 3072)
(50000,)


In [6]:
comparison_acc = []
comparison_lda_time = []
comparison_time = []

## Linear Discriminant Analysis

In [7]:
avg_acc = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis()  # default = svd
    lda.fit(x_train, y_train)

    start = time.perf_counter()
    test_score = lda.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
print("Test set score: %.4f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

PC - CIFAR-10(80%) - LDA - 10 iteration
----------------------------------------
Test set score: 0.3608
All Test dataset Prediction Average Time at once : 52226.4600 sec
Divide the Prediction Time by Test size : 5.2226 ms


## LDA with Dimensijon Reduction (Feature 9)

In [8]:
cig.reduction_method = ['LDA', 9]
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: LDA
│    └────Component size: 9
│    └────Feature Reduction Ratio: 0.3%
│
└──Iteration
    └────10


In [14]:
from sklearn.utils.multiclass import unique_labels

avg_acc = []
avg_lda_time = []
avg_train_project_time = []
avg_test_project_time = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis(n_components=cig.reduction_method[1])  # default = svd

    lda_start = time.perf_counter()
    lda.fit_transform(x_train, y_train)
    lda_end = time.perf_counter() - lda_start
    avg_lda_time.append(lda_end)

    # lda_p_start = time.perf_counter()
    # x_train = lda.transform(x_train)
    # lda_p_end = time.perf_counter() - lda_p_start
    # avg_train_project_time.append(lda_p_end)

    lda_p_start = time.perf_counter()
    x_test = lda.transform(x_test)
    lda_p_end = time.perf_counter() - lda_p_start
    avg_test_project_time.append(lda_p_end)

    start = time.perf_counter()
    test_score = lda.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)

mean_lda_time = np.array(avg_lda_time).mean()
mean_train_project = np.array(avg_train_project_time).mean()
mean_test_project = np.array(avg_test_project_time).mean() / len(y_test) * 1e6

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_lda_time.append(mean_test_project)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
print("The number of components : %d(%f%)" % cig.reduction_method[1], cig.reduction_ratio)
print("Calculating Train dataset U*S*Vt Matrix Time : %.4f" % mean_lda_time, "sec")
print("Calculating Train dataset Projection Time : %.4f" % mean_train_project, "sec")
print("All Test dataset Projection Time : %.4f" % (mean_test_project*len(y_test)), "sec")
print("Divide the Projection Time by Test size : %f" % mean_test_project, "ms")
print("-----" * 8)
print("Test set score: %f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

ValueError: X has 2 features, but LinearDiscriminantAnalysis is expecting 3072 features as input.

## kNN with LDA Dimensijon Reduction (Feature 9)

In [15]:
cig.reduction_method = ['LDA', 9]
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: LDA
│    └────Component size: 9
│    └────Feature Reduction Ratio: 0.3%
│
└──Iteration
    └────10


In [17]:
from sklearn.utils.multiclass import unique_labels

avg_acc = []
avg_lda_time = []
avg_train_project_time = []
avg_test_project_time = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis(n_components=cig.reduction_method[1])  # default = svd

    lda_start = time.perf_counter()
    lda.fit(x_train, y_train)
    lda_end = time.perf_counter() - lda_start
    avg_lda_time.append(lda_end)

    lda_p_start = time.perf_counter()
    x_train = lda.transform(x_train)
    lda_p_end = time.perf_counter() - lda_p_start
    avg_train_project_time.append(lda_p_end)

    knn = KNeighborsClassifier(1, weights='distance', n_jobs=-1)
    knn.fit(x_train, y_train)

    lda_p_start = time.perf_counter()
    x_test = lda.transform(x_test)
    lda_p_end = time.perf_counter() - lda_p_start
    avg_test_project_time.append(lda_p_end)

    start = time.perf_counter()
    test_score = knn.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)


mean_lda_time = np.array(avg_lda_time).mean()
mean_train_project = np.array(avg_train_project_time).mean()
mean_test_project = np.array(avg_test_project_time).mean() / len(y_test) * 1e6

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_lda_time.append(mean_test_project)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
print("The number of components : %d(%f%)" % cig.reduction_method[1], cig.reduction_ratio)
print("Calculating Train dataset U*S*Vt Matrix Time : %.4f" % mean_lda_time, "sec")
print("Calculating Train dataset Projection Time : %.4f" % mean_train_project, "sec")
print("All Test dataset Projection Time : %.4f" % (mean_test_project*len(y_test)), "sec")
print("Divide the Projection Time by Test size : %f" % mean_test_project, "ms")
print("-----" * 8)
print("Test set score: %f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

PC - CIFAR-10(80%) - LDA - 10 iteration - LDA(reduction : 9%)
----------------------------------------


TypeError: not enough arguments for format string