## CIFA-10 Dataset Classification using Linear Discriminant Analysis (LDA)

### Summary

In [12]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils.util import Info
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

In [13]:
class Config(Info):
    def __init__(self):
        super(Info, self).__init__()
        self.device = 'PC'
        self.dataset = 'CIFAR-10'
        self.test_size = 0.2
        self.feature_size = 3072
        self.method = 'LDA'
        self.reduction_method = [None, None] # method, n_components
        self.reduction_ratio = None
        self.iter = 10

In [14]:
cig = Config()
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: None
│    └────Component size: None
│    └────Feature Reduction Ratio: None%
│
└──Iteration
    └────10


## Load CIFA-10 Dataset

In [15]:
transform = transforms.Compose([transforms.ToTensor()])

batch_size = 4

trainset = datasets.CIFAR10(root='../data', train=True, download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

testset = datasets.CIFAR10(root='../data', train=False, download=True, transform=transform)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

print(trainset.data.shape)
print(testset.data.shape)

Files already downloaded and verified
Files already downloaded and verified
(50000, 32, 32, 3)
(10000, 32, 32, 3)


In [16]:
features = trainset.data.reshape(-1, cig.feature_size)
target = trainset.targets

features = features / 255.

print(np.array(features).shape)
print(np.array(target).shape)

(50000, 3072)
(50000,)


In [17]:
comparison_acc = []
comparison_lda_time = []
comparison_time = []

## Linear Discriminant Analysis

In [12]:
avg_acc = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis()  # default = svd
    lda.fit(x_train, y_train)

    start = time.perf_counter()
    test_score = lda.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
print("Test set score: %.4f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

PC - CIFAR-10(80%) - LDA - 10 iteration
----------------------------------------
Test set score: 0.3608
All Test dataset Prediction Average Time at once : 64864.9300 sec
Divide the Prediction Time by Test size : 6.4865 ms


## LDA with Dimension Reduction (Feature 2,304) (75%)

In [26]:
cig.reduction_method = ['PCA', 2304]
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: PCA
│    └────Component size: 2304
│    └────Feature Reduction Ratio: 75.0%
│
└──Iteration
    └────10


In [28]:
from sklearn.utils.multiclass import unique_labels

avg_acc = []
avg_pca_time = []
avg_train_project_time = []
avg_test_project_time = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis()  # default = svd

    pca = PCA(n_components=cig.reduction_method[1], random_state=seed)
    pca_start = time.perf_counter()
    pca.fit(x_train)
    pca_end = time.perf_counter() - pca_start
    avg_pca_time.append(pca_end)

    pca_p_start = time.perf_counter()
    x_train = pca.transform(x_train)
    pca_p_end = time.perf_counter() - pca_p_start
    avg_train_project_time.append(pca_p_end)

    lda.fit(x_train, y_train)

    pca_p_start = time.perf_counter()
    x_test = pca.transform(x_test)
    pca_p_end = time.perf_counter() - pca_p_start
    avg_test_project_time.append(pca_p_end)

    start = time.perf_counter()
    test_score = lda.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)

mean_lda_time = np.array(avg_pca_time).mean()
mean_train_project = np.array(avg_train_project_time).mean()
mean_test_project = np.array(avg_test_project_time).mean() / len(y_test) * 1e6

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_lda_time.append(mean_test_project)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
# print("The number of components : %d(%f%)" % cig.reduction_method[1], cig.reduction_ratio)
print("Calculating Train dataset U*S*Vt Matrix Time : %.4f" % mean_lda_time, "sec")
print("Calculating Train dataset Projection Time : %.4f" % mean_train_project, "sec")
print("All Test dataset Projection Time : %.4f" % (mean_test_project*len(y_test)), "sec")
print("Divide the Projection Time by Test size : %f" % mean_test_project, "ms")
print("-----" * 8)
print("Test set score: %f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

PC - CIFAR-10(80%) - LDA - 10 iteration - PCA(feature 2304)
----------------------------------------
Calculating Train dataset U*S*Vt Matrix Time : 66.9630 sec
Calculating Train dataset Projection Time : 3.4375 sec
All Test dataset Projection Time : 785543.0400 sec
Divide the Projection Time by Test size : 78.554304 ms
----------------------------------------
Test set score: 0.375050
All Test dataset Prediction Average Time at once : 53000.9900 sec
Divide the Prediction Time by Test size : 5.3001 ms


## LDA with Dimension Reduction (Feature 512) (16.7%)

In [29]:
cig.reduction_method = ['PCA', 512]
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: PCA
│    └────Component size: 512
│    └────Feature Reduction Ratio: 16.7%
│
└──Iteration
    └────10


In [30]:
from sklearn.utils.multiclass import unique_labels

avg_acc = []
avg_pca_time = []
avg_train_project_time = []
avg_test_project_time = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis()  # default = svd

    pca = PCA(n_components=cig.reduction_method[1], random_state=seed)
    pca_start = time.perf_counter()
    pca.fit(x_train)
    pca_end = time.perf_counter() - pca_start
    avg_pca_time.append(pca_end)

    pca_p_start = time.perf_counter()
    x_train = pca.transform(x_train)
    pca_p_end = time.perf_counter() - pca_p_start
    avg_train_project_time.append(pca_p_end)

    lda.fit(x_train, y_train)

    pca_p_start = time.perf_counter()
    x_test = pca.transform(x_test)
    pca_p_end = time.perf_counter() - pca_p_start
    avg_test_project_time.append(pca_p_end)

    start = time.perf_counter()
    test_score = lda.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)

mean_lda_time = np.array(avg_pca_time).mean()
mean_train_project = np.array(avg_train_project_time).mean()
mean_test_project = np.array(avg_test_project_time).mean() / len(y_test) * 1e6

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_lda_time.append(mean_test_project)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
# print("The number of components : %d(%f%)" % cig.reduction_method[1], cig.reduction_ratio)
print("Calculating Train dataset U*S*Vt Matrix Time : %.4f" % mean_lda_time, "sec")
print("Calculating Train dataset Projection Time : %.4f" % mean_train_project, "sec")
print("All Test dataset Projection Time : %.4f" % (mean_test_project*len(y_test)), "sec")
print("Divide the Projection Time by Test size : %f" % mean_test_project, "ms")
print("-----" * 8)
print("Test set score: %f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

PC - CIFAR-10(80%) - LDA - 10 iteration - PCA(feature 512)
----------------------------------------
Calculating Train dataset U*S*Vt Matrix Time : 11.3996 sec
Calculating Train dataset Projection Time : 1.2722 sec
All Test dataset Projection Time : 271927.0200 sec
Divide the Projection Time by Test size : 27.192702 ms
----------------------------------------
Test set score: 0.410760
All Test dataset Prediction Average Time at once : 14529.3400 sec
Divide the Prediction Time by Test size : 1.4529 ms


## LDA with Dimension Reduction (Feature 9) (0.29%)

In [11]:
cig.reduction_method = ['PCA', 9]
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: PCA
│    └────Component size: 9
│    └────Feature Reduction Ratio: 0.3%
│
└──Iteration
    └────10


In [18]:
from sklearn.utils.multiclass import unique_labels

avg_acc = []
avg_pca_time = []
avg_train_project_time = []
avg_test_project_time = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis()  # default = svd

    pca = PCA(n_components=cig.reduction_method[1], random_state=seed)
    pca_start = time.perf_counter()
    pca.fit(x_train)
    pca_end = time.perf_counter() - pca_start
    avg_pca_time.append(pca_end)

    pca_p_start = time.perf_counter()
    x_train = pca.transform(x_train)
    pca_p_end = time.perf_counter() - pca_p_start
    avg_train_project_time.append(pca_p_end)

    lda.fit(x_train, y_train)

    pca_p_start = time.perf_counter()
    x_test = pca.transform(x_test)
    pca_p_end = time.perf_counter() - pca_p_start
    avg_test_project_time.append(pca_p_end)

    start = time.perf_counter()
    test_score = lda.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)

mean_lda_time = np.array(avg_pca_time).mean()
mean_train_project = np.array(avg_train_project_time).mean()
mean_test_project = np.array(avg_test_project_time).mean() / len(y_test) * 1e6

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_lda_time.append(mean_test_project)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
# print("The number of components : %d(%f%)" % cig.reduction_method[1], cig.reduction_ratio)
print("Calculating Train dataset U*S*Vt Matrix Time : %.4f" % mean_lda_time, "sec")
print("Calculating Train dataset Projection Time : %.4f" % mean_train_project, "sec")
print("All Test dataset Projection Time : %.4f" % (mean_test_project*len(y_test)), "sec")
print("Divide the Projection Time by Test size : %f" % mean_test_project, "ms")
print("-----" * 8)
print("Test set score: %f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

PC - CIFAR-10(80%) - LDA - 10 iteration
----------------------------------------
Calculating Train dataset U*S*Vt Matrix Time : 25.0434 sec
Calculating Train dataset Projection Time : 2.5932 sec
All Test dataset Projection Time : 636338.9500 sec
Divide the Projection Time by Test size : 63.633895 ms
----------------------------------------
Test set score: 0.360800
All Test dataset Prediction Average Time at once : 33161.9800 sec
Divide the Prediction Time by Test size : 3.3162 ms


## LDA with LDA Dimension Reduction (Feature 9)

In [19]:
cig.reduction_method = ['LDA', 9]
cig.info()

Device ── PC
│
├──Dataset
│    └────CIFAR-10
│    └────Train size 80%
│    └────Feature size: 3072
│
├──Method
│    └────LDA
│
├──Dimension reduction
│    └────Method: LDA
│    └────Component size: 9
│    └────Feature Reduction Ratio: 0.3%
│
└──Iteration
    └────10


In [20]:
from sklearn.utils.multiclass import unique_labels

avg_acc = []
avg_lda_time = []
avg_train_project_time = []
avg_test_project_time = []
avg_time = []

max_seed = cig.iter

for seed in range(max_seed):
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=cig.test_size, random_state=seed, shuffle=True)

    lda = LinearDiscriminantAnalysis(n_components=cig.reduction_method[1])  # default = svd

    lda_start = time.perf_counter()
    lda.fit(x_train, y_train)
    lda_end = time.perf_counter() - lda_start
    avg_lda_time.append(lda_end)

    lda_p_start = time.perf_counter()
    x_train = lda.transform(x_train)
    lda_p_end = time.perf_counter() - lda_p_start
    avg_train_project_time.append(lda_p_end)

    knn = KNeighborsClassifier(1, weights='distance', n_jobs=-1)
    knn.fit(x_train, y_train)

    lda_p_start = time.perf_counter()
    x_test = lda.transform(x_test)
    lda_p_end = time.perf_counter() - lda_p_start
    avg_test_project_time.append(lda_p_end)

    start = time.perf_counter()
    test_score = knn.score(x_test, y_test)
    end = time.perf_counter() - start

    avg_acc.append(test_score)
    avg_time.append(end)


mean_lda_time = np.array(avg_lda_time).mean()
mean_train_project = np.array(avg_train_project_time).mean()
mean_test_project = np.array(avg_test_project_time).mean() / len(y_test) * 1e6

mean_acc = np.array(avg_acc).mean()
mean_time = np.array(avg_time).mean() / len(y_test) * 1e6

comparison_acc.append(mean_acc)
comparison_lda_time.append(mean_test_project)
comparison_time.append(mean_time)

cig.print_rutin()
print("-----" * 8)
# print("The number of components : %d(%f%)" % cig.reduction_method[1], cig.reduction_ratio)
print("Calculating Train dataset U*S*Vt Matrix Time : %.4f" % mean_lda_time, "sec")
print("Calculating Train dataset Projection Time : %.4f" % mean_train_project, "sec")
print("All Test dataset Projection Time : %.4f" % (mean_test_project*len(y_test)), "sec")
print("Divide the Projection Time by Test size : %f" % mean_test_project, "ms")
print("-----" * 8)
print("Test set score: %f" % mean_acc)
print("All Test dataset Prediction Average Time at once : %.4f" % (mean_time*len(y_test)), "sec")
print("Divide the Prediction Time by Test size : %.4f" % mean_time, "ms")

PC - CIFAR-10(80%) - LDA - 10 iteration - LDA(feature 9)
----------------------------------------
Calculating Train dataset U*S*Vt Matrix Time : 29.5844 sec
Calculating Train dataset Projection Time : 0.4026 sec
All Test dataset Projection Time : 84439.6600 sec
Divide the Projection Time by Test size : 8.443966 ms
----------------------------------------
Test set score: 0.285490
All Test dataset Prediction Average Time at once : 508294.7000 sec
Divide the Prediction Time by Test size : 50.8295 ms
