In [2]:
import os
import numpy as np
import warnings
import pandas as pd
import time

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

warnings.filterwarnings('ignore')

Đường dẫn dữ liệu

In [3]:
# set names to the paths because they're too long
data_path = 'D:\\Subjects_In_University\\Machine_Learning\\data\\KNN_Multinomial_Logistic_Regression'
# train path
train_images_path = os.path.join(data_path, 'train-images-idx3-ubyte.gz')
train_labels_path = os.path.join(data_path, 'train-labels-idx1-ubyte.gz')
# test path
test_images_path = os.path.join(data_path, 't10k-images-idx3-ubyte.gz')
test_labels_path = os.path.join(data_path, 't10k-labels-idx1-ubyte.gz')

Hàm giải nén dữ liệu.

In [4]:
def get_mnist_data(images_path, labels_path, num_images, shuffle=False, _is=True, image_size=28):
    """
    This shuffle param is active when .gz is downloaded at:
    - 'http://yann.lecun.com/exdb/mnist/'
    - This function return random num_images in 60000 or 10000
    """
    # read data
    import gzip # to decompress gz (zip) file
    
    # open file training to read training data
    f_images = gzip.open(images_path,'r')
    
    # skip 16 first bytes because these are not data, only header infor
    f_images.read(16)
    
    # general: read num_images data samples if this parameter is set;
    # if not, read all (60000 training or 10000 test)
    
    real_num = num_images if not shuffle else (60000 if _is else 10000)
    
    # read all data to buf_images (28x28xreal_num)
    buf_images = f_images.read(image_size * image_size * real_num)
    
    # images
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(real_num, image_size, image_size,)
    
    # Read labels
    f_labels = gzip.open(labels_path,'r')
    f_labels.read(8)
    
    labels = np.zeros((real_num)).astype(np.int64)
    
    # rearrange to correspond the images and labels
    for i in range(0, real_num):
        buf_labels = f_labels.read(1)
        labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)
    
    # shuffle to get random images data
    if shuffle is True:
        rand_id = np.random.randint(real_num, size=num_images)
    
        images = images[rand_id, :]
        labels = labels[rand_id,]
    
    # change images data to type of vector 28x28 dimentional
    images = images.reshape(num_images, image_size * image_size)
    return images, labels

Lấy ra tập train và tập test

In [5]:
train_images, train_labels = get_mnist_data(
    train_images_path, train_labels_path, 5000, shuffle=True)

test_images, test_labels = get_mnist_data(
    test_images_path, test_labels_path, 10000, _is=False, shuffle=True)

print(train_images.shape, train_labels.shape)
print(test_images.shape, test_labels.shape)

(5000, 784) (5000,)
(10000, 784) (10000,)


Chuyển tập train và tập test thành một DataFrame.

In [6]:
train_images = pd.DataFrame(train_images)
train_labels = pd.DataFrame(train_labels)
test_images = pd.DataFrame(test_images)
test_labels = pd.DataFrame(test_labels)

X = pd.concat([train_images, test_images], axis=0)
y = pd.concat([train_labels, test_labels], axis=0)

df = pd.concat([X, y], axis=1)
df_5000 = df.sample(5000, random_state=18)

In [7]:
X_5000 = df_5000.iloc[:, :-1]
y_5000 = df_5000.iloc[:, -1]

Khởi tạo `PCA` giảm chiều dữ liệu về 100

In [8]:
pca = PCA(n_components=100)

X_pca = pca.fit_transform(X_5000)

Khởi tạo mô hình Logistic Regression

In [9]:
logR = LogisticRegression(multi_class='multinomial', solver='sag')

Áp dụng phương pháp Multinomial Logistic Regression để phân loại. 

Tỷ lệ train:test là 0.7:0.3.

So sánh độ chính xác và thời gian chạy mô hình trong các trường hợp:

**Dữ liệu nguyên bản**

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_5000, y_5000, test_size=0.3, random_state=18)

start_time = time.time()

logR.fit(X_train, y_train)
y_pred = logR.predict(X_test)

end_time = time.time()

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Run time:", end_time - start_time)

Accuracy: 0.8926666666666667
Run time: 4.6331627368927


**Dữ liệu đã PCA về 100 chiều**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_5000, test_size=0.3, random_state=18)

start_time = time.time()

logR.fit(X_train, y_train)
y_pred = logR.predict(X_test)

end_time = time.time()

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Run time:", end_time - start_time)

Accuracy: 0.884
Run time: 0.5869491100311279


**Dữ liệu đã LDA về 8 chiều**

In [12]:
lda = LinearDiscriminantAnalysis(n_components=8)

X_lda = lda.fit_transform(X_5000, y_5000)

X_train, X_test, y_train, y_test = train_test_split(X_lda, y_5000, test_size=0.3, random_state=18)

start_time = time.time()

logR.fit(X_train, y_train)
y_pred = logR.predict(X_test)

end_time = time.time()

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Run time:", end_time - start_time)

Accuracy: 0.932
Run time: 0.06802797317504883


Từ kết quả chạy ta thấy dữ liệu sau khi giảm chiều bằng phương pháp LDA xong chia dữ liệu cho độ chính xác cao hơn so với sử dụng phương pháp PCA và dữ liệu ban đầu. 

=> Nên phương pháp LDA là phù hợp nhất.