In [None]:
import os
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# set names to the paths because they're too long
data_path = 'D:\\Subjects_In_University\\Machine_Learning\\data\\KNN_Multinomial_Logistic_Regression'
# train path
train_images_path = os.path.join(data_path, 'train-images-idx3-ubyte.gz')
train_labels_path = os.path.join(data_path, 'train-labels-idx1-ubyte.gz')
# test path
test_images_path = os.path.join(data_path, 't10k-images-idx3-ubyte.gz')
test_labels_path = os.path.join(data_path, 't10k-labels-idx1-ubyte.gz')

In [45]:
def get_mnist_data(images_path, labels_path, num_images, shuffle=False, _is=True, image_size=28):
    """
    This shuffle param is active when .gz is downloaded at:
    - 'http://yann.lecun.com/exdb/mnist/'
    - This function return random num_images in 60000 or 10000
    """
    # read data
    import gzip # to decompress gz (zip) file
    
    # open file training to read training data
    f_images = gzip.open(images_path,'r')
    
    # skip 16 first bytes because these are not data, only header infor
    f_images.read(16)
    
    # general: read num_images data samples if this parameter is set;
    # if not, read all (60000 training or 10000 test)
    
    real_num = num_images if not shuffle else (60000 if _is else 10000)
    
    # read all data to buf_images (28x28xreal_num)
    buf_images = f_images.read(image_size * image_size * real_num)
    
    # images
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(real_num, image_size, image_size,)
    
    # Read labels
    f_labels = gzip.open(labels_path,'r')
    f_labels.read(8)
    
    labels = np.zeros((real_num)).astype(np.int64)
    
    # rearrange to correspond the images and labels
    for i in range(0, real_num):
        buf_labels = f_labels.read(1)
        labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)
    
    # shuffle to get random images data
    if shuffle is True:
        rand_id = np.random.randint(real_num, size=num_images)
    
        images = images[rand_id, :]
        labels = labels[rand_id,]
    
    # change images data to type of vector 28x28 dimentional
    images = images.reshape(num_images, image_size * image_size)
    return images, labels

In [46]:
train_images, train_labels = get_mnist_data(
    train_images_path, train_labels_path, 5000, shuffle=True)

test_images, test_labels = get_mnist_data(
    test_images_path, test_labels_path, 10000, _is=False, shuffle=True)

print(train_images.shape, train_labels.shape)
print(test_images.shape, test_labels.shape)

(5000, 784) (5000,)
(10000, 784) (10000,)


In [47]:
import pandas as pd
import time
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [48]:

train_images = pd.DataFrame(train_images)
train_labels = pd.DataFrame(train_labels)
test_images = pd.DataFrame(test_images)
test_labels = pd.DataFrame(test_labels)

In [49]:
X = pd.concat([train_images, test_images], axis=0)
X.shape

(15000, 784)

In [50]:
y = pd.concat([train_labels, test_labels], axis=0)
y.shape

(15000, 1)

In [51]:
df = pd.concat([X, y], axis=1)
df.shape

(15000, 785)

In [52]:
df_5000 = df.sample(5000, random_state=18)
df_5000.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,0.1
4544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
1219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
5157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
9031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


In [53]:
X_1 = df_5000.iloc[:, :-1]
y_1 = df_5000.iloc[:, -1]

In [54]:
print(X_1.shape, y_1.shape)

(5000, 784) (5000,)


### Multinomial Logistic Regression with original dataset

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.3, random_state=18)

logR = LogisticRegression(multi_class='multinomial', solver='sag')

start_time = time.time()

logR.fit(X_train, y_train)

y_pred = logR.predict(X_test)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time: {elapsed_time:.4f} s")
print('Accuracy:', accuracy_score(y_test, y_pred))

Time: 4.8107 s
Accuracy: 0.8966666666666666


### Multinomial Logistic Regression with PCA dataset

**Reduce dimension of data then divide to train and test**

In [59]:
pca = PCA(n_components=100)

X_pca_divide = pca.fit_transform(X_1)

X_pca_divide_train, X_pca_divide_test, y_pca_divide_train, y_pca_divide_test = train_test_split(X_pca_divide, y_1, test_size=0.3, random_state=18)

start_time = time.time()

logR.fit(X_pca_divide_train, y_pca_divide_train)

y_pca_divide_pred = logR.predict(X_pca_divide_test)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time: {elapsed_time:.4f} s")
print('Accuracy:', accuracy_score(y_pca_divide_test, y_pca_divide_pred))

Time: 0.6308 s
Accuracy: 0.8793333333333333


**Divide to train and test then reduce dimension of data**

In [60]:
X_divide_pca_train = pca.fit_transform(X_train)
X_divide_pca_test = pca.fit_transform(X_test)

start_time = time.time()
logR.fit(X_divide_pca_train, y_train)

y_divide_pca_pred = logR.predict(X_divide_pca_test)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time: {elapsed_time:.4f} s")
print('Accuracy:', accuracy_score(y_test, y_divide_pca_pred))

Time: 0.6273 s
Accuracy: 0.31


**Nhận xét:**

* Chạy với dữ liệu nguyên bản thì độ chính xác là cao nhất nhưng sẽ tốn nhiều thời gian hơn
* Chạy với dữ liệu giảm chiều sau đó chia train test thì độ chính xác giảm đi một chút nhưng thời gian chạy giảm đi rất nhiều so với dữ liệu nguyên bản
* Chạy với dữ liệu chia train test sau đó giảm chiều thì độ chính xác rất thấp, thời gian chạy thì cũng khá nhanh

**=> Chọn phương pháp: Giảm chiều dữ liệu sau đó chia train test**