In [3]:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import gzip

In [4]:
data_path = 'D:\\Subjects_In_University\\Machine_Learning\\data\\KNN_Multinomial_Logistic_Regression'
train_images_path = os.path.join(data_path, 'train-images-idx3-ubyte.gz')
train_labels_path = os.path.join(data_path, 'train-labels-idx1-ubyte.gz')
test_images_path = os.path.join(data_path, 't10k-images-idx3-ubyte.gz')
test_labels_path = os.path.join(data_path, 't10k-labels-idx1-ubyte.gz')

def get_mnist_data(images_path, labels_path, num_images, shuffle=False, _is=True, image_size=28):
    f_images = gzip.open(images_path, 'r')
    f_images.read(16)
    real_num = num_images if not shuffle else (60000 if _is else 10000)
    buf_images = f_images.read(image_size * image_size * real_num)
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(real_num, image_size, image_size)
    
    f_labels = gzip.open(labels_path, 'r')
    f_labels.read(8)
    labels = np.zeros((real_num)).astype(np.int64)
    for i in range(0, real_num):
        buf_labels = f_labels.read(1)
        labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)
    
    if shuffle:
        rand_id = np.random.randint(real_num, size=num_images)
        images = images[rand_id, :]
        labels = labels[rand_id]
    
    images = images.reshape(num_images, image_size * image_size)
    return images, labels

In [5]:
train_images, train_labels = get_mnist_data(train_images_path, train_labels_path, 1000, shuffle=True)
test_images, test_labels = get_mnist_data(test_images_path, test_labels_path, 500, _is=False, shuffle=True)
print("Kích thước tập huấn luyện:", train_images.shape, train_labels.shape)
print("Kích thước tập kiểm tra:", test_images.shape, test_labels.shape)

scaler = StandardScaler()
X_train = scaler.fit_transform(train_images)  
X_test = scaler.transform(test_images)     
y_train = train_labels
y_test = test_labels

Kích thước tập huấn luyện: (1000, 784) (1000,)
Kích thước tập kiểm tra: (500, 784) (500,)


  labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)


In [6]:
clf = SVC(kernel='linear', C=1.0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Độ chính xác trên tập kiểm tra: {accuracy:.4f}")

conf_matrix = confusion_matrix(y_test, y_pred)
print("Ma trận nhầm lẫn:\n", conf_matrix)

Độ chính xác trên tập kiểm tra: 0.8780
Ma trận nhầm lẫn:
 [[50  0  0  0  0  0  2  0  0  0]
 [ 0 51  0  1  0  0  0  0  1  0]
 [ 1  2 46  3  2  0  3  0  0  2]
 [ 0  2  1 42  0  2  0  2  0  0]
 [ 0  0  0  0 35  0  0  1  0  0]
 [ 1  0  0  0  0 44  1  1  0  0]
 [ 1  0  0  0  0  1 34  0  1  0]
 [ 0  3  1  0  1  0  0 52  3  4]
 [ 2  1  4  2  1  0  1  1 29  1]
 [ 0  2  0  0  3  0  0  0  0 56]]
