### COMP9517 24T3 Lab 03

Jinghan Wang (z5286124)

In [18]:
import numpy as np
import pandas as pd
import random
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings("ignore")

In [19]:
random.seed(42)

#### Load, Split, Reshape Dataset

In [20]:
def load_and_reshape_image(image_path):
    image = Image.open(f"dataset/data/data/{image_path}").convert('L')
    image = image.resize((64, 64))
    image_array = np.array(image, dtype=np.float32)
    image_flat = image_array.flatten()
    image_flat /= 255
    return image_flat

In [21]:
data = pd.read_csv('dataset/chinese_mnist.csv')

test = data.groupby('value').apply(lambda x: x.sample(n=66, random_state=1)).reset_index(drop=True)

last_data = data.drop(test.index)
train = last_data.groupby('value').apply(lambda x: x.sample(n=333, replace=True, random_state=1)).reset_index(drop=True)

train_image = np.array([load_and_reshape_image(f"input_{row['suite_id']}_{row['sample_id']}_{row['code']}.jpg") for _, row in train.iterrows()])
train_label = train['value'].values

test_image = np.array([load_and_reshape_image(f"input_{row['suite_id']}_{row['sample_id']}_{row['code']}.jpg") for _, row in test.iterrows()])
test_label = test['value'].values

print("Train image data shape:", train_image.shape)
print("Train labels shape:", train_label.shape)
print("Test image data shape:", test_image.shape)
print("Test labels shape:", test_label.shape)

Train image data shape: (4995, 4096)
Train labels shape: (4995,)
Test image data shape: (990, 4096)
Test labels shape: (990,)


#### KNN

In [22]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_image, train_label)
prediction = knn.predict(test_image)

##### Result

In [23]:
print("KNN Classification Report:\n", classification_report(test_label, prediction))

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.53      0.62        66
           1       0.18      0.95      0.30        66
           2       0.22      0.36      0.27        66
           3       0.30      0.39      0.34        66
           4       0.61      0.30      0.40        66
           5       0.50      0.17      0.25        66
           6       0.34      0.32      0.33        66
           7       0.35      0.12      0.18        66
           8       0.88      0.80      0.84        66
           9       1.00      0.02      0.03        66
          10       0.43      0.58      0.49        66
         100       0.81      0.20      0.32        66
        1000       0.56      0.23      0.32        66
       10000       0.41      0.26      0.32        66
   100000000       0.78      0.21      0.33        66

    accuracy                           0.36       990
   macro avg       0.54      0.36      0.36       99

In [24]:
print("KNN Confusion Matrix:\n", confusion_matrix(test_label, prediction))

KNN Confusion Matrix:
 [[35  6  1  0  4  2  6  2  0  0  5  1  1  3  0]
 [ 0 63  2  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0 33 24  9  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 23 15 26  0  0  0  2  0  0  0  0  0  0  0]
 [ 3 23  9  8 20  1  0  0  0  0  0  1  0  0  1]
 [ 0 11 25 13  0 11  0  0  0  0  3  0  0  3  0]
 [ 1 38  2  2  0  0 21  0  1  0  1  0  0  0  0]
 [ 0 28  2  8  0  2  6  8  0  0  7  0  3  1  1]
 [ 0  9  1  0  0  1  1  0 53  0  0  0  1  0  0]
 [ 3 18  4  3  3  2 13  3  0  1  7  0  0  7  2]
 [ 0 18  2  0  0  0  1  0  0  0 38  0  7  0  0]
 [ 1 15  4  7  2  2  2  3  1  0  7 13  0  9  0]
 [ 1 21  7  1  0  0  4  0  0  0 17  0 15  0  0]
 [ 2 30  4  0  1  1  5  0  4  0  2  0  0 17  0]
 [ 1 20  7  9  3  0  3  5  1  0  1  1  0  1 14]]


#### DT

In [25]:
dt = DecisionTreeClassifier(random_state=1)
dt.fit(train_image, train_label)
prediction = dt.predict(test_image)

##### Result

In [26]:
print("Decision Tree Classification Report:\n", classification_report(test_label, prediction))

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.44      0.44        66
           1       0.63      0.79      0.70        66
           2       0.24      0.26      0.25        66
           3       0.20      0.24      0.22        66
           4       0.32      0.29      0.30        66
           5       0.17      0.17      0.17        66
           6       0.21      0.29      0.24        66
           7       0.21      0.24      0.23        66
           8       0.32      0.35      0.33        66
           9       0.20      0.02      0.03        66
          10       0.35      0.39      0.37        66
         100       0.21      0.18      0.19        66
        1000       0.23      0.21      0.22        66
       10000       0.14      0.14      0.14        66
   100000000       0.18      0.18      0.18        66

    accuracy                           0.28       990
   macro avg       0.27      0.28      0.2

In [27]:
print("Decision Confusion Matrix:\n", confusion_matrix(test_label, prediction))

Decision Confusion Matrix:
 [[29  0  2  0  2  8  6  0  0  0  3  6  3  3  4]
 [ 0 52  3  1  0  0  2  1  5  0  1  0  1  0  0]
 [ 0  9 17 12  2  3  2  4  9  0  2  1  1  3  1]
 [ 4  4 10 16  0  5  7  3  1  1  3  3  7  1  1]
 [ 7  2  3  4 19  4  5  1  2  0  1  3  2  5  8]
 [ 6  2  3  5  2 11  4  6  4  0  4  6  1  8  4]
 [ 4  1  2  5  5  4 19  3  4  0  2  4  3  5  5]
 [ 1  0  5  6  3  6  5 16  3  0  4  3  4  3  7]
 [ 1  3  8  1  2  1  5  6 23  2  1  3  3  3  4]
 [ 4  1  3  6  4  2 10  8  2  1  5  4  3  6  7]
 [ 0  6  1  2  3  2  2  2  5  0 26  1 10  5  1]
 [ 2  1  2  2  5 10  4  6  7  0  4 12  0  7  4]
 [ 2  1  3  5  4  3  4  5  2  1 10  4 14  4  4]
 [ 3  0  3  6  3  4  8  8  2  0  7  3  5  9  5]
 [ 3  0  7  9  5  1  7  6  4  0  1  5  4  2 12]]


#### SGD

In [28]:
sgd = SGDClassifier(max_iter=250, random_state=1)
sgd.fit(train_image, train_label)
prediction = sgd.predict(test_image)

##### Result

In [29]:
print("SGD Classification Report:\n", classification_report(test_label, prediction))

SGD Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.68      0.60        66
           1       0.58      0.80      0.67        66
           2       0.48      0.24      0.32        66
           3       0.39      0.30      0.34        66
           4       0.42      0.53      0.47        66
           5       0.31      0.35      0.33        66
           6       0.33      0.39      0.36        66
           7       0.29      0.36      0.32        66
           8       0.44      0.41      0.42        66
           9       1.00      0.02      0.03        66
          10       0.43      0.50      0.46        66
         100       0.23      0.24      0.23        66
        1000       0.30      0.27      0.29        66
       10000       0.32      0.35      0.33        66
   100000000       0.34      0.35      0.35        66

    accuracy                           0.39       990
   macro avg       0.43      0.39      0.37       99

In [30]:
print("SGD Confusion Matrix:\n", confusion_matrix(test_label, prediction))

SGD Confusion Matrix:
 [[45  0  0  1  1  4  3  0  0  0  2  5  3  1  1]
 [ 0 53  0  2  1  0  1  1  2  0  2  0  2  0  2]
 [ 0 16 16  7  1  2  4  4  3  0  7  4  2  0  0]
 [ 1  3  8 20  1  4  2  2  4  0  2  9  5  3  2]
 [ 0  0  0  2 35  5  1  3  2  0  3  3  2  4  6]
 [ 4  1  0  3  9 23  4  2  2  0  1  6  4  2  5]
 [ 0  3  1  3  1  3 26  3  3  0  2  3  8  7  3]
 [ 0  0  4  3  1  5  2 24  3  0  5  2  4  4  9]
 [ 0  9  1  2  4  1  9  6 27  0  1  2  0  3  1]
 [ 6  0  0  3  5  6  7 11  1  1  1  5  6  5  9]
 [ 3  5  0  1  5  1  2  3  2  0 33  1  2  6  2]
 [11  0  1  0  6 11  2  4  4  0  4 16  1  6  0]
 [ 8  1  0  1  2  1  4  7  1  0 13  4 18  4  2]
 [ 4  1  0  1  4  6  4  8  4  0  1  7  1 23  2]
 [ 1  0  2  2  8  2  7  6  4  0  0  4  2  5 23]]


##### Analyse


- **KNN (36%)**
  KNN relies on neighboring samples. With fewer training samples, it struggles to find enough similar neighbors, leading to lower accuracy.
- **Decision Tree (28%)**
  The decision tree easily overfits on small datasets by memorizing specific patterns, which reduces its generalization on the test set.
- **SGD (39%)**
  SGD uses a linear boundary and is more robust with small samples. It performs slightly better, achieving the highest accuracy.

#### Increase the size of Training Set

In [31]:
train = last_data.groupby('value').apply(lambda x: x.sample(n=666, replace=True, random_state=1)).reset_index(drop=True)
train_image = np.array([load_and_reshape_image(f"input_{row['suite_id']}_{row['sample_id']}_{row['code']}.jpg") for _, row in train.iterrows()])
train_label = train['value'].values

In [32]:
knn.fit(train_image, train_label)
prediction = knn.predict(test_image)
print("KNN Classification Report:\n", classification_report(test_label, prediction))

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.59      0.69        66
           1       0.21      0.92      0.34        66
           2       0.25      0.36      0.30        66
           3       0.38      0.47      0.42        66
           4       0.59      0.35      0.44        66
           5       0.50      0.24      0.33        66
           6       0.39      0.36      0.38        66
           7       0.53      0.29      0.37        66
           8       0.94      0.92      0.93        66
           9       1.00      0.02      0.03        66
          10       0.49      0.71      0.58        66
         100       0.79      0.23      0.35        66
        1000       0.56      0.36      0.44        66
       10000       0.46      0.39      0.42        66
   100000000       0.82      0.35      0.49        66

    accuracy                           0.44       990
   macro avg       0.58      0.44      0.43       99

In [33]:
dt.fit(train_image, train_label)
prediction = dt.predict(test_image)
print("Decision Tree Classification Report:\n", classification_report(test_label, prediction))

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.48      0.55        66
           1       0.71      0.79      0.75        66
           2       0.33      0.42      0.37        66
           3       0.25      0.33      0.29        66
           4       0.34      0.33      0.34        66
           5       0.29      0.30      0.30        66
           6       0.32      0.36      0.34        66
           7       0.32      0.32      0.32        66
           8       0.58      0.53      0.56        66
           9       0.50      0.02      0.03        66
          10       0.45      0.44      0.44        66
         100       0.26      0.33      0.29        66
        1000       0.37      0.41      0.39        66
       10000       0.25      0.26      0.25        66
   100000000       0.38      0.39      0.39        66

    accuracy                           0.38       990
   macro avg       0.40      0.38      0.3

In [34]:
sgd.fit(train_image, train_label)
prediction = sgd.predict(test_image)
print("SGD Classification Report:\n", classification_report(test_label, prediction))

SGD Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.76      0.72        66
           1       0.59      0.83      0.69        66
           2       0.53      0.29      0.37        66
           3       0.33      0.44      0.37        66
           4       0.48      0.61      0.54        66
           5       0.42      0.44      0.43        66
           6       0.37      0.36      0.37        66
           7       0.40      0.44      0.42        66
           8       0.58      0.65      0.61        66
           9       1.00      0.02      0.03        66
          10       0.52      0.44      0.48        66
         100       0.36      0.38      0.37        66
        1000       0.36      0.32      0.34        66
       10000       0.37      0.33      0.35        66
   100000000       0.31      0.42      0.36        66

    accuracy                           0.45       990
   macro avg       0.49      0.45      0.43       99

##### Analyse

When the training set size doubles, all models see improvement
- KNN benefits from having more neighbors to compare, which enhances accuracy
- The Decision Tree overfits less, allowing it to generalize better.
- For SGD, the extra data helps it define clearer decision boundaries, leading to a more accurate classification overall.