# Preprocessing:

In [1]:
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDRegressor


In [2]:
root_dir = "./10000/"

X = []
y = []

In [3]:


for label in range(10):
    folder_path = os.path.join(root_dir, str(label))
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".jpeg"):
            img_path = os.path.join(folder_path, filename)

          
            img = Image.open(img_path)
            img = img.convert("L")
            img = img.resize((28, 28))
            img_array = np.array(img)
            img_flat = img_array.flatten()

            X.append(img_flat)
            y.append(label)

X = np.array(X)
y = np.array(y)

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (10000, 784)
y shape: (10000,)


In [4]:
X = X / 255.0

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (8000, 784)
Test size: (2000, 784)


# Base Linear Regression model:

In [16]:

models = []
for digit in range(10):
        y_binary = (y_train == digit).astype(int)
        model = SGDRegressor(
                loss="squared_error",
                learning_rate="constant",
                eta0=0.01,
                max_iter=1000)
        model.fit(X_train, y_binary)
        models.append(model)

scores = np.array([m.predict(X_test) for m in models])
y_pred = np.argmax(scores, axis=0)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred,average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)

Accuracy: 0.1040
Precision: 0.0403
Recall: 0.1040
F1-score: 0.0309

Confusion Matrix:
[[  0   0 159  16   0   3   0  14   3   5]
 [  0   0 185   3   0   0   0  10   1   1]
 [  0   0 194   0   0   0   0   6   0   0]
 [  0   0 164   3   0   2   0  25   5   1]
 [  0   0 179   3   0  12   1   4   1   0]
 [  0   0 180   1   0   1   0   6  11   1]
 [  0   0 151  16   0   8   0  10  11   4]
 [  0   0 196   0   0   1   0   3   0   0]
 [  0   0 171   9   0   3   0  10   7   0]
 [  0   0 185   1   0   4   1   8   1   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# k_fold for Linear Regression model

In [17]:

kf_linear = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []

for train_idx, test_idx in kf_linear.split(X):
    X_train_fold, X_test_fold = X[train_idx], X[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    models = []
    for digit in range(10):
        y_binary = (y_train_fold == digit).astype(int)
        model = SGDRegressor(
                loss="squared_error",
                learning_rate="constant",
                eta0=0.01,
                max_iter=1000)
        model.fit(X_train_fold, y_binary)
        models.append(model)

   
    scores = np.array([m.predict(X_test_fold) for m in models])
    y_pred = np.argmax(scores, axis=0)


    fold_acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold accuracy: {fold_acc:.4f}")
    cv_scores.append(fold_acc)

print("Mean :",np.mean(cv_scores))
print("standard deviation :",np.std(cv_scores))


Fold accuracy: 0.0705
Fold accuracy: 0.1005
Fold accuracy: 0.0910
Fold accuracy: 0.0995
Fold accuracy: 0.0945
Mean : 0.09120000000000002
standard deviation : 0.010906878563548791


# Base LogisticRegression model:

In [27]:
model = LogisticRegression(
        max_iter=500
    )
    
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.7540
Precision: 0.7540
Recall: 0.7540
F1-score: 0.7529

Confusion Matrix:
[[158   9   1   1   4   6   9   2   4   6]
 [  6 165   4   5   0   3   0  11   4   2]
 [  5   8 145  11   5   6   1   8   4   7]
 [  6   5  12 147   6   7   0   6   3   8]
 [  7   5   4   3 145   5   7   5   9  10]
 [  6   2   5   2   2 156  15   4   7   1]
 [  0   8   1   0   2  15 168   0   6   0]
 [  1   9   8   2   4   0   1 150   7  18]
 [  9   5   5   7  16  10   7  12 121   8]
 [  9   2   4   8   9   0   0  13   2 153]]


# k_fold for Logistic Regression model:

In [None]:


kf_logistic = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores_logistic = []

for train_idx, test_idx in kf_logistic.split(X):
    X_train_fold, X_test_fold = X[train_idx], X[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    model = LogisticRegression(
        max_iter=500
    )
    
    model.fit(X_train_fold, y_train_fold)

    y_pred = model.predict(X_test_fold)
    fold_acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold accuracy: {fold_acc:.4f}")
    cv_scores_logistic.append(fold_acc)
    
print("Mean :",np.mean(cv_scores_logistic))
print("standard deviation :",np.std(cv_scores_logistic))


Fold accuracy: 0.7620
Fold accuracy: 0.7525
Fold accuracy: 0.7470
Fold accuracy: 0.7605
Fold accuracy: 0.7540
Mean : 0.7552
standard deviation : 0.005482700064749119


# Base Naive Bayes model:

In [29]:
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.5115
Precision: 0.5433
Recall: 0.5115
F1-score: 0.5144

Confusion Matrix:
[[118  26   3   8  15   7  11   2   2   8]
 [  2 133   2  19   5   5   7  20   4   3]
 [  4  40  70  42   6   1   1  11  16   9]
 [ 11  25  11 122   7   2   0   5   6  11]
 [ 18  28   8   3  86   1  10   7  23  16]
 [ 12  45   9   3   6  89  14   5  12   5]
 [ 14  38   2   2   3  13 117   4   6   1]
 [  6  35   6  12   9   2   2 100  13  15]
 [ 13  31   6  19  12  17  10   7  77   8]
 [ 12  21   6  18  10   1   0  15   6 111]]


# k_fold for Naive Bayes model:

In [None]:

kf_naive = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores_nb = []

for train_idx, test_idx in kf_naive.split(X):
    X_train_fold, X_test_fold = X[train_idx], X[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    model = GaussianNB()
    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_test_fold)


    fold_acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold accuracy: {fold_acc:.4f}")
    cv_scores_nb.append(fold_acc)

print("Mean :",np.mean(cv_scores_nb))
print("standard deviation :",np.std(cv_scores_nb))

Fold accuracy: 0.4995
Fold accuracy: 0.5095
Fold accuracy: 0.5020
Fold accuracy: 0.5270
Fold accuracy: 0.5045
Mean : 0.5085
standard deviation : 0.009823441352194262
