# Preprocessing:

In [1]:
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import KFold


In [2]:
# Root folder of the image dataset (relative to the notebook's working directory).
root_dir = "./10000/"

# Feature rows and labels both start out as empty lists.
X, y = [], []

In [3]:


# Load every .jpg/.jpeg under root_dir/<digit>/ as a flattened 28x28 grayscale vector.
# Samples are gathered in fresh local lists so that re-running this cell rebuilds
# X and y from scratch instead of calling .append on the arrays left by a previous run.
pixel_rows = []
digit_labels = []

for label in range(10):
    folder_path = os.path.join(root_dir, str(label))

    # os.listdir returns names in arbitrary order; sorting keeps the sample order
    # (and therefore the seeded train/test split further down) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith((".jpg", ".jpeg")):
            continue
        img_path = os.path.join(folder_path, filename)

        # The context manager closes the image file once its pixels have been copied out.
        with Image.open(img_path) as img:
            img_array = np.array(img.convert("L").resize((28, 28)))

        pixel_rows.append(img_array.flatten())
        digit_labels.append(label)

X = np.array(pixel_rows)
y = np.array(digit_labels)

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (10000, 784)
y shape: (10000,)


In [4]:
# Scale pixel intensities to [0, 1]. The division is applied only while X still has an
# integer dtype, so re-running this cell does not divide already-scaled floats by 255 again.
if np.issubdtype(X.dtype, np.integer):
    X = X / 255.0

# 80/20 hold-out split; stratify=y keeps the label proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (8000, 784)
Test size: (2000, 784)


# Linear regression from scratch :

## Model class:

In [17]:
class LinearRegressionGD:
    """Linear regression fitted by full-batch gradient descent on the mean squared error.

    ``theta[0]`` is the intercept; ``theta[1:]`` holds one weight per feature.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        """Store the step size and the number of gradient-descent iterations."""
        self.lr = learning_rate
        self.epochs = epochs
        self.theta = None  # populated by fit()

    def fit(self, X, y):
        """Learn ``theta`` from a 2-D feature matrix ``X`` and a target vector ``y``.

        Parameters start at zero and are updated ``epochs`` times using the
        gradient computed over all samples at once.
        """
        sample_count, feature_count = X.shape
        # A leading column of ones lets theta[0] act as the intercept.
        design = np.column_stack((np.ones(sample_count), X))
        self.theta = np.zeros(feature_count + 1)

        for _ in range(self.epochs):
            residual = np.dot(design, self.theta) - y
            # Gradient of the mean squared error with respect to theta.
            gradient = (2 / sample_count) * np.dot(design.T, residual)
            self.theta = self.theta - self.lr * gradient

    def predict(self, X):
        """Return the raw linear output ``[1, x] . theta`` for every row of ``X``."""
        design = np.column_stack((np.ones(X.shape[0]), X))
        return np.dot(design, self.theta)


## Model Training and evaluation:

In [18]:
# One-vs-rest: fit one regressor per digit against a 0/1 "is this digit" target.
models = []
for digit in range(10):
    y_binary = (y_train == digit).astype(int)
    model = LinearRegressionGD(learning_rate=0.0001, epochs=1000)
    model.fit(X_train, y_binary)
    models.append(model)

# Row i of `scores` holds model i's outputs; the predicted digit is the row with the highest value.
scores = np.array([m.predict(X_test) for m in models])
y_pred = scores.argmax(axis=0)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)

Accuracy: 0.6265
Precision: 0.6719
Recall: 0.6265
F1-score: 0.6262

Confusion Matrix:
[[153  17   0   2  10   2   5   3   0   8]
 [  1 164   2   5   3   0  14   5   0   6]
 [  6  34  97  10   8   2  11  13   2  17]
 [ 14  13  11 132  13   2   0   2   3  10]
 [ 20  31   1   3 117   0  10   4   5   9]
 [ 11  14   1   3   7 133  21   3   4   3]
 [  4  23   0   1   0  12 157   0   3   0]
 [  1  60   0   1   9   2   8 101   1  17]
 [ 13  37   0  12  15  12  17  15  71   8]
 [ 11  16   1  12   9   0   0  22   1 128]]


## k_fold for Linear Regression model

In [13]:

# 5-fold cross-validation of the one-vs-rest linear regressors over the full dataset.
kf_linear = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in kf_linear.split(X):
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_test_fold, y_test_fold = X[test_idx], y[test_idx]

    # Train one binary regressor per digit on this fold's training part.
    models = []
    for digit in range(10):
        y_binary = (y_train_fold == digit).astype(int)
        model = LinearRegressionGD(learning_rate=0.0001, epochs=1000)
        model.fit(X_train_fold, y_binary)
        models.append(model)

    # Predicted digit = index of the regressor with the highest output per sample.
    scores = np.array([m.predict(X_test_fold) for m in models])
    y_pred = scores.argmax(axis=0)

    fold_acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold accuracy: {fold_acc:.4f}")
    cv_scores.append(fold_acc)

print("Mean :", np.mean(cv_scores))
print("standard deviation :", np.std(cv_scores))


Fold accuracy: 0.5970
Fold accuracy: 0.6050
Fold accuracy: 0.6120
Fold accuracy: 0.5655
Fold accuracy: 0.6115
Mean : 0.5982000000000001
standard deviation : 0.017229625648864228


# Logistic regression from scratch :

## Model Class:

In [19]:
class LogisticRegressionGD:
    """Binary logistic regression fitted by full-batch gradient descent.

    ``theta[0]`` is the intercept; ``theta[1:]`` holds one weight per feature.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        """Store the step size and the number of gradient-descent iterations."""
        self.lr = learning_rate
        self.epochs = epochs
        self.theta = None  # populated by fit()

    def sigmoid(self, z):
        """Logistic function; inputs are clipped to [-500, 500] before exponentiation."""
        clipped = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-clipped))

    def fit(self, X, y):
        """Learn ``theta`` from a 2-D feature matrix ``X`` and 0/1 targets ``y``.

        Parameters start at zero and are updated in place ``epochs`` times.
        """
        sample_count, feature_count = X.shape
        # A leading column of ones lets theta[0] act as the intercept.
        design = np.column_stack((np.ones(sample_count), X))
        self.theta = np.zeros(feature_count + 1)

        for _ in range(self.epochs):
            probabilities = self.sigmoid(np.dot(design, self.theta))
            # Average over samples of (predicted probability - target) * features.
            gradient = (1 / sample_count) * np.dot(design.T, probabilities - y)
            self.theta -= self.lr * gradient

    def predict(self, X):
        """Return the sigmoid output (a value in [0, 1]) for every row of ``X``."""
        design = np.column_stack((np.ones(X.shape[0]), X))
        return self.sigmoid(np.dot(design, self.theta))


## Model Training and evaluation:

In [20]:
# One-vs-rest: fit one logistic classifier per digit against a 0/1 "is this digit" target.
models = []
for digit in range(10):
    y_binary = (y_train == digit).astype(int)
    model = LogisticRegressionGD(learning_rate=0.1, epochs=1000)
    model.fit(X_train, y_binary)
    models.append(model)

# Row i of `scores` holds classifier i's sigmoid outputs; predict the digit with the highest one.
scores = np.array([m.predict(X_test) for m in models])
y_pred = scores.argmax(axis=0)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)

Accuracy: 0.7410
Precision: 0.7427
Recall: 0.7410
F1-score: 0.7398

Confusion Matrix:
[[158  12   0   3   9   4   6   0   3   5]
 [  3 157   5   6   2   3   5  10   7   2]
 [  2   7 146  10   7   1   2   8   6  11]
 [  7   5   8 154  10   2   0   3   6   5]
 [  4  10   3   3 151   2   9   5   7   6]
 [  9   6   1   4   5 144  19   3   6   3]
 [  0  13   0   1   1  11 169   1   4   0]
 [  1  11   8   3   9   2   4 135   5  22]
 [ 11   9   4   7  12  17   9  13 114   4]
 [  5   2   4   8  12   0   0  13   2 154]]


## k_fold for Logistic Regression model

In [21]:

# 5-fold cross-validation of the one-vs-rest logistic classifiers over the full dataset.
kf_linear = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in kf_linear.split(X):
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_test_fold, y_test_fold = X[test_idx], y[test_idx]

    # Train one binary classifier per digit on this fold's training part.
    models = []
    for digit in range(10):
        y_binary = (y_train_fold == digit).astype(int)
        model = LogisticRegressionGD(learning_rate=0.1, epochs=1000)
        model.fit(X_train_fold, y_binary)
        models.append(model)

    # Predicted digit = index of the classifier with the highest output per sample.
    scores = np.array([m.predict(X_test_fold) for m in models])
    y_pred = scores.argmax(axis=0)

    fold_acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold accuracy: {fold_acc:.4f}")
    cv_scores.append(fold_acc)

print("Mean :", np.mean(cv_scores))
print("standard deviation :", np.std(cv_scores))


Fold accuracy: 0.7310
Fold accuracy: 0.7375
Fold accuracy: 0.7240
Fold accuracy: 0.7445
Fold accuracy: 0.7400
Mean : 0.7353999999999999
standard deviation : 0.00717913643831905


# Naive Bayes from scratch :

## Model Class:

In [6]:
import numpy as np

class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled per class as an independent normal distribution.
    After ``fit`` the instance exposes ``classes`` (sorted unique labels) and the
    dicts ``mean``, ``var`` and ``prior``, all keyed by class label.
    """

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        X: array-like of shape (n_samples, n_features).
        y: array-like of shape (n_samples,) holding the class labels.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = np.unique(y)

        self.mean = {}
        self.var = {}
        self.prior = {}

        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)
            # The small constant keeps zero-variance features from dividing by zero.
            self.var[c] = X_c.var(axis=0) + 1e-9
            self.prior[c] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        The log-posterior of each class is computed for all samples at once,
        so Python only loops over the classes, not over the samples.
        """
        X = np.asarray(X)
        log_posteriors = np.empty((X.shape[0], len(self.classes)))

        for j, c in enumerate(self.classes):
            log_prior = np.log(self.prior[c])
            # Sum of per-feature Gaussian log-densities for every sample (row).
            log_likelihood = -0.5 * np.sum(
                np.log(2 * np.pi * self.var[c]) +
                ((X - self.mean[c]) ** 2) / self.var[c],
                axis=1,
            )
            log_posteriors[:, j] = log_prior + log_likelihood

        return self.classes[np.argmax(log_posteriors, axis=1)]


## Model Training and evaluation:

In [7]:
# Fit Gaussian Naive Bayes on the training split and score it on the held-out test split.
model = GaussianNaiveBayes()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.5115
Precision: 0.5433
Recall: 0.5115
F1-score: 0.5144

Confusion Matrix:
[[118  26   3   8  15   7  11   2   2   8]
 [  2 133   2  19   5   5   7  20   4   3]
 [  4  40  70  42   6   1   1  11  16   9]
 [ 11  25  11 122   7   2   0   5   6  11]
 [ 18  28   8   3  86   1  10   7  23  16]
 [ 12  45   9   3   6  89  14   5  12   5]
 [ 14  38   2   2   3  13 117   4   6   1]
 [  6  35   6  12   9   2   2 100  13  15]
 [ 13  31   6  19  12  17  10   7  77   8]
 [ 12  21   6  18  10   1   0  15   6 111]]


## k_fold for Naive Bayes model

In [10]:

# 5-fold cross-validation of Gaussian Naive Bayes over the full dataset.
kf_linear = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in kf_linear.split(X):
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_test_fold, y_test_fold = X[test_idx], y[test_idx]

    # A fresh model per fold, fitted only on that fold's training part.
    model = GaussianNaiveBayes()
    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_test_fold)

    fold_acc = accuracy_score(y_test_fold, y_pred)
    print(f"Fold accuracy: {fold_acc:.4f}")
    cv_scores.append(fold_acc)

print("Mean :", np.mean(cv_scores))
print("standard deviation :", np.std(cv_scores))


Fold accuracy: 0.4995
Fold accuracy: 0.5095
Fold accuracy: 0.5020
Fold accuracy: 0.5270
Fold accuracy: 0.5045
Mean : 0.5085
standard deviation : 0.009823441352194262
