### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

import cv2

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

import joblib

### Collection and exploration of data (Part 1)

In [None]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML (cached locally,
# returned as arrays rather than a DataFrame) and print its description.
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
print(mnist.DESCR)

In [None]:
# Feature matrix and labels; the labels are converted to unsigned 8-bit integers.
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

In [None]:
# Inspect the data: both array shapes first, then the first sample and its label.
for value in (X.shape, y.shape, X[0], y[0]):
    print(value)

In [None]:
# Plotting the features of the X data, it looks like a 5.
# The 784 values of the first sample are laid out as a 28x28 grid for display.
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
# Also print the same grid as raw numbers.
print(some_digit_image)

In [None]:
# Count feature attributes

def count_feature_attribute(my_matrix, control):
    """Compact the non-blank rows of a 28x28 image and count blank vs. used rows.

    The input is reshaped to 28x28. Rows whose pixel sum is greater than zero
    are kept (in their original order); every other row counts as dead space
    and is replaced by a row of zeros. The compacted image is returned
    transposed, so calling the function again on the result processes the
    columns of the original image.

    Parameters
    ----------
    my_matrix : array-like with 784 elements
        Image data in any shape that can be reshaped to (28, 28).
    control : bool
        If true, the kept rows come first and the zero rows are appended
        after them; otherwise the zero rows come first.

    Returns
    -------
    tuple
        ``(matrix, count_length_deadspace, count_length_digit)`` where
        ``matrix`` is the transposed 28x28 compacted image and the two counts
        are the number of blank rows and of kept rows (they always add up to
        28).

    Notes
    -----
    An image without any non-blank row yields an all-zero matrix and the
    counts ``(28, 0)`` instead of raising.
    """
    image = np.asarray(my_matrix).reshape(28, 28)

    # A single mask decides both which rows are kept and how they are counted,
    # so the two counts can never disagree with the returned matrix.
    has_ink = image.sum(axis=1) > 0
    used_rows = image[has_ink]

    count_length_digit = int(has_ink.sum())
    count_length_deadspace = 28 - count_length_digit

    # The explicit column count keeps this valid when no row was kept at all.
    padding = np.zeros((count_length_deadspace, 28))

    parts = (used_rows, padding) if control else (padding, used_rows)
    new_empty_matrix = np.concatenate(parts, axis=0).transpose()

    return new_empty_matrix, count_length_deadspace, count_length_digit

In [None]:
heights_dead_space, heights_digits = [], []
width_dead_space, width_digits = [], []

for image in X:
    # First pass: blank / used rows of the image. The returned matrix is
    # transposed, so the second pass counts along the other image axis.
    compacted, dead_rows, digit_rows = count_feature_attribute(image, True)
    heights_dead_space.append(dead_rows)
    heights_digits.append(digit_rows)

    dead_columns, digit_columns = count_feature_attribute(compacted, True)[1:]
    width_dead_space.append(dead_columns)
    width_digits.append(digit_columns)

In [None]:
# Averages over all images of the counts collected in the previous cell.
mean_dead_rows = np.mean(heights_dead_space)
mean_digit_rows = np.mean(heights_digits)
mean_dead_columns = np.mean(width_dead_space)
mean_digit_columns = np.mean(width_digits)

for average in (mean_dead_rows, mean_digit_rows, mean_dead_columns, mean_digit_columns):
    print(average)
print()
print(f"sum of dead_space above and below digit should be approx: {int(100*(mean_dead_rows/28))} % of image height")
print()
print(f"aspect ratio (length/width) digit should be approx: {mean_digit_rows/mean_digit_columns}")

### Random Forest model performance on two different datasets (Part 2)

In [None]:
# Extract the 56 features of every image in X: the 28 row sums followed by
# the 28 column sums of its 28x28 pixel grid.

images = X.reshape(-1, 28, 28)

row_sums = images.sum(axis=2)     # sum over the pixels of each row    -> (n_samples, 28)
column_sums = images.sum(axis=1)  # sum over the pixels of each column -> (n_samples, 28)

# One row per sample: [row_sum_0 .. row_sum_27, column_sum_0 .. column_sum_27]
X_new_concat_final = np.concatenate((row_sums, column_sums), axis=1)

In [None]:
# Dumping data into dictionary: one entry per feature, keyed "x_0" ... "x_55".
# NOTE: the variable name shadows the built-in `dict`; it is kept because the
# cells that follow refer to it by this name.

dict = {
    # i-th feature of the first sample, then of the second sample, and so on
    f"x_{i}": list(X_new_concat_final[:, i])
    for i in range(28 + 28)
}

In [None]:
# Store the target labels next to the feature columns under the key "label".
dict["label"] = y

In [None]:
# Build a DataFrame from the column dictionary and preview its first rows.
df = pd.DataFrame(dict)
 
df.head()

In [None]:
# Split the DataFrame back into a feature array (every column except 'label')
# and a label array.
X_to_model = df.drop(['label'], axis=1).values
y_to_model = df['label'].values

#### Splitting data into train and test

In [None]:
# 80/20 stratified split of the raw 784-pixel data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Same split settings for the 56-feature data. Stratify on the label vector
# that is actually being split, as the validation split further down does.
X_train_56, X_test_56, y_train_56, y_test_56 = train_test_split(
    X_to_model, y_to_model, test_size=0.2, random_state=42, stratify=y_to_model
)
for part in (X_train_56, X_test_56, y_train_56, y_test_56):
    print(part.shape)

#### Instantiating and fitting Random Forest Classifier models

In [None]:
# Model for the raw 784 pixel features
random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
# Identically configured model for the 56 row/column-sum features
random_forest_clf_56 = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Cross-validation with cv=5 on the training split of each feature set.
scores_random_forest_clf = cross_val_score(random_forest_clf, X_train, y_train, cv=5)
scores_random_forest_clf_56 = cross_val_score(random_forest_clf_56, X_train_56, y_train_56, cv=5)

In [None]:
# Mean cross-validation score: 784-feature model first, then the 56-feature model.
print(np.mean(scores_random_forest_clf))
print(np.mean(scores_random_forest_clf_56))

#### Splitting and retraining the models for the sake of the confusion matrices

In [None]:
# Carve a stratified 20 % validation set out of each training split; the test
# splits are not touched here.
X_train_cm, X_val_cm, y_train_cm, y_val_cm = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify = y_train)
X_train_56_cm, X_val_56_cm, y_train_56_cm, y_val_56_cm = train_test_split(X_train_56, y_train_56, test_size=0.2, random_state=42, stratify = y_train_56)

In [None]:
# Fresh models (the earlier names are rebound) fitted on the reduced training
# sets, so the validation sets stay unseen.
random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
random_forest_clf.fit(X_train_cm, y_train_cm)
random_forest_clf_56 = RandomForestClassifier(n_jobs=-1, random_state=42)
random_forest_clf_56.fit(X_train_56_cm, y_train_56_cm)

In [None]:
# Predictions of both models on their validation sets.
random_forest_clf_pred = random_forest_clf.predict(X_val_cm)
random_forest_clf_56_pred = random_forest_clf_56.predict(X_val_56_cm)

In [None]:
# Display labels for the ten digit classes
target_names = [str(digit) for digit in range(10)]

# cm1: 784-feature model, cm2: 56-feature model (each on its validation set)
cm1 = confusion_matrix(y_val_cm, random_forest_clf_pred)
cm2 = confusion_matrix(y_val_56_cm, random_forest_clf_56_pred)

# Both matrices side by side in one figure
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 9), layout='constrained')
left_display = ConfusionMatrixDisplay(cm1, display_labels=target_names)
left_display.plot(ax=axs[0])
right_display = ConfusionMatrixDisplay(cm2, display_labels=target_names)
right_display.plot(ax=axs[1])

In [None]:
# Refit the 784-feature model on the complete training split.
random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
random_forest_clf.fit(X_train, y_train)

In [None]:
# Predict the test split and print the per-class classification report.
random_forest_clf_pred = random_forest_clf.predict(X_test)
print(classification_report(y_test, random_forest_clf_pred, target_names=target_names))

In [None]:
# Save the model trained on the raw pixel data.
# NOTE(review): a later cell dumps another model to the same "model.pkl" path,
# so this file is overwritten when the notebook is run from top to bottom.
joblib.dump(random_forest_clf, "model.pkl")

### Preprocessing of images (Part 3)

#### Preprocessing of the MNIST dataset

In [None]:
# Custom function to dynamically find background of image

def find_background(my_matrix):
    """Return the largest value in the top two and bottom two rows of an image.

    The input is reshaped to 28x28 first. The notebook uses the result as the
    background level of the image.
    """
    image = my_matrix.reshape(28, 28)
    edge_rows = np.concatenate((image[:2], image[-2:]), axis=0)
    return np.max(edge_rows)

In [None]:
# Custom function to move the digit to the top right corner

def remove_dead_space(my_matrix, control):
    """Push the non-blank rows of a 28x28 image to one edge and transpose it.

    The input is reshaped to 28x28. Rows whose pixel sum is greater than zero
    are kept in their original order, all other rows are dropped, and the
    image is padded back to 28 rows with zeros. The result is returned
    transposed, so a second call on the output treats the columns of the
    original image and restores the original orientation.

    Parameters
    ----------
    my_matrix : array-like with 784 elements
        Image data in any shape that can be reshaped to (28, 28).
    control : bool
        If true, the kept rows are placed first and the zero rows after them;
        otherwise the zero rows are placed first.

    Returns
    -------
    numpy.ndarray
        The transposed 28x28 result. An image without any non-blank row
        yields an all-zero matrix instead of raising.
    """
    image = np.asarray(my_matrix).reshape(28, 28)

    used_rows = image[image.sum(axis=1) > 0]

    # The explicit column count keeps this valid when no row was kept at all.
    padding = np.zeros((28 - used_rows.shape[0], 28))

    parts = (used_rows, padding) if control else (padding, used_rows)

    return np.concatenate(parts, axis=0).transpose()
    

In [None]:
# Looping through MNIST data to put digit in top right corner.
# Inner call: digit rows first, zero rows after. Outer call: the same along
# the other axis with the zero rows first.
X_new = [
    remove_dead_space(remove_dead_space(image, True), False)
    for image in X
]

In [None]:
# Normalize MNIST data: stack the images, flatten each one to 784 values and
# divide by 255 (highest datapoint is always 254-255 in MNIST).
flattened_images = np.array(X_new).reshape(-1, 784)
X_new = flattened_images / 255

In [None]:
# Checking the result of the preprocessing on the first sample
some_digit = X_new[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)

In [None]:
# Print the values of the first preprocessed sample.
print(X_new[0])

### Split the preprocessed MNIST dataset, train the model and check generalization error. Finally saving the model locally.

In [None]:
# 80/20 stratified split of the preprocessed images, with the same
# random_state as the earlier splits.
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X_new, y, test_size=0.2, random_state=42, stratify = y)

In [None]:
# Train a new forest on the preprocessed training data (rebinds random_forest_clf).
random_forest_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
random_forest_clf.fit(X_train_pp, y_train_pp)

In [None]:
# Predict the preprocessed test split and print the per-class report.
# target_names is the label list defined in the confusion-matrix cell.
random_forest_clf_pred = random_forest_clf.predict(X_test_pp)
print(classification_report(y_test_pp, random_forest_clf_pred, target_names=target_names))

In [None]:
# Save the model trained on the preprocessed data; the prediction cell at the
# end of the notebook loads this "model.pkl" file.
joblib.dump(random_forest_clf, "model.pkl")

#### Preprocessing of custom images (images taken with mobile phone)

In [None]:
import os

X_test_images = []

directory = "./bilder/egna_bilder/"  # Specify the directory where the files are located

# List the files in the directory. os.listdir returns entries in arbitrary
# order, so sort them to process (and later index) the images reproducibly.
files = sorted(os.listdir(directory))

for filename in files:
    image_path = os.path.join(directory, filename)
    if not os.path.isfile(image_path):
        continue

    # Read the image file using OpenCV and convert it to grayscale
    test_image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if test_image is None:
        # cv2.imread does not raise for files it cannot decode (e.g. non-image
        # files in the folder); it returns None, which cv2.resize cannot handle.
        print(f"Skipping unreadable image file: {filename}")
        continue

    img_resized = cv2.resize(test_image, (28, 28), interpolation=cv2.INTER_LINEAR)
    img_resized = cv2.bitwise_not(img_resized)  # invert image
    # Stored as a (1, 784) array, one per image
    X_test_images.append(img_resized.reshape(-1, 784))
    print(filename)

In [None]:
# Show the image at index 3 of the loaded custom images and print its pixel values.
some_digit = X_test_images[3]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
print(X_test_images[3])

In [None]:
# Custom function to find background dynamically

def find_background(my_matrix):
    """Return the maximum pixel value found in the border rows of an image.

    The input is reshaped to 28x28; only the first two and the last two rows
    are inspected.
    """
    grid = my_matrix.reshape(28, 28)
    top_peak = grid[:2].max()
    bottom_peak = grid[-2:].max()
    return np.maximum(top_peak, bottom_peak)

In [None]:
# Set background to 0 and normalize

my_X_new = []
background_list = []

for raw_image in X_test_images:
    brightest = np.max(raw_image)

    # Scale the flattened image by its brightest pixel and express the
    # background level on the same scale.
    my_X_new.append(raw_image.flatten() / brightest)
    background_list.append(find_background(raw_image) / brightest)

X_test_images = my_X_new

for scaled_image, background_level in zip(X_test_images, background_list):
    # Everything below 105 % of the background level is zeroed in place.
    # works equally well for smartphone and webcam images. May not work using other peoples equipment?
    scaled_image[scaled_image < background_level * 1.05] = 0

print(len(X_test_images))
some_digit = X_test_images[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)

print(X_test_images[0])

In [None]:
# Move the smartphone images to the top right, with the same two passes that
# are applied to the MNIST data.

my_new_X = [
    remove_dead_space(remove_dead_space(image.reshape(-1, 784), True), False)
    for image in X_test_images
]

In [None]:
# Display the first repositioned custom image.
plt.imshow(my_new_X[0], cmap=mpl.cm.binary)

In [None]:
# Custom function for smearing a digit onto the "canvas". To make the image thicker.

def make_thicker(my_matrix):
    """Thicken the strokes of each 28x28 image by smearing it onto its neighbours.

    Each image is first added to a copy of itself shifted one row down, and
    that sum is then added to a copy of itself shifted one column to the
    left. Rows and columns shifted in from outside the image are zeros.

    Parameters
    ----------
    my_matrix : iterable of 28x28 arrays
        The images to thicken.

    Returns
    -------
    list
        One thickened 28x28 array per input image, in input order.
    """
    blank_row = np.zeros((1, 28))
    blank_column = np.zeros((28, 1))

    thick_matrix = []

    for image in my_matrix:
        # Smear downwards: every pixel also receives the pixel above it.
        shifted_down = np.vstack((blank_row, image[:-1]))
        smeared_down = image + shifted_down

        # Smear to the left: every pixel also receives the pixel to its right.
        shifted_left = np.hstack((smeared_down[:, 1:], blank_column))
        thick_matrix.append(smeared_down + shifted_left)

    return thick_matrix

In [None]:
# make digits thicker

my_thick_X = make_thicker(my_new_X)

# Flatten every thickened image and rescale it by its own maximum.
my_X_new = [thick_image.flatten() / np.max(thick_image) for thick_image in my_thick_X]

my_thick_X = my_X_new

plt.imshow(my_thick_X[3].reshape(28, 28), cmap=mpl.cm.binary)

#### Predicting Smartphone images

In [None]:
# Load the saved model and classify every preprocessed custom image,
# showing each image below its prediction.
my_clf = joblib.load("model.pkl")
print()

for digit in my_thick_X:
    sample = digit.reshape(-1, 784)  # a single row of 784 features
    predicted_label = my_clf.predict(sample)[0]
    print(f"Model predicted: {predicted_label}")

    some_digit_image = sample.reshape(28, 28)
    plt.imshow(some_digit_image, cmap=mpl.cm.binary)
    plt.show()