In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np
from keras.datasets import mnist
from keras.layers import Dense, Flatten
from keras.layers import Conv2D
from keras.models import Sequential
from keras.utils import to_categorical

In [None]:
# Note:
# Classical machine learning models such as SVC and Random Forest take their
# input as a 2D array of shape (samples, flattened features),
# which is why we load the dataset in a different format here than for the
# neural network that you will see later on in the code.

# They are the same dataset just different formats that are applicable to the model

In [None]:
# Read the train and test splits of MNIST from their CSV files
# Download at: https://www.kaggle.com/datasets/oddrationale/mnist-in-csv
train = pd.read_csv("/content/mnist_train.csv")
test = pd.read_csv("/content/mnist_test.csv")

# Separate the 'label' column (target) from all remaining columns (features)
y_train, X_train = train['label'], train.drop(columns='label')
y_test, X_test = test['label'], test.drop(columns='label')

In [None]:
# Print the shape of X and y for both splits.
# The feature matrices (X_*) and label vectors (y_*) are reported directly;
# the raw `train` / `test` frames still contain the 'label' column, so their
# shapes describe neither X nor y.
print("Shape of X train:", X_train.shape)
print("Shape of y train:", y_train.shape)
print("Shape of X test:", X_test.shape)
print("Shape of y test:", y_test.shape)

Shape of X: (60000, 785)
Shape of y: (10000, 785)


In [None]:
# Count the missing entries in every column, largest counts first
(
    train
    .isna()
    .sum()
    .sort_values(ascending=False)
)

# Nothing to clean up: the columns are plain pixel values and every count comes back as 0

label    0
19x12    0
19x14    0
19x15    0
19x16    0
        ..
10x12    0
10x13    0
10x14    0
10x15    0
28x28    0
Length: 785, dtype: int64

In [None]:
# Preprocess Data
# Pixel values are 0-255; dividing by 255 rescales them to the 0-1 range.
# The arrays are rebuilt from the `train` / `test` frames instead of from
# X_train / X_test themselves, so re-running this cell can never divide an
# already-scaled array by 255 a second time.
X_train = np.asarray(train.drop('label', axis=1)) / 255
X_test = np.asarray(test.drop('label', axis=1)) / 255

In [None]:
# Random forest: build and fit on the training split in one step
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Score the fitted forest on the held-out test split
random_forest_predictions = random_forest.predict(X_test)
random_forest_accuracy = accuracy_score(y_true=y_test, y_pred=random_forest_predictions)
random_forest_f1 = f1_score(
    y_true=y_test,
    y_pred=random_forest_predictions,
    average='macro',
)

In [None]:
# Support vector classifier: build and fit on the training split in one step
# NOTE: This model will take around 20 minutes to train
svc = SVC(random_state=0, probability=True).fit(X_train, y_train)

# Score the fitted classifier on the held-out test split
svc_predictions = svc.predict(X_test)
svc_accuracy = accuracy_score(y_true=y_test, y_pred=svc_predictions)
svc_f1 = f1_score(
    y_true=y_test,
    y_pred=svc_predictions,
    average='macro',
)

In [None]:
# Summarise both classical models side by side: one row per model,
# with its test accuracy and macro-averaged F1 score
results_df = pd.DataFrame(
    [
        ('Random Forest', random_forest_accuracy, random_forest_f1),
        ('SVC', svc_accuracy, svc_f1),
    ],
    columns=['Model', 'Accuracy', 'F1 Score'],
)

print(results_df)

           Model  Accuracy  F1 Score
0  Random Forest    0.9705  0.970235
1            SVC    0.9792  0.979130


In [None]:
# Persist both fitted models with joblib so the backend code can load them for predictions
filename_rf = "joblib_rf.sav"
filename_svc = "joblib_svc.sav"

joblib.dump(random_forest, filename_rf)
joblib.dump(svc, filename_svc)

['joblib_svc.sav']

In [None]:
# Load the data as 3D numpy arrays (samples, height, width) suitable for neural networks
# NOTE: this rebinds X_train / y_train / X_test / y_test, which the earlier
# cells used for the flattened CSV version of the data.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

print("Shape of data before reshape: \n")
print("Shape of X train:", X_train.shape)
print("Shape of y train:", y_train.shape)
print("Shape of X test:", X_test.shape)
print("Shape of y test:", y_test.shape)

# Preprocess data to work with input

# Add a trailing channel axis of size 1 so each image is (28, 28, 1), the
# input shape the first convolution layer declares. No colour conversion is
# involved; only the array shape changes. Passing -1 lets numpy infer the
# number of samples instead of hard-coding 60000 / 10000.
X_train = np.asarray(X_train).reshape(-1, 28, 28, 1)
X_test = np.asarray(X_test).reshape(-1, 28, 28, 1)

# Additional preprocessing
# Pixel values are 0-255; dividing by 255 rescales them to the 0-1 range.
X_train = X_train / 255
X_test = X_test / 255

print("\nShape of data after reshape: \n")
print("Shape of X train:", X_train.shape)
print("Shape of X test:", X_test.shape)

# One-hot encode the digit labels. num_classes is pinned to 10 so both splits
# always get 10 columns (matching the 10-unit softmax output layer) rather
# than a width inferred from the largest label present in each split.
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

Shape of data before reshape: 

Shape of X train: (60000, 28, 28)
Shape of y train: (60000,)
Shape of X test: (10000, 28, 28)
Shape of y test: (10000,)

Shape of data after reshape: 

Shape of X train: (60000, 28, 28, 1)
Shape of X test: (10000, 28, 28, 1)


In [None]:
# NOTE: This model will take around 15 minutes to train
# Start from an empty sequential model
model = Sequential()

# Named layers, in the order they are stacked:
# four 3x3 ReLU convolutions, a flatten step, a hidden dense layer
# and a 10-unit softmax output
layer_1 = Conv2D(32, kernel_size=3, activation="relu", input_shape=(28, 28, 1), name='conv0')
layer_2 = Conv2D(32, kernel_size=3, activation="relu", name='conv1')
layer_3 = Conv2D(64, kernel_size=3, activation="relu", name='conv2')
layer_4 = Conv2D(64, kernel_size=3, activation="relu", name='conv3')
layer_5 = Flatten(name='flatten_1')
layer_6 = Dense(254, activation="relu", name='fc1')
layer_7 = Dense(10, activation="softmax", name='fco')

# Stack the layers onto the model in declaration order
for layer in (layer_1, layer_2, layer_3, layer_4, layer_5, layer_6, layer_7):
    model.add(layer)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 3 epochs; the test split is passed in as the validation data
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_test, y_test),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Create a DataFrame to store the per-epoch metrics recorded during training
# and validation (pandas is already imported as `pd` in the notebook's first
# cell, so it is not re-imported here)
accuracy_df = pd.DataFrame(history.history)
# Analyze Results
print(accuracy_df)

# Save the model
# NOTE(review): the recorded output ends with a `saving_api.save_model(`
# warning fragment - presumably the legacy-HDF5 format notice; confirm before
# changing the format, since anything that loads 'digit_classifier.h5'
# depends on this filename.
model.save('digit_classifier.h5')

       loss  accuracy  val_loss  val_accuracy
0  0.096192  0.970117  0.045796        0.9857
1  0.035676  0.988967  0.035143        0.9892
2  0.022960  0.993050  0.035878        0.9888


  saving_api.save_model(
