In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.metrics import accuracy_score

In [2]:
# Load the arrays from the .npz archive.
# np.load on an .npz file returns a lazy archive object that keeps the
# underlying file open, so read everything that is needed inside a `with`
# block and let the context manager close the file afterwards.
# Indexing the archive reads the array fully into memory, so the three arrays
# remain usable after the archive is closed; `data` itself is closed once this
# cell finishes, so read any additional arrays inside this block.
with np.load('data.npz') as data:
    X_train = data['X_train']  # Expected shape: (82875, 784)
    y_train = data['y_train']  # Expected shape: (82875,)
    X_test = data['X_test']  # Expected shape: (14625, 784)

In [3]:
# Hold out 20% of the labelled samples as a validation set; the fixed
# random_state makes the split reproducible.
# NOTE: this rebinds X_train / y_train to the reduced training portion, so
# re-running this cell without re-running the load cell first would split the
# already-split data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=20,
)

In [4]:
# Rescale the features by 255 (presumably 8-bit pixel intensities being mapped
# into [0, 1] -- confirm against the dataset description).
X_train_scaled, X_val_scaled = X_train / 255, X_val / 255

In [5]:
# Binarize the scaled features before feeding them to the Bernoulli model.
# With threshold=0 every strictly positive value becomes 1 and everything else
# becomes 0, so for non-negative inputs the division by 255 above does not
# change which features end up set.
binarizer = Binarizer(threshold=0).fit(X_train_scaled)
X_train_binarized = binarizer.transform(X_train_scaled)
X_val_binarized = binarizer.transform(X_val_scaled)

In [6]:
# Fit a Bernoulli Naive Bayes classifier on the binarized training features
# (fit returns the estimator itself, so construction and fitting can be chained).
berNB = BernoulliNB().fit(X_train_binarized, y_train)

# Accuracy on the data the model was fitted on -- an optimistic figure that is
# only meaningful when compared with the validation accuracy.
y_train_pred = berNB.predict(X_train_binarized)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(f'Training Accuracy: {train_accuracy * 100:.2f}%')


Training Accuracy: 61.97%


In [7]:
# Score the fitted model on the held-out validation split, which was not used
# for fitting.
y_val_pred = berNB.predict(X_val_binarized)
validation_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f'Validation Accuracy: {validation_accuracy * 100:.2f}%')


Validation Accuracy: 61.35%


In [8]:
# Run the test set through the same preprocessing as the training data
# (divide by 255, then the already-fitted binarizer) and predict its labels.
X_test_scaled = X_test / 255
X_test_binarized = binarizer.transform(X_test_scaled)
y_test_pred = berNB.predict(X_test_binarized)

# Sanity check: there should be exactly one prediction per test row.
print(y_test_pred.shape)

(14625,)


In [9]:
# Write the submission file: one "ID,Label" row per test sample, where the ID
# is the 0-based row index of the sample in X_test.
predict_id = np.arange(len(X_test))
submission_predictions = np.column_stack((predict_id, y_test_pred))

# comments='' keeps savetxt from prefixing the header line with "# ".
np.savetxt(
    "naive_bayes_predictions.csv",
    submission_predictions,
    fmt='%d',
    delimiter=",",
    header="ID,Label",
    comments='',
)