In [None]:
# Dataset basis
from datasets import load_dataset
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Tools
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer # Tokenizes sentences

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

First, we fetch the datasets that the models will be trained and evaluated on.

In [None]:
# 1) Load MNIST
# NOTE(review): this step only prints a banner -- no MNIST data is fetched here.
# TODO: either add the MNIST load or drop this step.
print("Loading MNIST...")

# 2) Load EMNIST
# fetch_openml lives in sklearn.datasets (imported in the top imports cell).
# as_frame=False asks for NumPy arrays rather than pandas objects.
print("Loading EMNIST...")
emnist = fetch_openml("EMNIST_Balanced", version=1, as_frame=False)
# Backward-compatible alias: later cells read the EMNIST bunch through `mnist`.
mnist = emnist

# 3) Load IMDB_Sentiment
# The shuffle is seeded so the row order is reproducible across runs.
print("Loading IMDB_Sentiment...")
dataset = load_dataset("Kwaai/IMDB_Sentiment", split="train").shuffle(seed=42)

Prepare The MNIST Dataset (TODO: no MNIST preparation code has been added yet)

Prepare The EMNIST Dataset

In [None]:
# 1) Normalize EMNIST Balanced
# NOTE: despite its name, `mnist` is the bunch returned by
# fetch_openml("EMNIST_Balanced", ...) in the loading cell.
X, y = mnist.data, mnist.target.astype(int)

# Normalize pixel values to [0, 1]
# assumes raw pixel intensities are in 0..255 -- TODO confirm
X = X / 255.0

# 2) Split into train and test
# Split BEFORE fitting any transform so held-out rows never influence preprocessing.
# NOTE(review): the IMDB preparation cell reassigns X, y, X_train, X_test, y_train
# and y_test, so under "Run All" the model cells see the IMDB split, not this one.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# 3) Optional: Apply PCA to reduce dimensionality
# PCA is fitted on the training split only and then applied to the test split;
# `X` itself is left as the normalized, un-projected matrix.
# NOTE(review): PCA outputs can be negative, which MultinomialNB rejects --
# confirm before enabling this together with the Naive Bayes cell.
USE_PCA = False
if USE_PCA:
    pca = PCA(n_components=100, random_state=123)  # Try 50–150
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

Prepare The IMDB_Sentiment Dataset

In [None]:
# Show a sample text
print(dataset[0]["text"])

# Select a smaller subset for faster training
# (capped at the dataset length so the selection can never index past the end)
dataset = dataset.select(range(min(1000, len(dataset))))

# Extract text and labels
# list(...) materializes the column as a plain Python list for scikit-learn.
texts = list(dataset["text"])
labels = dataset["label"]
y = np.array(labels)

# Train/test split
# Split the raw texts BEFORE vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from training rows only (no leakage from the test rows).
# Same test_size and random_state as before.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# TF-IDF Vectorization: tokenizes each text and builds a sparse TF-IDF matrix,
# keeping at most 5000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Full feature matrix, kept for backward compatibility with code that reads `X`.
X = vectorizer.transform(texts)

We will use three main models across these three datasets: K-Nearest Neighbors, Naive Bayes, and Logistic Regression.

In [None]:
#================# KNN #=================#
# Fit a 5-nearest-neighbours classifier on whichever X_train / y_train split is
# currently in scope, then score it on the matching X_test / y_test split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
# Four decimals, matching the Naive Bayes and Logistic Regression reports.
print(f"K Nearest Neighbors' Accuracy: {accuracy_knn:.4f}")
#========================================#

In [None]:
#============# Naive Bayes #=============#
# Train a multinomial Naive Bayes model on the split currently in scope and
# report its accuracy on the held-out portion.
nb_classifier = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator
y_pred_nb = nb_classifier.predict(X_test)
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print(f"Naive Bayes' Accuracy: {accuracy_nb:.4f}")
#========================================#

In [None]:
#=========# Logistic Regression #========#
# Train a logistic-regression classifier on the split currently in scope and
# report its accuracy on the held-out portion.
# max_iter=1000 sets the solver's iteration cap.
logreg_classifier = LogisticRegression(max_iter=1000)
y_pred_logreg = logreg_classifier.fit(X_train, y_train).predict(X_test)
accuracy_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)
print(f"Logistic Regression's Accuracy: {accuracy_logreg:.4f}")
#========================================#