In [1]:
# Load the dataset
# Instead of storing the dataset in a local folder, it is fetched directly through sklearn

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fetch the 20 Newsgroups corpus; subset='all' requests every available document
newsgroups_data = fetch_20newsgroups(subset='all')
X, y = newsgroups_data.data, newsgroups_data.target

# Hold out 20% of the documents for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training documents only,
# then reused unchanged to encode the test documents
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Re-weight the raw counts as tf-idf; the transformer is fitted on the training
# counts only and then applied to the test counts
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)


In [2]:
# Baseline: scikit-learn's built-in Multinomial Naive Bayes, trained on the tf-idf features
clf_sklearn = MultinomialNB().fit(X_train_tfidf, y_train)

# Classify the held-out documents and report the share that was labelled correctly
y_pred_sklearn = clf_sklearn.predict(X_test_tfidf)
accuracy_sklearn = accuracy_score(y_true=y_test, y_pred=y_pred_sklearn)
print(f"Sklearn Multinomial Naive Bayes Accuracy: {accuracy_sklearn * 100:.2f}%")

Sklearn Multinomial Naive Bayes Accuracy: 84.75%


In [3]:
import numpy as np

# Custom implementation of Naive Bayes Classifier
class NaiveBayesFromScratch:
    """Multinomial Naive Bayes classifier with Laplace (add-one) smoothing.

    The model accepts a document-term matrix given either as a SciPy sparse
    matrix or as a dense NumPy array.

    Attributes (all ``None`` until ``fit`` is called):
    - classes: sorted unique class labels seen during training.
    - class_probabilities: prior P(class) for each entry of ``classes``.
    - feature_probabilities: smoothed likelihood P(feature|class),
      shape (n_classes, n_features).
    """

    def __init__(self):
        # Placeholder for prior probabilities for each class (P(class))
        self.class_probabilities = None
        # Placeholder for likelihood probabilities for each feature given a class (P(feature|class))
        self.feature_probabilities = None
        # Placeholder for unique class labels
        self.classes = None

    def fit(self, X, y):
        """
        Fit the Naive Bayes model using training data.
        Parameters:
        - X: training data (document-term matrix, sparse or dense),
             shape (n_samples, n_features)
        - y: training labels (class label for each document), length n_samples
        """
        # Coerce labels to an array so that `y == cls` is always an element-wise mask
        y = np.asarray(y)

        # Get the number of samples (documents) and the number of features (words)
        n_samples, n_features = X.shape

        # Get the unique class labels (e.g., 20 different newsgroups)
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # Prior probabilities: P(class) for each class
        self.class_probabilities = np.zeros(n_classes)

        # Likelihood probabilities: P(feature|class), shape (n_classes, n_features)
        self.feature_probabilities = np.zeros((n_classes, n_features))

        for idx, cls in enumerate(self.classes):
            # All samples (documents) that belong to the current class
            X_class = X[y == cls]

            # Prior P(class): proportion of training documents in this class
            self.class_probabilities[idx] = X_class.shape[0] / float(n_samples)

            # A sparse matrix returns a (1, n_features) np.matrix from sum(axis=0),
            # a dense array returns a 1-D array; flatten so both behave the same.
            feature_counts = np.asarray(X_class.sum(axis=0)).ravel()

            # Likelihood P(feature|class) using Laplace smoothing:
            # (count of word in class + 1) / (total words in class + number of features)
            self.feature_probabilities[idx, :] = (feature_counts + 1) / (X_class.sum() + n_features)

    def predict(self, X):
        """
        Predict the class labels for a given test dataset.
        Parameters:
        - X: test data (document-term matrix, sparse or dense),
             shape (n_samples, n_features)
        Returns:
        - y_pred: array with the predicted class label of each test document
        Raises:
        - RuntimeError: if the model has not been fitted yet
        """
        if self.classes is None:
            raise RuntimeError("NaiveBayesFromScratch must be fitted before calling predict().")

        # Take the logarithms once per call instead of once per (document, class) pair
        log_priors = np.log(self.class_probabilities)
        log_likelihoods = np.log(self.feature_probabilities)

        # scores[i, c] = log P(c) + sum_j X[i, j] * log P(feature j | c),
        # i.e. the unnormalised log-posterior of class c for document i.
        # The matrix product works for sparse and dense X alike and never
        # densifies a sparse X; np.asarray unwraps a possible np.matrix result.
        scores = np.asarray(X @ log_likelihoods.T) + log_priors

        # Choose the class with the highest posterior (first one wins on ties)
        return self.classes[np.argmax(scores, axis=1)]


# Fit the from-scratch classifier on the raw word counts (not the tf-idf features)
nb_scratch = NaiveBayesFromScratch()
nb_scratch.fit(X_train_counts, y_train)

# Classify the held-out documents and report the share that was labelled correctly
y_pred_scratch = nb_scratch.predict(X_test_counts)
accuracy_scratch = accuracy_score(y_true=y_test, y_pred=y_pred_scratch)
print(f"Custom Naive Bayes Accuracy: {accuracy_scratch * 100:.2f}%")


Custom Naive Bayes Accuracy: 85.12%


In [4]:
# Side-by-side summary of the two accuracies computed in the earlier cells.
# NOTE: the sklearn model was fitted on tf-idf features while the custom model
# was fitted on raw counts, so the two numbers are not a like-for-like comparison.
for summary_line in (
    f"Sklearn Naive Bayes Accuracy: {accuracy_sklearn * 100:.2f}%",
    f"Custom Naive Bayes Accuracy: {accuracy_scratch * 100:.2f}%",
):
    print(summary_line)

Sklearn Naive Bayes Accuracy: 84.75%
Custom Naive Bayes Accuracy: 85.12%
