In [1]:
import ssl
# WARNING (security): this replaces the ssl module's default HTTPS context
# factory with the *unverified* one, so every HTTPS request in this kernel
# that relies on the default context skips certificate and hostname checks.
# NOTE(review): presumably a workaround so the dataset download below succeeds
# on a machine with missing/broken CA certificates — confirm, and prefer fixing
# the local certificate store over disabling verification globally.
ssl._create_default_https_context = ssl._create_unverified_context


In [2]:
from sklearn.datasets import fetch_20newsgroups

# None keeps every newsgroup; set to a list of group names to restrict the data.
categories = None

# Fetch both splits with identical settings: headers, footers and quoted
# replies are stripped so only the message body text remains.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
    )
    for split in ('train', 'test')
)


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Bag-of-words counter (English stop words dropped, terms must occur in at
# least two documents) followed by a TF-IDF re-weighting step.
count_vect = CountVectorizer(min_df=2, stop_words='english')
tfidf_transformer = TfidfTransformer()

# Training split: both steps are fitted here and only here.
X_train_counts = count_vect.fit_transform(newsgroups_train.data)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Test split: reuse the fitted vocabulary and IDF weights without refitting.
X_test_counts = count_vect.transform(newsgroups_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)


In [4]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# Fit scikit-learn's multinomial Naive Bayes on the TF-IDF training features
# (fit returns the estimator itself, so construction and training chain).
clf_sklearn = MultinomialNB().fit(X_train_tfidf, newsgroups_train.target)

# Predict the held-out split and show per-class precision / recall / F1.
y_pred_sklearn = clf_sklearn.predict(X_test_tfidf)
print(classification_report(y_true=newsgroups_test.target, y_pred=y_pred_sklearn))


              precision    recall  f1-score   support

           0       0.74      0.20      0.32       319
           1       0.65      0.69      0.67       389
           2       0.66      0.60      0.63       394
           3       0.61      0.74      0.67       392
           4       0.78      0.68      0.72       385
           5       0.81      0.76      0.79       395
           6       0.78      0.78      0.78       390
           7       0.81      0.73      0.77       396
           8       0.85      0.75      0.80       398
           9       0.91      0.80      0.85       397
          10       0.57      0.93      0.71       399
          11       0.64      0.79      0.70       396
          12       0.71      0.53      0.61       393
          13       0.88      0.76      0.81       396
          14       0.76      0.74      0.75       394
          15       0.38      0.92      0.54       398
          16       0.57      0.72      0.64       364
          17       0.82    

In [9]:
import numpy as np
import math

class NaiveBayesFromScratch:
    """Multinomial Naive Bayes text classifier over whitespace-split tokens.

    Documents are plain strings. Tokens are exactly what ``str.split()``
    yields: no lowercasing, punctuation stripping or stop-word removal is
    performed here. All stored probabilities are natural logarithms.

    Attributes:
        classes: Sorted array of the distinct labels seen by ``fit``
            (``None`` until ``fit`` has been called).
        class_probs: ``{label: log P(label)}``.
        word_probs: ``{label: {word: log P(word | label)}}`` with add-one
            (Laplace) smoothing; one entry per vocabulary word per label.
        vocab: Set of every token seen in the training documents.
    """

    def __init__(self):
        self.classes = None
        self.class_probs = {}
        self.word_probs = {}
        self.vocab = set()

    def fit(self, X, y):
        """Estimate log priors and smoothed per-class word log-likelihoods.

        Any state left over from a previous ``fit`` call is discarded, so the
        model always reflects only the most recent training data.

        Args:
            X: Sized sequence of document strings.
            y: Sequence of labels, one per document.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length (got {len(X)} and {len(y)})."
            )

        # Start from a clean slate so refitting never mixes in stale
        # vocabulary or classes from an earlier training set.
        self.class_probs = {}
        self.word_probs = {}
        self.vocab = set()

        self.classes, class_count = np.unique(y, return_counts=True)
        word_count = {c: {} for c in self.classes}
        total_words_in_class = {c: 0 for c in self.classes}

        for doc, label in zip(X, y):
            words = doc.split()
            counts = word_count[label]
            for word in words:
                counts[word] = counts.get(word, 0) + 1
            self.vocab.update(words)
            total_words_in_class[label] += len(words)

        n_docs = len(y)
        vocab_size = len(self.vocab)
        for c, n_c in zip(self.classes, class_count):
            self.class_probs[c] = math.log(n_c / n_docs)
            # Add-one smoothing: +1 per word in the numerator, +|V| in the
            # denominator. The denominator is constant within a class.
            denom = total_words_in_class[c] + vocab_size
            counts = word_count[c]
            self.word_probs[c] = {
                word: math.log((counts.get(word, 0) + 1) / denom)
                for word in self.vocab
            }

    def predict(self, X):
        """Return the most probable label for each document in ``X``.

        Tokens that never appeared in the training data are ignored. When
        several classes tie, the first one in ``self.classes`` order wins.

        Args:
            X: Iterable of document strings.

        Returns:
            A list with one predicted label per document.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if getattr(self, "classes", None) is None:
            raise AttributeError(
                "NaiveBayesFromScratch is not fitted yet; call fit() before predict()."
            )

        predictions = []
        for doc in X:
            # The vocabulary filter does not depend on the class: do it once.
            known_words = [w for w in doc.split() if w in self.vocab]
            class_scores = {}
            for c in self.classes:
                log_probs = self.word_probs[c]
                score = self.class_probs[c]
                for w in known_words:
                    # Every vocabulary word has a smoothed entry per class.
                    score += log_probs[w]
                class_scores[c] = score
            predictions.append(max(class_scores, key=class_scores.get))
        return predictions



In [10]:
# Fit the hand-written classifier directly on the raw training documents
clf_custom = NaiveBayesFromScratch()
clf_custom.fit(X=newsgroups_train.data, y=newsgroups_train.target)

# Label the held-out documents, then report per-class precision / recall / F1
y_pred_custom = clf_custom.predict(newsgroups_test.data)
print(classification_report(y_true=newsgroups_test.target, y_pred=y_pred_custom))


              precision    recall  f1-score   support

           0       0.69      0.07      0.13       319
           1       0.68      0.34      0.46       389
           2       0.52      0.03      0.06       394
           3       0.54      0.52      0.53       392
           4       0.79      0.23      0.35       385
           5       0.48      0.78      0.60       395
           6       0.90      0.45      0.60       390
           7       0.85      0.21      0.33       396
           8       0.89      0.15      0.25       398
           9       0.93      0.27      0.42       397
          10       0.47      0.56      0.51       399
          11       0.26      0.72      0.39       396
          12       0.73      0.18      0.29       393
          13       0.80      0.45      0.57       396
          14       0.72      0.41      0.53       394
          15       0.23      0.92      0.36       398
          16       0.51      0.23      0.32       364
          17       0.21    

In [11]:
from sklearn.metrics import classification_report, accuracy_score

# Largest absolute accuracy gap that is still reported as "approximately the same".
ACCURACY_TOLERANCE = 0.05

# y_pred_sklearn and y_pred_custom were already computed in the cells that
# trained each model; they are reused here instead of re-running inference
# (the pure-Python custom predict loop is slow on the full test split).

# Print classification report and accuracy for Sklearn implementation
print("=== Sklearn Multinomial Naive Bayes ===")
print(classification_report(newsgroups_test.target, y_pred_sklearn))
accuracy_sklearn = accuracy_score(newsgroups_test.target, y_pred_sklearn)
print(f"Sklearn Naive Bayes Accuracy: {accuracy_sklearn}")

# Print classification report and accuracy for custom implementation
print("\n=== Custom Naive Bayes Implementation ===")
print(classification_report(newsgroups_test.target, y_pred_custom))
accuracy_custom = accuracy_score(newsgroups_test.target, y_pred_custom)
print(f"Custom Naive Bayes Accuracy: {accuracy_custom}")

# Compare results
print("\n=== Comparison of Results ===")
print(f"Sklearn Naive Bayes Accuracy: {accuracy_sklearn}")
print(f"Custom Naive Bayes Accuracy: {accuracy_custom}")

# This is a fixed-threshold comparison of two point estimates, not a
# statistical significance test.
if abs(accuracy_sklearn - accuracy_custom) < ACCURACY_TOLERANCE:
    print("Both implementations have approximately the same accuracy.")
else:
    print("There is a significant difference between the two implementations.")


=== Sklearn Multinomial Naive Bayes ===
              precision    recall  f1-score   support

           0       0.74      0.20      0.32       319
           1       0.65      0.69      0.67       389
           2       0.66      0.60      0.63       394
           3       0.61      0.74      0.67       392
           4       0.78      0.68      0.72       385
           5       0.81      0.76      0.79       395
           6       0.78      0.78      0.78       390
           7       0.81      0.73      0.77       396
           8       0.85      0.75      0.80       398
           9       0.91      0.80      0.85       397
          10       0.57      0.93      0.71       399
          11       0.64      0.79      0.70       396
          12       0.71      0.53      0.61       393
          13       0.88      0.76      0.81       396
          14       0.76      0.74      0.75       394
          15       0.38      0.92      0.54       398
          16       0.57      0.72      0.