In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
class NaiveBayesTextClassifier:
    """Multinomial Naive Bayes classifier over whitespace-separated tokens.

    Texts are tokenised with ``str.split()`` only (no lower-casing or
    punctuation handling). Word likelihoods use add-one (Laplace) smoothing
    over the vocabulary shared by all classes.

    Attributes:
        class_probs: maps each class label to its prior, i.e. the fraction of
            training documents carrying that label.
        word_probs: maps each class label to a ``{word: P(word | class)}``
            dict that has one entry for every word in ``vocabulary``.
        unseen_word_probs: maps each class label to the smoothed probability
            used for a word that is absent from ``vocabulary``.
        vocabulary: set of every distinct token seen during ``fit``.
        classes: distinct labels in order of first appearance (None before
            ``fit`` is called).
    """

    def __init__(self):
        self.class_probs = {}
        self.word_probs = {}
        self.unseen_word_probs = {}
        self.vocabulary = set()
        self.classes = None

    def fit(self, X, y):
        """Estimate class priors and smoothed word likelihoods.

        Args:
            X: iterable of text strings (pandas Series, list, ...).
            y: iterable of class labels, paired with ``X`` by position.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        texts = pd.Series(list(X), dtype=object)
        labels = pd.Series(list(y))
        if len(texts) != len(labels):
            raise ValueError(
                f"X and y must have the same length (got {len(texts)} and {len(labels)})"
            )

        self.classes = labels.unique()

        # Class priors: share of training documents per label.
        total_count = len(labels)
        self.class_probs = {
            cls: count / total_count for cls, count in labels.value_counts().items()
        }

        # Tokenise every class before computing any likelihood: the smoothing
        # denominator needs the size of the *complete* vocabulary, otherwise
        # classes processed early are normalised against a partial vocabulary
        # and miss entries for words that only occur in later classes.
        tokens_by_class = {
            cls: ' '.join(texts[(labels == cls).to_numpy()]).split()
            for cls in self.classes
        }
        self.vocabulary = set()
        for tokens in tokens_by_class.values():
            self.vocabulary.update(tokens)
        vocab_size = len(self.vocabulary)

        self.word_probs = {}
        self.unseen_word_probs = {}
        for cls, tokens in tokens_by_class.items():
            counts = pd.Series(tokens, dtype=object).value_counts().to_dict()
            smoothing_total = len(tokens) + vocab_size
            # Add-one smoothing: (count + 1) / (class tokens + |V|).
            self.word_probs[cls] = {
                word: (counts.get(word, 0) + 1) / smoothing_total
                for word in self.vocabulary
            }
            # A word outside the vocabulary has count 0 in every class. The
            # guard only matters when the training texts contain no tokens.
            self.unseen_word_probs[cls] = 1 / smoothing_total if smoothing_total else 1.0

    def predict(self, X):
        """Return the most probable class label for each text in ``X``.

        Args:
            X: iterable of text strings.

        Returns:
            A list with one predicted label per input text. Ties go to the
            class that appears first in ``classes``.
        """
        predictions = []
        for text in X:
            words = text.split()
            log_scores = {}
            for cls in self.classes:
                known = self.word_probs[cls]
                unseen = self.unseen_word_probs[cls]
                # Sum log-probabilities instead of multiplying probabilities:
                # the raw product underflows to 0.0 for long texts, making
                # every class tie and the prediction arbitrary.
                score = np.log(self.class_probs[cls])
                for word in words:
                    score += np.log(known.get(word, unseen))
                log_scores[cls] = score
            predictions.append(max(log_scores, key=log_scores.get))
        return predictions

# Load the labelled data; the 'Text' column is used as the classifier input
# and the 'Tag' column as the target labels.
df = pd.read_csv('Lab7_2.csv')
y_true = df['Tag']

# Fit the classifier on every row of the file.
nb_classifier = NaiveBayesTextClassifier()
nb_classifier.fit(df['Text'], y_true)

# No train/test split here: predictions are made on the same rows used for
# fitting, so y_pred reflects training-set behaviour only.
y_pred = nb_classifier.predict(df['Text'])

# Classify one new sentence and report its predicted tag.
test_sentence = ["A very close game"]
predicted_tag = nb_classifier.predict(test_sentence)
print(f'The sentence "A very close game" is classified as: {predicted_tag[0]}')


The sentence "A very close game" is classified as: 'Sports'
