In [6]:
import collections
import numpy as np

In [7]:
# Read the data files
def read_lines(path, encoding='ISO-8859-1'):
    """Return every line of the text file at `path` as a list of strings.

    Each line has leading/trailing whitespace (including the newline)
    removed with ``str.strip()``; blank lines are kept as empty strings.
    """
    with open(path, 'r', encoding=encoding) as file:
        return [line.strip() for line in file]


# One list entry per line of the corresponding file.
# NOTE(review): presumably line i of a text file pairs with line i of the
# matching label file -- confirm against the data.
text_test = read_lines('text_test.txt')
text_train = read_lines('text_train.txt')

# The label files are named `label_*.txt` (singular), unlike the variables.
labels_train = read_lines('label_train.txt')
labels_test = read_lines('label_test.txt')


In [8]:
#text_test
#text_train
#labels_test
#labels_train

In [9]:
def train_naive_bayes(text_train, labels_train):
    """Fit a word-count Naive Bayes model with add-one smoothing.

    Documents are tokenized with ``str.split()`` (whitespace separated, no
    case folding or other normalization).

    Args:
        text_train: Sequence of document strings.
        labels_train: Sequence of hashable class labels; ``labels_train[i]``
            is the label of ``text_train[i]``.

    Returns:
        A tuple ``(vocabulary, class_probabilities, word_probabilities)``:
            vocabulary: Sorted list of the distinct tokens seen in training.
            class_probabilities: ``{label: fraction of documents with label}``.
            word_probabilities: ``{label: {word: P(word | label)}}`` with one
                entry per vocabulary word, computed as
                ``(count + 1) / (tokens_in_class + len(vocabulary))``.

    Raises:
        ValueError: If ``text_train`` and ``labels_train`` differ in length.
    """
    # zip() would silently drop the unmatched tail while the class priors
    # below are still computed from every label, so fail loudly instead.
    if len(text_train) != len(labels_train):
        raise ValueError(
            "text_train and labels_train must have the same length "
            f"(got {len(text_train)} and {len(labels_train)})"
        )

    # Tokenize each document once; reused for the vocabulary and the counts.
    tokenized_documents = [document.split() for document in text_train]

    # Sorted so the returned order does not depend on string hash
    # randomization between kernel restarts.
    distinct_words = set()
    for tokens in tokenized_documents:
        distinct_words.update(tokens)
    vocabulary = sorted(distinct_words)

    # Class priors: relative frequency of each label.
    label_counts = collections.Counter(labels_train)
    class_probabilities = {
        label: count / len(labels_train) for label, count in label_counts.items()
    }

    # Per-class token counts. Counter returns 0 for unseen words without
    # inserting a key, so lookups below do not grow the tables.
    class_word_count = {label: collections.Counter() for label in label_counts}
    for tokens, label in zip(tokenized_documents, labels_train):
        class_word_count[label].update(tokens)

    # Smoothed likelihoods for every (class, vocabulary word) pair.
    word_probabilities = {}
    for label, counts in class_word_count.items():
        denominator = sum(counts.values()) + len(vocabulary)
        word_probabilities[label] = {
            word: (counts[word] + 1) / denominator for word in vocabulary
        }

    return vocabulary, class_probabilities, word_probabilities

def predict_naive_bayes(text, vocabulary, class_probabilities, word_probabilities):
    """Return the most probable class label for a single document.

    The score of each class is ``log(prior)`` plus the sum of
    ``log(P(word | class))`` over every token of ``text`` (split on
    whitespace) that appears in ``vocabulary``; tokens outside the
    vocabulary are ignored.

    Args:
        text: Document string to classify.
        vocabulary: Iterable of known words (list, set, ...).
        class_probabilities: ``{label: prior probability}``.
        word_probabilities: ``{label: {word: P(word | label)}}``.

    Returns:
        The label with the highest log score. On a tie, the label that comes
        first in ``class_probabilities`` wins.

    Raises:
        ValueError: If ``class_probabilities`` is empty (nothing to choose).
        KeyError: If a vocabulary word is missing from a class's word table.
    """
    # A list membership test scans the whole vocabulary for every token;
    # build a hash-based lookup once per call instead.
    if isinstance(vocabulary, (set, frozenset, dict)):
        known_words = vocabulary
    else:
        known_words = set(vocabulary)

    # Work in log space so long documents do not underflow to zero.
    document_probabilities = {
        label: np.log(prior) for label, prior in class_probabilities.items()
    }
    for word in text.split():
        if word in known_words:
            for label, likelihoods in word_probabilities.items():
                document_probabilities[label] += np.log(likelihoods[word])

    return max(document_probabilities, key=document_probabilities.get)

# Fit the classifier on the training split
vocabulary, class_probabilities, word_probabilities = train_naive_bayes(
    text_train,
    labels_train,
)

# Classify every document of the test split
predicted_labels = [
    predict_naive_bayes(document, vocabulary, class_probabilities, word_probabilities)
    for document in text_test
]


In [11]:
def calculate_accuracy(predicted_labels, actual_labels):
    """Return the fraction of positions where prediction and truth agree.

    Args:
        predicted_labels: Sequence of predicted labels.
        actual_labels: Sequence of true labels, aligned with
            ``predicted_labels``.

    Returns:
        A float in ``[0, 1]``: number of equal pairs divided by the number of
        labels.

    Raises:
        ValueError: If the two sequences differ in length.
        ZeroDivisionError: If both sequences are empty.
    """
    # zip() stops at the shorter input, which would silently ignore surplus
    # predictions or count missing ones as errors; reject misaligned inputs.
    if len(predicted_labels) != len(actual_labels):
        raise ValueError(
            "predicted_labels and actual_labels must have the same length "
            f"(got {len(predicted_labels)} and {len(actual_labels)})"
        )

    correct = sum(
        predicted == actual
        for predicted, actual in zip(predicted_labels, actual_labels)
    )
    return correct / len(actual_labels)

# `predicted_labels` was already computed for text_test in the cell that
# trains the classifier; reuse it instead of running the (deterministic)
# prediction pass over the whole test set a second time.

# Calculate the accuracy of the classifier (printed as a percentage)
accuracy = calculate_accuracy(predicted_labels, labels_test)
print("Accuracy:", accuracy*100)


Accuracy: 76.58975801913337
