In [4]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


dataset_path = os.path.join(os.getcwd(), 'dataset-news')
content_list = []
label_list = []
for file in os.listdir(dataset_path):
    file_path = os.path.join(dataset_path, file)
    if os.path.isfile(file_path):
        # get the prefix of the file name as the target
        prefix = re.match(r"^[^\d]+", file)
        label_list.append(prefix.group())
        # read file content
        with open(file_path, 'r') as f:
            content = f.read()
            content_list.append(content)

count_vect = CountVectorizer(stop_words='english')
X = count_vect.fit_transform(content_list).toarray()

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(np.array(label_list))

kf = KFold(n_splits=10)
gnb = GaussianNB()
mnb = MultinomialNB()
gnb_accuracies = []
mnb_accuracies = []
# use 10-fold cross validation
for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    gnb.fit(X_train, y_train)
    mnb.fit(X_train, y_train)
    y_pred_gnb = gnb.predict(X_test)
    y_pred_mnb = mnb.predict(X_test)
    gnb_accuracies.append(accuracy_score(y_test, y_pred_gnb))
    mnb_accuracies.append(accuracy_score(y_test, y_pred_mnb))

print(f'GaussianNB accuracy: {np.mean(gnb_accuracies):.2f}')
print(f'MultinomialNB accuracy: {np.mean(mnb_accuracies):.2f}')


GaussianNB accuracy: 0.72
MultinomialNB accuracy: 0.79


**(a) Answer**

GaussianNB accuracy: 0.72

MultinomialNB accuracy: 0.79

The multinomial Naive Bayes classifier performs better than the Gaussian Naive Bayes classifier. The multinomial Naive Bayes classifier is more suitable for text classification tasks, as it models the frequency of words in the document. The Gaussian Naive Bayes classifier assumes that the features are normally distributed, which may not be the case for text data. Therefore, the multinomial Naive Bayes classifier is more appropriate for text classification tasks.

In [2]:
# random classifier
true_labels = y
random_labels = np.random.randint(0, 8, 800)
accuracy = accuracy_score(true_labels, random_labels)
print(f'Random Classifier Accuracy: {accuracy:.2f}')

Random Classifier Accuracy: 0.13


**(b) Answer**

Random Classifier Accuracy: 0.13

The performance of the random classifier is very poor, which is expected. The random classifier assigns a random label to each instance. As there are 8 labels in total, the probability of assigning the correct label to an instance should be around 1/8 = 0.125.

In [5]:
# Do not remove the stop words
count_vect = CountVectorizer()
X = count_vect.fit_transform(content_list).toarray()

for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    mnb.fit(X_train, y_train)
    y_pred_mnb = mnb.predict(X_test)
    mnb_accuracies.append(accuracy_score(y_test, y_pred_mnb))

print(f'MultinomialNB accuracy: {np.mean(mnb_accuracies):.2f}')

MultinomialNB accuracy: 0.76


**(c) Answer**

MultinomialNB accuracy with stopwords: 0.76

The accuracy of the Multinomial Naive Bayes classifier decreases when stop words are not removed. Stop words are common words that do not carry much information about the content of the text, such as "the", "is", "and", etc. By removing stop words, we can reduce the dimensionality of the feature space and focus on more meaningful words that can help in classification. When stop words are not removed, the classifier may be influenced by these common words, leading to a decrease in accuracy.

In [6]:
def get_task_documents(topics):
    task_content_list = []
    task_label_list = []
    for i in range(0, len(content_list)):
        if label_list[i] in topics:
            task_content_list.append(content_list[i])
            task_label_list.append(label_list[i])
    return task_content_list, task_label_list


def evaluate_task(topics):
    task_content_list, task_label_list = get_task_documents(topics)
    count_vect = CountVectorizer(stop_words='english')
    X = count_vect.fit_transform(task_content_list).toarray()
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(np.array(task_label_list))
    for train, test in kf.split(X):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        mnb.fit(X_train, y_train)
        y_pred_mnb = mnb.predict(X_test)
        mnb_accuracies.append(accuracy_score(y_test, y_pred_mnb))
    return np.mean(mnb_accuracies)

task1_topics = ["pol-guns", "hockey", "mac-hw"]
task2_topics = ["mac-hw", "ibm-hw", "electronics"]
print(f'Task 1 Accuracy: {evaluate_task(task1_topics):.2f}')
print(f'Task 2 Accuracy: {evaluate_task(task2_topics):.2f}')

Task 1 Accuracy: 0.82
Task 2 Accuracy: 0.79


**(d) Answer**

Task 1 Accuracy: 0.82

Task 2 Accuracy: 0.79

The accuracy of the Multinomial Naive Bayes classifier is higher for Task 1 compared to Task 2. This indicates that the classifier performs better on documents related to the topics "use of guns", "hockey", and "Mac hardware" compared to the topics "Mac hardware", "IBM hardware", and "electronics".

Task 1 achieves higher accuracy due to clear topic distinctions. The vocabulary overlap is minimal, so the classifier can distinguish classes easily. However, task 2 has lower accuracy because of overlapping vocabulary and semantics among the topics. The topics are related. Significant vocabulary overlap (e.g., terms like “hardware”, “device”) may confuse the classifier.