*Bonus question - for bonus marks* We will now look at a more challenging text-based classification
problem, namely to classify a page from a Harry Potter book into which of the seven books the page was
taken from. The books can be found in the zip file hp_books.zip and are text files where each page of a
given book is a line in the text file. Note, all punctuation and capital letters have been removed from the
file, so that only the words of the page remain to be used by our model.

(a) Train an NB model using 80% of the data to train and the remaining 20% as test data. Use Laplace
smoothing for your model. Report a confusion matrix of your results.

In [11]:
import os
import random
from collections import defaultdict
from sklearn.metrics import confusion_matrix
import math

#Read data from text files
data_dir = "hp_books"
book_pages = defaultdict(list)
for book_file in os.listdir(data_dir):
    with open(os.path.join(data_dir, book_file), "r") as f:
        for line in f:
            book_pages[book_file].append(line.strip())

#Splitting data into sets
train_pages = []
test_pages = []
for book in book_pages:
    num_pages = len(book_pages[book])
    split_index = int(num_pages * 0.8) # Use 80% of data for training
    train_pages += [(page, book) for page in book_pages[book][:split_index]]
    test_pages += [(page, book) for page in book_pages[book][split_index:]]

vocabulary = set()
class_counts = defaultdict(int)
for page, book in train_pages:
    class_counts[book] += 1
    vocabulary.update(page.split())
num_classes = len(class_counts)
class_priors = {book: count/len(train_pages) for book, count in class_counts.items()}

#Compute word counts for each class
word_counts = {book: defaultdict(int) for book in class_counts}
for page, book in train_pages:
    for word in page.split():
        word_counts[book][word] += 1

#Training
k = 1 
class_likelihoods = {book: defaultdict(float) for book in class_counts}
for book in word_counts:
    total_words = sum(word_counts[book].values())
    for word in vocabulary:
        count = word_counts[book][word]
        class_likelihoods[book][word] = (count + k) / (total_words + k * len(vocabulary))

y_true = [book for _, book in test_pages]
y_pred = []
for page, _ in test_pages:
    log_probs = {book: 0 for book in class_counts}
    for book in class_counts:
        log_probs[book] += math.log(class_priors[book])
        for word in page.split():
            if word in vocabulary:
                log_probs[book] += math.log(class_likelihoods[book][word])
    pred_class = max(log_probs, key=log_probs.get)
    y_pred.append(pred_class)

#Confusion Matrix
labels = list(class_counts.keys())
print_confusion_matrix = confusion_matrix(y_true, y_pred, labels=labels)
print(f"Confusion matrix:\n{print_confusion_matrix}")
#not required
accuracy = sum([1 for i in range(len(y_true)) if y_true[i] == y_pred[i]]) / len(y_true)
print(f"Accuracy: {round(accuracy*100, 4)}%")

Confusion matrix:
[[  5   0   2   8  36  11   8]
 [  0  12   1   9  20  19  15]
 [  0   0  35   2  44   4  13]
 [  0   0   0  70  36  16  40]
 [  0   0   0   4 152  16  49]
 [  0   0   0   5  28  55  58]
 [  1   0   1   4  31  17 116]]
Accuracy: 47.1898197242842%


(b) Adapt your code to use 80% of the data to train, 10% of the data as validation data and the remaining
10% as test data. Train separate NB classifiers using the values {1  * 10^-1; 1  * 10^-2; 1  * 10^-3; 1  * 10^-4; 1 * 10^-5; 1  * 10^-6} to smooth the table of likelihoods. Train each model using the training data,
and track its performance on the validation data. 

In [27]:
import os
import random
from collections import defaultdict
from sklearn.metrics import confusion_matrix
import math

# Read data from text files
data_dir = "hp_books"
book_pages = defaultdict(list)
for book_file in os.listdir(data_dir):
    with open(os.path.join(data_dir, book_file), "r") as f:
        for line in f:
            book_pages[book_file].append(line.strip())

train_pages = []
val_pages = []
test_pages = []
for book in book_pages:
    num_pages = len(book_pages[book])
    train_split_index = int(num_pages * 0.8)  # Use 80% of data for training
    val_split_index = int(num_pages * 0.9)  # Use 10% of data for validation
    train_pages += [(page, book) for page in book_pages[book][:train_split_index]]
    val_pages += [(page, book) for page in book_pages[book][train_split_index:val_split_index]]
    test_pages += [(page, book) for page in book_pages[book][val_split_index:]]

vocabulary = set()
class_counts = defaultdict(int)
for page, book in train_pages:
    class_counts[book] += 1
    vocabulary.update(page.split())
num_classes = len(class_counts)

total_train_pages = len(train_pages)
class_priors = {book: count / total_train_pages for book, count in class_counts.items()}
smoothing_vals = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
best_accuracy = 0
best_conf_mat = None
#track performance on validation data
val_performance = []
for k in smoothing_vals:
    word_counts = {book: defaultdict(int) for book in class_counts}
    for page, book in train_pages:
        for word in page.split():
            word_counts[book][word] += 1

    class_likelihoods = {book: defaultdict(float) for book in class_counts}
    for book in word_counts:
        total_words = sum(word_counts[book].values())
        for word in vocabulary:
            count = word_counts[book][word]
            class_likelihoods[book][word] = (count + k) / (total_words + k * len(vocabulary))

    y_true = [book for _, book in val_pages]
    y_pred = []
    for page, _ in val_pages:
        log_probs = {book: 0 for book in class_counts}
        for book in class_counts:
            log_probs[book] += math.log(class_priors[book])
            for word in page.split():
                if word in vocabulary:
                    log_probs[book] += math.log(class_likelihoods[book][word])
        pred_class = max(log_probs, key=log_probs.get)
        y_pred.append(pred_class)

    accuracy = round((sum([1 for i in range(len(y_true)) if y_true[i] == y_pred[i]]) / len(y_true)), 8)
    val_performance.append(accuracy)
    labels = list(class_counts.keys())
    print_confusion_matrix = confusion_matrix(y_true, y_pred, labels=labels)
for i in range(len(val_performance)):
    val_performance[i] = "{:.4f}%".format(val_performance[i] * 100)
print(f"Confusion matrix:\n{print_confusion_matrix}")
print(f"Smoothing values: {smoothing_vals}\nAccuracy on Validation Performance: {val_performance}")


Confusion matrix:
[[ 7  1  4  4 17  2  0]
 [ 0  9  2  3 14  3  7]
 [ 0  0 24  1  9  3 12]
 [ 0  0  0 31 19  8 23]
 [ 0  0  0  4 96  1  9]
 [ 0  0  0  7 18 26 22]
 [ 0  1  2  3 16  9 54]]
Smoothing values: [0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06]
Accuracy on Validation Performance: ['65.1805%', '63.4820%', '59.2357%', '56.2633%', '53.9278%', '52.4416%']


(c) Use the model which achieved the best validation accuracy and test it using the test data set. Report a confusion matrix of the results, as well as the test accuracy of the model.

In [28]:
import os
import random
from collections import defaultdict
from sklearn.metrics import confusion_matrix
import math

#Read data from text files
data_dir = "hp_books"
book_pages = defaultdict(list)
for book_file in os.listdir(data_dir):
    with open(os.path.join(data_dir, book_file), "r") as f:
        for line in f:
            book_pages[book_file].append(line.strip())

train_pages = []
val_pages = []
test_pages = []
for book in book_pages:
    num_pages = len(book_pages[book])
    train_split_index = int(num_pages * 0.8)  #Use 80% of data for training
    val_split_index = int(num_pages * 0.9)  #Use 10% of data for validation, just doing this based off the previous question, dont think we need to do 80% training and 20% test data
    train_pages += [(page, book) for page in book_pages[book][:train_split_index]]
    val_pages += [(page, book) for page in book_pages[book][train_split_index:val_split_index]]
    test_pages += [(page, book) for page in book_pages[book][val_split_index:]]

vocabulary = set()
class_counts = defaultdict(int)
for page, book in train_pages:
    class_counts[book] += 1
    vocabulary.update(page.split())
num_classes = len(class_counts)

total_train_pages = len(train_pages)
class_priors = {book: count / total_train_pages for book, count in class_counts.items()}

#Train using the (1*10^-1) smoothing value
k = 1e-1
word_counts = {book: defaultdict(int) for book in class_counts}
for page, book in train_pages:
    for word in page.split():
        word_counts[book][word] += 1

class_likelihoods = {book: defaultdict(float) for book in class_counts}
for book in word_counts:
    total_words = sum(word_counts[book].values())
    for word in vocabulary:
        count = word_counts[book][word]
        class_likelihoods[book][word] = (count + k) / (total_words + k * len(vocabulary))

y_true = [book for _, book in test_pages]
y_pred = []
for page, _ in test_pages:
    log_probs = {book: 0 for book in class_counts}
    for book in class_counts:
        log_probs[book] += math.log(class_priors[book])
        for word in page.split():
            if word in vocabulary:
                log_probs[book] += math.log(class_likelihoods[book][word])
    pred_class = max(log_probs, key=log_probs.get)
    y_pred.append(pred_class)

accuracy = sum([1 for i in range(len(y_true)) if y_true[i] == y_pred[i]]) / len(y_true)
print_confusion_matrix = confusion_matrix(y_true, y_pred, labels=list(class_counts.keys()))
print(f"Confusion matrix:\n{print_confusion_matrix}")
print(f"Test Accuracy for 1e-1: {round(accuracy,4)} = {round(accuracy*100, 4)}%")


Confusion matrix:
[[19  1  2  3  4  6  0]
 [ 0 19  0  1  2 13  3]
 [ 0  0 29  2 13  2  3]
 [ 1  0  0 40 17  9 14]
 [ 0  1  1  2 44 23 40]
 [ 0  1  1  1 14 24 32]
 [ 1  1  0  0  6 14 63]]
Test Accuracy for 1e-1: 0.5042 = 50.4237%
