# First approach: Naive Bayes Classifier

## Dataset preprocessing

In [4]:
import pandas as pd

# Load the training reviews into a DataFrame; later cells read its
# 'review' (text) and 'sentiment' (compared against 1 and 0) columns.
df = pd.read_csv('../data/train_imdb_reviews.csv')

In [5]:
# Optional cell: run it only for the balanced-dataset experiments and skip it
# for the original (unbalanced) experiments.
#
# It undersamples the larger sentiment class so that both classes have the
# same number of reviews, then shuffles the result.

positive = df[df["sentiment"] == 1]
negative = df[df["sentiment"] == 0]
positive_count = positive.shape[0]
negative_count = negative.shape[0]
print("Total reviews:", df.shape[0])
print("Positive reviews:", positive_count)
print("Negative reviews:", negative_count)
print("ratio:", positive_count/negative_count)

print("-------------------------")

# Truncate BOTH classes to the size of the smaller one, so the cell balances
# the data regardless of which sentiment happens to be the majority.
balanced_count = min(positive_count, negative_count)
positive = positive[:balanced_count]
negative = negative[:balanced_count]
print("Positive reviews:", positive.shape[0])
print("Negative reviews:", negative.shape[0])
print("ratio:", positive.shape[0]/negative.shape[0])

# concat() puts every positive review before every negative one. The
# cross-validation further down cuts the data into contiguous slices, so the
# rows must be shuffled here or most folds would contain a single class.
# A fixed seed keeps the experiment reproducible.
df = pd.concat([positive, negative]).sample(frac=1, random_state=42).reset_index(drop=True)

Total reviews: 41669
Positive reviews: 27295
Negative reviews: 14374
ratio: 1.8989147071100598
-------------------------
Positive reviews: 14374
Negative reviews: 14374
ratio: 1.0


In [None]:
# Extract the review texts and their sentiment labels as numpy arrays
X_train = df['review'].to_numpy()
y_train = df['sentiment'].to_numpy()

# Sanity check: both arrays should report the same number of samples
print(len(X_train))
print(len(y_train))

In [7]:
import nltk
from nltk.probability import FreqDist
from itertools import chain
from nltk.tokenize import word_tokenize

# Count how often each lower-cased token occurs across all training reviews
freqDist = FreqDist()
for text in X_train:
    # word_tokenize splits the review into tokens; lower-case them before counting
    for token in map(str.lower, word_tokenize(text)):
        freqDist[token] += 1

In [8]:
# We only use the 10000 most frequent words as features.
# most_common() ranks the entries by count. In NLTK 3 FreqDist is a Counter
# subclass, so slicing keys() would instead return the first 10000 words
# *encountered*, not the most frequent ones.
word_features = [word for word, _ in freqDist.most_common(10000)]

# define a function to extract the features of a document
def document_features(document, vocabulary=None):
    """Build the bag-of-words presence features for one tokenized document.

    Args:
        document: iterable of string tokens (e.g. the output of word_tokenize).
        vocabulary: iterable of lower-cased feature words. Defaults to the
            module-level ``word_features`` list when omitted.

    Returns:
        dict mapping ``'contains(<word>)'`` to True/False for every word in
        the vocabulary, indicating whether the document contains that word.
    """
    if vocabulary is None:
        vocabulary = word_features
    # The vocabulary is built from lower-cased tokens, so the document tokens
    # must be lower-cased too; otherwise "Good" would never match "good".
    document_words = {token.lower() for token in document}
    return {
        'contains({})'.format(word): (word in document_words)
        for word in vocabulary
    }

In [9]:
# Tokenize every training review and convert it to a feature dict,
# then pair each feature dict with its sentiment label
train_feature_sets = [document_features(word_tokenize(text)) for text in X_train]
combined_data_train = list(zip(train_feature_sets, y_train))

## Create a Naive Bayes classifier model

In [None]:
# do 5-fold cross-validation to evaluate the classifier
from nltk import NaiveBayesClassifier

# set the number of folds and the base size of each fold
num_folds = 5
subset_size = len(combined_data_train) // num_folds
# track the classifier with the highest fold accuracy seen so far
best_classifier = None
best_accuracy = 0.0

# confusion-matrix counts accumulated over all folds
train_tp = 0
train_fp = 0
train_fn = 0
train_tn = 0

# NOTE: folds are contiguous slices of combined_data_train, so the data must
# already be in random order for each fold to contain both classes.
for i in range(num_folds):
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    # The last fold also takes the len % num_folds leftover samples, so every
    # sample is used for testing exactly once.
    fold_start = i * subset_size
    fold_end = (i + 1) * subset_size if i < num_folds - 1 else len(combined_data_train)
    testing_this_round = combined_data_train[fold_start:fold_end]
    training_this_round = combined_data_train[:fold_start] + combined_data_train[fold_end:]

    # With fewer samples than folds a split can be empty; skip it instead of
    # dividing by zero when computing the accuracy below.
    if not testing_this_round or not training_this_round:
        print('Round', i, 'skipped: not enough data for this fold')
        continue

    # train the classifier for this round
    classifier = NaiveBayesClassifier.train(training_this_round)

    # test the classifier for this round
    predictions = [classifier.classify(features) for features, _ in testing_this_round]

    # calculate the confusion matrix for this round (label 1 is the positive class)
    for prediction, (_, actual) in zip(predictions, testing_this_round):
        if prediction == 1:
            if actual == 1:
                tp += 1
            else:
                fp += 1
        else:
            if actual == 1:
                fn += 1
            else:
                tn += 1

    # accumulate to the confusion matrix values
    train_tp += tp
    train_fp += fp
    train_fn += fn
    train_tn += tn

    acc = (tp + tn) / (tp + tn + fp + fn)
    print('Round', i, 'accuracy:', acc)
    # if the classifier for this round is the best so far, save it
    if acc > best_accuracy:
        print('currently best classifier for round', i)
        best_accuracy = acc
        best_classifier = classifier

print("true positive:", train_tp)
print("true negative:", train_tn)
print("false positive:", train_fp)
print("false negative:", train_fn)

In [None]:
# calculate the evaluation metrics for training process
def auc(tp, fp, tn, fn):
    """Return the mean of the true-positive rate and the true-negative rate.

    Computed from a single confusion matrix: ``tp / (tp + fn)`` averaged with
    ``tn / (tn + fp)``. Raises ZeroDivisionError when either class has no
    samples (``tp + fn == 0`` or ``tn + fp == 0``).
    """
    true_positive_rate = tp / (tp + fn)
    true_negative_rate = tn / (tn + fp)
    return (true_positive_rate + true_negative_rate) / 2

# Work on the confusion-matrix counts accumulated over the cross-validation folds
tp, fp, tn, fn = train_tp, train_fp, train_tn, train_fn

print("true positives: ", tp)
print("true negatives: ", tn)
print("false positives: ", fp)
print("false negatives: ", fn)

# Total number of samples that were classified during cross-validation
evaluated_total = tp + tn + fp + fn

acc = (tp + tn) / evaluated_total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
# F1 is the harmonic mean of precision and recall
f1 = 2 * (precision * recall) / (precision + recall)

print("Accuracy: ", acc)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)
print("AUC: ", auc(tp, fp, tn, fn))
print(evaluated_total)

## Measure the results

In [13]:
# Train the final classifier from scratch on the complete training set.
# NaiveBayesClassifier.train is a classmethod that builds a brand-new model
# from the data it is given; it does not continue from an existing model.
# Calling it on the class makes that explicit and does not fail when
# cross-validation left best_classifier as None.
from nltk import NaiveBayesClassifier

final_classifier = NaiveBayesClassifier.train(combined_data_train)

In [14]:
# Persist the final classifier to disk with pickle
import pickle

# The file name embeds the current value of `acc`, rounded to 4 decimals
model_path = 'nbc_balanced_f_{}.pkl'.format(round(acc, 4))
with open(model_path, 'wb') as model_file:
    pickle.dump(final_classifier, model_file)

In [15]:
# Read the held-out test reviews
df_test = pd.read_csv('../data/test_imdb_reviews.csv')

# Reviews are cast to str before conversion; labels are taken as-is
X_test = df_test['review'].astype(str).to_numpy()
y_test = df_test['sentiment'].to_numpy()

# In a notebook only the last expression is displayed
len(X_test)
len(y_test)


4630

In [16]:
# Apply the same tokenization and feature extraction to the test reviews,
# then pair each feature dict with its sentiment label
test_feature_sets = [document_features(word_tokenize(text)) for text in X_test]
combined_data_test = list(zip(test_feature_sets, y_test))

In [17]:
# Classify every test feature set with the final classifier
# (the true labels are not needed at this step)
predictions = [final_classifier.classify(features) for features, _ in combined_data_test]

predictions[:10]

[1, 1, 1, 1, 0, 1, 1, 1, 1, 0]

In [None]:
# Tally the confusion matrix on the test set (label 1 is the positive class)
test_tp = test_tn = test_fp = test_fn = 0
for predicted, actual in zip(predictions, y_test):
    is_correct = predicted == actual
    if predicted == 1:
        # predicted positive: correct -> true positive, wrong -> false positive
        if is_correct:
            test_tp += 1
        else:
            test_fp += 1
    else:
        # predicted non-positive: correct -> true negative, wrong -> false negative
        if is_correct:
            test_tn += 1
        else:
            test_fn += 1

print("true positives: ", test_tp)
print("true negatives: ", test_tn)
print("false positives: ", test_fp)
print("false negatives: ", test_fn)


In [None]:
# Derive the evaluation metrics for the test set from its confusion matrix

test_total = test_tp + test_tn + test_fp + test_fn
predicted_positive = test_tp + test_fp
actual_positive = test_tp + test_fn

acc = (test_tp + test_tn) / test_total
precision = test_tp / predicted_positive
recall = test_tp / actual_positive
# F1 is the harmonic mean of precision and recall
f1 = 2 * (precision * recall) / (precision + recall)

print("Accuracy: ", acc)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)
print("AUROC: ", auc(test_tp, test_fp, test_tn, test_fn))