In [1]:
from datasets import load_dataset


# Load the "imdb" dataset through the Hugging Face `datasets` library
# (the cell output below shows it being served from the local cache).
dataset = load_dataset("imdb")

Found cached dataset imdb (/home/coartix/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

## The dataset

In [45]:
# Display the DatasetDict to inspect its splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

1. The dataset has 3 splits: `train`, `test` and `unsupervised`.

2. The train and test splits have 25000 rows, and the unsupervised split has 50000 rows.

In [3]:
# Share of positive labels per split; the mean of the labels is the positive
# share assuming labels are encoded as 0 (negative) / 1 (positive).
positive_share = {}
for split in ("train", "test"):
    labels = dataset[split]["label"]
    positive_share[split] = sum(labels) / len(labels)

pos_train, pos_test = positive_share["train"], positive_share["test"]
# Whatever is not positive is counted as negative.
neg_train, neg_test = 1 - pos_train, 1 - pos_test

report = (
    ("Proportion of positive labels in train split: ", pos_train),
    ("Proportion of positive labels in test split: ", pos_test),
    ("Proportion of negative labels in train split: ", neg_train),
    ("Proportion of negative labels in test split: ", neg_test),
)
for message, value in report:
    print(message, value)

Proportion of positive labels in train split:  0.5
Proportion of positive labels in test split:  0.5
Proportion of negative labels in train split:  0.5
Proportion of negative labels in test split:  0.5


3. Both the train split and the test split are balanced: 50% of their labels are positive and 50% are negative.

## Naive Bayes Classifier

In [52]:
# Create an adapted preprocessing function that at least lowercases the text and replaces punctuation with spaces
# You can use `from string import punctuation` to get the punctuation characters; maybe not all of them should be removed
from string import punctuation
import numpy as np

def preprocess_function(dataset):
    '''
        Lowercase every review and replace punctuation with spaces.

        Hyphens are kept so that hyphenated words stay a single token.
        Runs of spaces are NOT collapsed here: the documents are later
        tokenised with str.split(), which ignores repeated whitespace.

        Input: dataset, a mapping with "train" and "test" splits, each
               exposing a "text" sequence of strings and a "label" sequence
        Output: train_docs, test_docs, train_classes, test_classes
                (numpy arrays; docs and classes of a split are aligned)
    '''
    punct = punctuation.replace("-", "")
    # Build the translation table once instead of once per document.
    table = str.maketrans(punct, " " * len(punct))

    def clean(texts):
        # dtype=str keeps an empty split as an (empty) string array.
        return np.array([text.lower().translate(table) for text in texts], dtype=str)

    train_docs = clean(dataset["train"]["text"])
    # Fix: the test documents must come from the "test" split. They were
    # previously built from "train", so the model was evaluated on training
    # reviews paired with unrelated test labels.
    test_docs = clean(dataset["test"]["text"])
    train_classes = np.array(dataset["train"]["label"])
    test_classes = np.array(dataset["test"]["label"])
    return train_docs, test_docs, train_classes, test_classes

In [58]:
import math

def train_naive_bayes(docs: np.ndarray, classes: np.ndarray):
    '''
        Input: docs, classes
        Output: log_prior, log_likelihood, vocab
    '''
    # Get each word uniquely in the docs
    vocab = np.unique(np.concatenate([text.split() for text in docs]))
    
    Ndoc = len(docs)

    # Get the number of different classes
    unique_classes = np.unique(classes)
    for c in unique_classes:
        Nc = sum(classes == c)

        # Compute the log prior probability of class c
        log_prior = math.log(Nc / Ndoc)
        
        # create bigdoc which is an histogram of the words in the docs of class c
        bigdoc = np.concatenate([text.split() for text in docs[classes == c]])
        histo = np.array([sum(bigdoc == word) for word in vocab])

        # Compute the log likelihood of each word in the vocabulary for class c
        log_likelihood = np.log((histo + 1) / (len(bigdoc) + len(vocab)))
        print('Half done')
    
    return log_prior, log_likelihood, vocab

In [28]:
def test_naive_bayes(test_doc: str, log_prior: float, log_likelihood: np.ndarray, unique_test_classes: np.ndarray, vocab: np.ndarray):
    '''
        Input: test_doc, log_prior, log_likelihood, test_classes, vocab
        Output: accuracy
    '''
    sums = np.zeros(len(unique_test_classes))
    for c in unique_test_classes:
        # Compute the log posterior probability of class c
        log_posterior = log_prior + sum(log_likelihood[np.where(np.isin(vocab, test_doc.split()))])
        sums[c] = log_posterior
    return np.argmax(sums)
    
    

In [60]:
# Preprocess both splits and report the shape of each resulting array
train_docs, test_docs, train_classes, test_classes = preprocess_function(dataset)
print(*(arr.shape for arr in (train_docs, test_docs, train_classes, test_classes)))

# Vocabulary = every distinct token of the training documents
train_tokens = np.concatenate([doc.split() for doc in train_docs])
vocab = np.unique(train_tokens)
print(vocab.shape)


(25000,) (25000,) (25000,) (25000,)
(95669,)


In [61]:
# Train the model on the first hundredth of the training data
n_docs_subset = int(len(train_docs) / 100)
n_classes_subset = int(len(train_classes) / 100)
log_prior, log_likelihood, vocab = train_naive_bayes(
    train_docs[:n_docs_subset],
    train_classes[:n_classes_subset],
)
print(log_prior, log_likelihood.shape, vocab.shape)

7528
ok
yes
no
0.0 (7528,) (7528,)


In [67]:
# Evaluate the model on every test document.
# The number of documents is taken from the data instead of a hard-coded
# 25000, and correct predictions are counted separately from the final ratio.
unique_test_classes = np.unique(test_classes)
n_test = len(test_docs)
n_correct = sum(
    test_naive_bayes(test_docs[i], log_prior, log_likelihood, unique_test_classes, vocab) == test_classes[i]
    for i in range(n_test)
)
accuracy = n_correct / n_test
print(accuracy)


0.5
