In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import datasets as ds
from collections import Counter
import re
from bs4 import BeautifulSoup

## The dataset

### Question 1
How many splits does the dataset has?

In [2]:
splits = ds.get_dataset_split_names('imdb')
print(splits)
print(f'Number of splits: {len(splits)}')

['train', 'test', 'unsupervised']
Number of splits: 3


There are 3 splits in the IMDB dataset.

### Question 2
How big are these splits?

In [3]:
dataset_0 = ds.load_dataset('imdb', split=splits[0])
dataset_1 = ds.load_dataset('imdb', split=splits[1])
dataset_2 = ds.load_dataset('imdb', split=splits[2])

Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [4]:
print(f'{splits[0]} split size : {dataset_0.num_rows}')
print(f'{splits[1]} split size : {dataset_1.num_rows}')
print(f'{splits[2]} split size : {dataset_2.num_rows}')

train split size : 25000
test split size : 25000
unsupervised split size : 50000


### Question 3
What is the proportion of each class on the supervised splits?

In [5]:
# Dataset 0: Train
train_data_frame = dataset_0.to_pandas()
print("Train")
print(train_data_frame.where(train_data_frame['label'] == 0).count())
print(train_data_frame.where(train_data_frame['label'] == 1).count())

print("\n")
# Dataset 1: Test
print("Test")
test_data_frame = dataset_1.to_pandas()
print(test_data_frame.where(test_data_frame['label'] == 0).count())
print(test_data_frame.where(test_data_frame['label'] == 1).count())


Train
text     12500
label    12500
dtype: int64
text     12500
label    12500
dtype: int64


Test
text     12500
label    12500
dtype: int64
text     12500
label    12500
dtype: int64


Hence, each class represents 50% of the supervised dataset (both in train and test samples).

## Naive Bayes classifier 

### Question 1
Create an adapted processing function which lower case the text and replace punctuations with text:

In [6]:


def clean_html(text: str) -> str:
  """Function which removes unuseful HTML tags of provided text."""
  no_html = BeautifulSoup(text).get_text()
  return no_html

def text_processing(text: str) -> str:
  """Function which  process provided text, it removes punctuations, unuseful spaces and html tags."""
  result_text = text
  result_text = clean_html(result_text)
  result_text = result_text.lower()
  pattern = r"(?<![a-zA-Z])[^\w\s]|[^\w\s](?![a-zA-Z])"
  result_text = re.sub(pattern, "", result_text)
  result_text = result_text.strip()
  return re.sub("(\s+)", " ", result_text)

#tiny test
result = text_processing("Hello,        U.S.A!")
expected = 'hello u.s.a'
assert result == expected or print(result)

Now let's apply `text_processing` function on `text` field of our dataframe.

In [7]:
train_data_frame.text = train_data_frame.text.apply(text_processing)

## Get Vocabulary

### Tokenize function
Function to cut processed text into tokens.

In [8]:
def tokenize(text: str)-> list:
    return [w for w in re.split("\W+", text)]


### Create Dataframe for vocabulary
This function compute the vocabulary dataset.

In [88]:
def compute_vocabulary_series(text_serie: pd.Series) -> Counter:
    vocabulary = None
    for text in text_serie:
        word_list = tokenize(text=text)
        if(vocabulary is None):
            vocabulary =Counter(word_list)
            continue
        vocabulary.update(word_list)
    return vocabulary

Let's compute the vocabulary and change the label type of the train data frame.

In [89]:
vocabulary = compute_vocabulary_series(train_data_frame.text)
train_data_frame.label = train_data_frame.label.astype(str)
counter_class = train_data_frame.groupby("label").agg({'text':compute_vocabulary_series})

Function which returns occurence of a word of a specific class.

In [90]:
def word_count(counter_class: pd.DataFrame, class_name: str, word: str) -> int:
    return counter_class.loc[class_name]["text"][word]

## Implementation of naive Bayes classifier

Let's implement this pseudo code:

![Alt text](nbc.png)

In [94]:
def total_words(vocabulary: Counter, c: str,counter_class: pd.DataFrame):
    total = 0
    for w in vocabulary:
        total += (word_count(counter_class,c,w))
    return total

Function which affect a specified value for loglikelihood dictionnary at index `word,class_value`. It represent the loglikelihood for a word of a specific class.

In [95]:
def fill_loglikelihood(loglikelihood: dict, word: str, class_value:str, value_to_affect: float) -> None:
    if (loglikelihood.get(word) is None):
        loglikelihood[word] = {}
    loglikelihood[word][class_value] = value_to_affect

In [102]:
def naive_bayes_classifier(train_data_frame: pd.DataFrame, vocabulary: Counter, counter_class: pd.DataFrame) :
    total_document_count = train_data_frame.text.count()
    class_label_set = list(train_data_frame.groupby("label").groups.keys())
    logprior = {}
    loglikelihood ={}
    for current_class in class_label_set:
        class_document_count = train_data_frame[train_data_frame.label == current_class].text.count()
        logprior[current_class] = np.log(class_document_count/total_document_count)
        counter = 0
        total = total_words(vocabulary,current_class,counter_class) + len(vocabulary)
        
        for word in vocabulary:
            count_w_c = word_count(counter_class, current_class, word) +1
            log_like_value = np.log(count_w_c/total)
            fill_loglikelihood(loglikelihood,word,current_class,log_like_value)
            
    return logprior, loglikelihood , vocabulary



## Test Naive Bayes classifier

Here is the test function of our naive Bayes classifier which apply for one testdoc.

In [103]:
def test_naive_bayes_classifier(testdoc :str, logprior:dict, loglikelihood: dict, train_data_frame: pd.DataFrame, vocabulary: Counter) -> tuple:
    class_set = list(train_data_frame.groupby("label").groups.keys())
    sums = {}
    max_class = None
    for c in class_set:
        sums[c] = logprior[c]
        word_list = tokenize(testdoc)
        for w in word_list:
            if(vocabulary[w] != 0):
                sums[c] = sums[c] + loglikelihood[w][c]
        if (max_class is None or sums[max_class] < sums[c]):
            max_class = c
        
    return max_class

### Process data
Make prediction on each text from `test_data_frame` and store them in `model_result`. 

In [104]:
test_data_frame.label = test_data_frame.label.astype(str)
logprior, loglikelihood, vocabulary =naive_bayes_classifier(train_data_frame,vocabulary,counter_class)
test_data_frame_2 = test_data_frame
test_nbc_function = lambda text : test_naive_bayes_classifier(text, logprior,loglikelihood,train_data_frame,vocabulary)
test_data_frame_2["model_result"] = test_data_frame_2.text.apply(test_nbc_function)

### Get results
We are now able to get the good predictions count, hence we can get an accuracy ratio.

In [106]:
good_predictions_count = (test_data_frame_2[test_data_frame_2.label == test_data_frame_2.model_result]).label.count()
text_count = test_data_frame_2.text.count()
accuracy_ratio = good_predictions_count/text_count
print(f'{ accuracy_ratio*100 = }')

 accuracy_ratio*100 = 82.396
