In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import datasets as ds


## The dataset

### Question 1
How many splits does the dataset has?

In [2]:
splits = ds.get_dataset_split_names('imdb')
print(splits)
print(f'Number of splits: {len(splits)}')

['train', 'test', 'unsupervised']
Number of splits: 3


There are 3 splits in the IMDB dataset.

### Question 2
How big are these splits?

In [3]:
dataset_0 = ds.load_dataset('imdb', split=splits[0])
dataset_1 = ds.load_dataset('imdb', split=splits[1])
dataset_2 = ds.load_dataset('imdb', split=splits[2])

Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [4]:
print(f'{splits[0]} split size : {dataset_0.num_rows}')
print(f'{splits[1]} split size : {dataset_1.num_rows}')
print(f'{splits[2]} split size : {dataset_2.num_rows}')

train split size : 25000
test split size : 25000
unsupervised split size : 50000


### Question 3
What is the proportion of each class on the supervised splits?

In [5]:
# Dataset 0: Train
train_data_frame = dataset_0.to_pandas()
print("Train")
print(train_data_frame.where(train_data_frame['label'] == 0).count())
print(train_data_frame.where(train_data_frame['label'] == 1).count())

print("\n")
# Dataset 1: Test
print("Test")
test_data_frame = dataset_1.to_pandas()
print(test_data_frame.where(test_data_frame['label'] == 0).count())
print(test_data_frame.where(test_data_frame['label'] == 1).count())


Train
text     12500
label    12500
dtype: int64
text     12500
label    12500
dtype: int64


Test
text     12500
label    12500
dtype: int64
text     12500
label    12500
dtype: int64


Hence, each class represents 50% of the supervised dataset (both in train and test samples).

## Naive Bayes classifier 

### Question 1
Create an adapted processing function which lower case the text and replace punctuations with text:

In [6]:
import re
from bs4 import BeautifulSoup

def clean_html(text: str) -> str:
  """Function which removes unuseful HTML tags of provided text."""
  no_html = BeautifulSoup(text).get_text()
  return no_html

def text_processing(text: str) -> str:
  """Function which  process provided text, it removes punctuations, unuseful spaces and html tags."""
  result_text = text
  result_text = clean_html(result_text)
  result_text = result_text.lower()
  pattern = r"(?<![a-zA-Z])[^\w\s]|[^\w\s](?![a-zA-Z])"
  result_text = re.sub(pattern, "", result_text)
  result_text = result_text.strip()
  return re.sub("(\s+)", " ", result_text)

#tiny test
result = text_processing("Hello,        U.S.A!")
expected = 'hello u.s.a'
assert result == expected or print(result)

Now let's apply `text_processing` function on `text` field of our dataframe.

In [7]:
train_data_frame.text = train_data_frame.text.apply(text_processing)

## Get Vocabulary

### Tokenize function
Function to cut processed text into tokens.

In [8]:
def tokenize(text: str)-> list:
    return [w for w in re.split("\W+", text)]


### Create Dataframe for vocabulary
This function compute the vocabulary dataset.

In [9]:
from collections import Counter

def compute_vocabulary(train_data_frame: pd.DataFrame) -> Counter:
    vocabulary = None
    for text in train_data_frame.text:
        word_list = tokenize(text=text)
        if(vocabulary is None):
            vocabulary =Counter(word_list)
            continue
        vocabulary.update(word_list)
    return vocabulary

def compute_vocabulary_series(text_serie: pd.Series) -> Counter:
    vocabulary = None
    for text in text_serie:
        word_list = tokenize(text=text)
        if(vocabulary is None):
            vocabulary =Counter(word_list)
            continue
        vocabulary.update(word_list)
    return vocabulary


In [10]:
vocabulary = compute_vocabulary(train_data_frame)

In [11]:
train_data_frame.label = train_data_frame.label.astype(str)

In [12]:
counter_class = train_data_frame.groupby("label").agg({'text':compute_vocabulary_series})

In [13]:
def word_count(counter_class: pd.DataFrame, class_name: str, word: str) -> int:
    return counter_class.loc[class_name]["text"][word]

### Implementation

Let's implement this pseudo code:

![Alt text](nbc.png)

In [30]:
total = 0
c = "1"

def total_words(vocabulary: Counter, c: str):
    total = 0
    for w in vocabulary:
        total += (word_count(counter_class,c,w)+1)
    return total
    
total_words(vocabulary,"1")

3089707

In [32]:
def fill_loglikelihood(loglikelihood: dict, w: str, c:str, value: float):
    if (loglikelihood.get(w) is None):
        loglikelihood[w] = {}
    loglikelihood[w][c] = value

In [None]:
def naive_bayes_classifier(train_data_frame: pd.DataFrame, vocabulary: Counter, counter_class: pd.DataFrame) :
    n_doc = train_data_frame.text.count()
    class_set = list(train_data_frame.groupby("label").groups.keys())
    logprior = {}
    loglikelihood ={}
    for c in class_set:
        n_c = train_data_frame[train_data_frame.label == c].text.count()
        print(n_doc)
        big_doc = train_data_frame[train_data_frame.label == c].text
        logprior[c] = np.log(n_c/n_doc)
        counter = 0
        total = total_words(vocabulary,c) + len(vocabulary)
        for w in vocabulary:
            count_w_c = word_count(counter_class, c, w) +1
            value = np.log(count_w_c/total)
            fill_loglikelihood(loglikelihood,w,c,value)
            counter +=1
            
            
    return logprior,loglikelihood   , vocabulary

logprior,loglikelihood , vocabulary =naive_bayes_classifier(train_data_frame,vocabulary,counter_class)

### Test Naive Bayes classifier

In [50]:
def test_naive_bayes_classifier(testdoc :str, logprior:dict, loglikelihood: dict, train_data_frame: pd.DataFrame, vocabulary: Counter) -> tuple:
    class_set = list(train_data_frame.groupby("label").groups.keys())
    sums = {}
    max_class = None
    for c in class_set:
        sums[c] = logprior[c]
        word_list = tokenize(testdoc)
        for w in word_list:
            if(vocabulary[w] != 0):
                sums[c] = sums[c] + loglikelihood[w][c]
        if (max_class is None or sums[max_class] < sums[c]):
            max_class = c
        
    return max_class

test_data_frame.label = test_data_frame.label.astype(str)


'0'

In [None]:
test_naive_bayes_classifier(test_data_frame.text[0], logprior,loglikelihood,train_data_frame,vocabulary)

In [64]:
test_data_frame_2 = test_data_frame
apply_function = lambda text : test_naive_bayes_classifier(text, logprior,loglikelihood,train_data_frame,vocabulary)

test_data_frame_2["model_result"] = test_data_frame_2.text.apply(apply_function)

In [85]:
good_predictions_count = (test_data_frame_2[test_data_frame_2.label == test_data_frame_2.model_result]).label.count()
text_count = test_data_frame_2.text.count()
accuracy_ratio = good_predictions_count/text_count
print(f'{ accuracy_ratio*100 = }')

 accuracy_ratio*100 = 82.44399999999999
