## Imports

In [3]:
import re
from collections import Counter

import datasets as ds
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk

from scripts import data

## The dataset

### Question 1
How many splits does the dataset have?

In [4]:
# Ask the hub for the split names of the IMDB dataset and list them, one quoted
# name per line, followed by the total number of splits.
splits: list[str] = ds.get_dataset_split_names('imdb')
print('Splits:', *(f"'{split_name}'" for split_name in splits), sep='\n')
print(f'Number of splits: {len(splits)}')

Splits:
'train'
'test'
'unsupervised'
Number of splits: 3


There are 3 splits in the IMDB dataset.

### Question 2
How big are these splits?

In [5]:
# Load one dataset per split name, then report the row count of each split.
datasets: list[ds.Dataset] = data.load_datasets(splits=splits)
print('Dataset sizes:')
for split_name, split_dataset in zip(splits, datasets):
    print(f"'{split_name}' split size : {split_dataset.num_rows}")

Found cached dataset imdb (/Users/francois.soulier/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/Users/francois.soulier/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/Users/francois.soulier/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Dataset sizes:
'train' split size : 25000
'test' split size : 25000
'unsupervised' split size : 50000


### Question 3
What is the proportion of each class on the supervised splits?

In [6]:
# Keep the first two datasets only (the supervised splits) and convert them to frames
supervised_datasets: list[pd.DataFrame] = data.datasets_to_dataframes(datasets[:2])

print('Supervised dataset sizes:')
# Report, split by split, how many samples carry each class label
for split_name, frame in zip(splits, supervised_datasets):
    print(f"'{split_name}'")
    for class_label in (0, 1):
        print(f'Class {class_label}')
        # Rows of the other class become NaN and are therefore not counted
        print(frame.where(frame['label'] == class_label).count())
    print('\n')

Supervised dataset sizes:
'train'
Class 0
text     12500
label    12500
dtype: int64
Class 1
text     12500
label    12500
dtype: int64


'test'
Class 0
text     12500
label    12500
dtype: int64
Class 1
text     12500
label    12500
dtype: int64




Hence, each class represents 50% of the supervised dataset (both in train and test samples).

## Naive Bayes classifier 

### Question 1
Create an adapted preprocessing function which lowercases the text and replaces punctuation with text:

#### Tiny test (preprocessing)

In [7]:
# Exercise the preprocessing helper on two small inputs. The second argument is
# presumably the expected preprocessed text — confirm against scripts/data.py.
data.test_preprocessing("Hello, ,,,World!::", "hello world")
data.test_preprocessing("Hello,        U.S.A!", "hello u.s.a")



  no_html = BeautifulSoup(text).get_text()


Now let's apply the preprocessing to the `text` field of our training and testing dataset.

In [8]:
# Unpack the two frames returned for the supervised splits. The order is presumably
# train first, then test, following `supervised_datasets` — verify in scripts/data.py.
train_df, test_df = data.processed_dataframes(supervised_datasets)



### Tokenize function
Function to cut processed text into tokens.

In [None]:
def tokenize(text: str) -> list:
    """
    Tokenizes the given text by splitting it on runs of non-word characters.
    Args:
        text (str): Text to tokenize (pre-processed)
    Returns:
        list: List of non-empty tokens, in order of appearance
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    # re.split yields '' when the text starts or ends with a separator (or is
    # empty); those are not words and must not end up in the vocabulary.
    return [token for token in re.split(r"\W+", text) if token]

### Create Dataframe for vocabulary
This function computes the vocabulary (the number of occurrences of each word) of a series of texts.

In [None]:
def build_vocabulary(texts_serie: pd.Series) -> Counter:
    """
    Builds the vocabulary of the given texts serie.
    Args:
        texts_serie (pd.Series): Serie of (pre-processed) texts
    Returns:
        Counter: Vocabulary, mapping each token to its number of occurrences
            across all texts (an empty Counter when the serie is empty)
    """
    # Start from an empty Counter so an empty serie yields an empty vocabulary
    # rather than None, which would break every caller indexing the result.
    vocabulary: Counter = Counter()
    for text in texts_serie:
        vocabulary.update(tokenize(text=text))
    return vocabulary

Let's compute the vocabulary and change the label type of the train data frame.

In [None]:
# Derive the training frame from the pre-processed `train_df` instead of relying
# on a `train_data_frame` left over in the kernel (it is not defined by any
# earlier cell). Labels are cast to str so they can be used as class names;
# `assign` returns a new frame, which keeps this cell idempotent on re-run.
train_data_frame: pd.DataFrame = train_df.assign(label=train_df.label.astype(str))

# Vocabulary over the whole training set, then one vocabulary per class label.
vocabulary: Counter = build_vocabulary(texts_serie=train_data_frame.text)
counter_class: pd.DataFrame = train_data_frame.groupby("label").agg({'text': build_vocabulary})

Function which returns the number of occurrences of a word in a specific class.

In [None]:
def word_count(counter_class: pd.DataFrame, class_name: str, word: str) -> int:
    """
    Looks up how often a word occurs in the vocabulary of one class.
    Args:
        counter_class (pd.DataFrame): DataFrame indexed by class label whose
            "text" column holds the vocabulary of each class
        class_name (str): Class name / label
        word (str): Word
    Returns:
        int: Number of occurrences of the given word in the given class
    """
    class_row = counter_class.loc[class_name]
    class_vocabulary = class_row["text"]
    return class_vocabulary[word]

## Implementation of naive Bayes classifier

Let's implement this pseudo-code:

![Alt text](nbc.png)

In [None]:
def total_words(vocabulary: Counter, c: str, counter_class: pd.DataFrame) -> int:
    """
    Returns the total number of words in a class for the given vocabulary.
    Args:
        vocabulary (Counter): Vocabulary
        c (str): Class name / label
        counter_class (pd.DataFrame): DataFrame with the vocabulary of each class
    Returns:
        int: Sum, over every word of the vocabulary, of its number of
            occurrences in the given class
    """
    # Fetch the class vocabulary once: the DataFrame row lookup does not depend
    # on the word, so repeating it for each vocabulary entry is wasted work.
    class_vocabulary = counter_class.loc[c]["text"]
    return sum(class_vocabulary[w] for w in vocabulary)

Function which assigns a given value to the loglikelihood dictionary at index `word, class_value`. The value represents the log-likelihood of a word for a specific class.

In [None]:
def fill_loglikelihood(loglikelihood: dict, word: str, class_value: str, value_to_affect: float) -> None:
    """
    Stores one value in the nested loglikelihood dictionary, in place.
    Args:
        loglikelihood (dict): Loglikelihood dictionary, keyed by word and then by class
        word (str): Word
        class_value (str): Class name / label
        value_to_affect (float): Value to store at loglikelihood[word][class_value]
    """
    per_class = loglikelihood.get(word)
    if per_class is None:
        # First value seen for this word (or a None placeholder): start a fresh mapping
        per_class = {}
        loglikelihood[word] = per_class
    per_class[class_value] = value_to_affect

In [None]:
def naive_bayes_classifier(train_data_frame: pd.DataFrame, vocabulary: Counter, counter_class: pd.DataFrame) -> tuple[dict, dict, Counter]:
    """
    Builds the Naive Bayes classifier (add-one smoothed word likelihoods).
    Args:
        train_data_frame (pd.DataFrame): Training data frame with "text" and "label" columns
        vocabulary (Counter): Vocabulary
        counter_class (pd.DataFrame): DataFrame with the vocabulary of each class
    Returns:
        tuple[dict, dict, Counter]: Tuple with the logprior (keyed by class),
            the loglikelihood (keyed by word, then by class) and the vocabulary
    """
    total_document_count: int = train_data_frame.text.count()
    class_label_set: list = list(train_data_frame.groupby("label").groups.keys())
    logprior: dict = {}
    loglikelihood: dict = {}

    for current_class in class_label_set:
        class_document_count: int = train_data_frame[train_data_frame.label == current_class].text.count()
        logprior[current_class] = np.log(class_document_count / total_document_count)

        # Denominator of the smoothed estimate: words of the class + one per vocabulary entry
        total: int = total_words(vocabulary, current_class, counter_class) + len(vocabulary)

        # Fetch the class vocabulary once per class instead of doing a DataFrame
        # row lookup for every single word of the vocabulary.
        class_vocabulary = counter_class.loc[current_class]["text"]

        for word in vocabulary:
            count_w_c = class_vocabulary[word] + 1
            log_like_value = np.log(count_w_c / total)
            fill_loglikelihood(loglikelihood, word, current_class, log_like_value)

    return logprior, loglikelihood, vocabulary

## Test Naive Bayes classifier

Here is the test function of our naive Bayes classifier, which is applied to a single test document.

In [None]:
def test_naive_bayes_classifier(testdoc: str, logprior: dict, loglikelihood: dict, train_data_frame: pd.DataFrame, vocabulary: Counter) -> str:
    """
    Predicts the class of one document with the Naive Bayes classifier.
    Args:
        testdoc (str): Test document
        logprior (dict): Logprior, keyed by class
        loglikelihood (dict): Loglikelihood, keyed by word and then by class
        train_data_frame (pd.DataFrame): Training data frame (its "label" column
            provides the set of candidate classes)
        vocabulary (Counter): Vocabulary
    Returns:
        The class with the highest score (logprior plus the loglikelihood of
        every in-vocabulary token); the first one wins on ties. None when the
        training frame has no label.
    """
    class_set: list = list(train_data_frame.groupby("label").groups.keys())

    # Tokenizing and filtering do not depend on the class: do them once per
    # document. Tokens with a zero count in the vocabulary are ignored.
    known_words: list = [w for w in tokenize(testdoc) if vocabulary[w] != 0]

    sums: dict = {}
    max_class = None

    for c in class_set:
        sums[c] = logprior[c]
        for w in known_words:
            sums[c] = sums[c] + loglikelihood[w][c]

        if max_class is None or sums[max_class] < sums[c]:
            max_class = c

    return max_class

### Process data
Make prediction on each text from `test_data_frame` and store them in `model_result`. 

In [None]:
# Derive the test frame from the pre-processed `test_df` instead of relying on a
# `test_data_frame` left over in the kernel (it is not defined by any earlier
# cell). Labels are cast to str so they compare equal to the predicted labels.
test_data_frame: pd.DataFrame = test_df.assign(label=test_df.label.astype(str))

logprior, loglikelihood, vocabulary = naive_bayes_classifier(train_data_frame, vocabulary, counter_class)


def test_nbc_function(text: str) -> str:
    """Predicts the class of one text with the classifier trained just above."""
    return test_naive_bayes_classifier(text, logprior, loglikelihood, train_data_frame, vocabulary)


# `assign` builds a new frame: the predictions land in `test_data_frame_2` only,
# and `test_data_frame` is left untouched for later cells.
test_data_frame_2: pd.DataFrame = test_data_frame.assign(
    model_result=test_data_frame.text.apply(test_nbc_function)
)

### Get results
We are now able to get the good predictions count, hence we can get an accuracy ratio.

In [None]:
# A prediction is good when the predicted label equals the true label
is_correct = test_data_frame_2.label == test_data_frame_2.model_result
good_predictions_count: int = test_data_frame_2[is_correct].label.count()
text_count: int = test_data_frame_2.text.count()

# Share of good predictions, printed as a percentage with two decimals
accuracy_ratio: float = good_predictions_count / text_count
print(f'Test accuracy : {(accuracy_ratio * 100):.2f}')

Test accuracy : 81.18


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
# Bag-of-words features: the vocabulary is learned on the training texts only,
# then reused as-is to encode the test texts.
vect = CountVectorizer()
X_train, y_train = vect.fit_transform(train_data_frame.text), train_data_frame.label
X_test, y_test = vect.transform(test_data_frame.text), test_data_frame.label

# scikit-learn multinomial Naive Bayes with additive smoothing alpha=50
clf = MultinomialNB(alpha=50.0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'{accuracy_score(y_test, y_pred)*100 = }')

accuracy_score(y_test, y_pred)*100 = 82.708


### Question 2

### Question 3

### Question 4

### Question 5

#### A revoir

L'hyperparamètre $\alpha$ permet de réguler le sur-apprentissage. En effet, si $\alpha$ est trop grand, le modèle va être trop régularisé et donc ne pas être capable de prédire correctement les données. Si $\alpha$ est trop petit, le modèle va être trop adapté aux données d'entrainement et donc ne pas être capable de prédire correctement les données de test. C'est un paramètre que l'on peut ajuster dans l'implémentation `scikit-learn`, mais pas notre propre implémentation.

### Question 6

Accuracy is a sufficient metric to measure the performance of our model. Indeed, the dataset is evenly distributed between the two classes, and the classes are well separated into positive and negative sentiments.