In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import datasets as ds
from collections import Counter

from scripts import data
from scripts.naive_bayes import from_scratch as naive_bayes
from scripts.naive_bayes import scikit_learn as sk_naive_bayes

  from .autonotebook import tqdm as notebook_tqdm


# The dataset

### Question 1
How many splits does the dataset have?

In [2]:
# Look up the split names published for the 'imdb' dataset.
splits: list[str] = ds.get_dataset_split_names('imdb')

print('Splits:')
for split_name in splits:
    # Each name is shown wrapped in single quotes, one per line
    print("'" + split_name + "'")
print(f'Number of splits: {len(splits)}')

Splits:
'train'
'test'
'unsupervised'
Number of splits: 3


There are 3 splits in the IMDB dataset.

### Question 2
How big are these splits?

In [3]:
# Load one dataset per split name through the project helper.
datasets: list[ds.Dataset] = data.load_datasets(splits=splits)

print('Dataset sizes:')
for position, split_dataset in enumerate(datasets):
    # `datasets` is indexed in parallel with `splits`
    split_name = splits[position]
    print(f"'{split_name}' split size : {split_dataset.num_rows}")

Found cached dataset imdb (/Users/francois.soulier/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/Users/francois.soulier/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/Users/francois.soulier/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Dataset sizes:
'train' split size : 25000
'test' split size : 25000
'unsupervised' split size : 50000


### Question 3
What is the proportion of each class on the supervised splits?

In [4]:
# Keep the first two splits ('train' and 'test'), which carry labels,
# and convert them to pandas data frames.
supervised_datasets: list[pd.DataFrame] = data.datasets_to_dataframes(datasets[0:2])

print('Supervised dataset sizes:')
# Report, split by split, how many rows belong to each class
for position, frame in enumerate(supervised_datasets):
    print(f"'{splits[position]}'")
    for class_id in (0, 1):
        print(f'Class {class_id}')
        # Rows of the other class become NaN under `where`, so `count` skips them
        belongs_to_class = frame['label'] == class_id
        print(frame.where(belongs_to_class).count())
    print('\n')

Supervised dataset sizes:
'train'
Class 0
text     12500
label    12500
dtype: int64
Class 1
text     12500
label    12500
dtype: int64


'test'
Class 0
text     12500
label    12500
dtype: int64
Class 1
text     12500
label    12500
dtype: int64




Hence, each class represents 50% of the supervised dataset (both in train and test samples).

# Naive Bayes classifier 

### Question 1

#### Preprocessing (test)

In [5]:
# Smoke-test the preprocessing helper on two hand-written samples.
# NOTE(review): presumably the first argument is the raw text and the second the
# expected preprocessed text — confirm against `data.test_preprocessing`.
data.test_preprocessing("Hello, ,,,World!::", "hello world")
data.test_preprocessing("Hello,        U.S.A!", "hello u.s.a")



  no_html = BeautifulSoup(text).get_text()


Now let's apply the preprocessing to the `text` field of our training and testing datasets.

In [6]:
# Run the project's preprocessing over the supervised frames and unpack the result.
# NOTE(review): the (train, test) order is assumed to follow `supervised_datasets` — confirm.
train_df, test_df = data.processed_dataframes(supervised_datasets)

  no_html = BeautifulSoup(text).get_text()


### Question 2

Let's build the vocabulary and change the label type of the train data frame.

In [7]:
# Vocabulary built over every training text
vocabulary: Counter = naive_bayes.build_vocabulary(texts_serie=train_df['text'])

# Labels are handled as strings from this point on
train_df['label'] = train_df['label'].astype(str)

# One vocabulary per class: group the training texts by label and aggregate each group
texts_by_label = train_df.groupby("label")
counter_class: pd.DataFrame = texts_by_label.agg({'text': naive_bayes.build_vocabulary})

### Naive Bayes classifier pseudo-code

<img src="./nbc.png" width="30%" height="20%">

### 'From scratch' Naive Bayes classifier

#### Predictions
Make a prediction for each text of the training and testing sets and store it in a `'model_result'` column of the corresponding pandas dataframe.

In [8]:
# Cast the test labels to strings, as was done for the training labels.
test_df.label = test_df.label.astype(str)

# Fit the from-scratch classifier.
# NOTE(review): `vocabulary` is rebound to the value returned by the classifier, so
# re-running this cell feeds the returned vocabulary back in — confirm this is idempotent.
logprior, loglikelihood, vocabulary = naive_bayes.classifier(train_df, vocabulary, counter_class)


def test_classifier(text: str):
    """Predict the class of a single text with the fitted from-scratch model.

    Thin wrapper that binds the fitted parameters (`logprior`, `loglikelihood`,
    `vocabulary`) and `train_df`, so the function can be handed to `Series.apply`.
    """
    return naive_bayes.test_classifier(text, logprior, loglikelihood, train_df, vocabulary)


# Store one prediction per text on both splits
train_df["model_result"] = train_df.text.apply(test_classifier)
test_df["model_result"] = test_df.text.apply(test_classifier)

### Question 4 ('From scratch' implementation)

#### Results
We can now count the correct predictions, and from that count derive the accuracy.

In [9]:
# Report the from-scratch classifier's accuracy on each split.
# The second argument is the split name used as a prefix in the printed line.
naive_bayes.display_results(train_df, "Train")
naive_bayes.display_results(test_df, "Test")

Train accuracy: 89.84%
Test accuracy: 81.18%


### Question 3

In [10]:
# Rebind `train_df` / `test_df` to freshly preprocessed frames before the scikit-learn
# experiment; the previous bindings carried string labels and a `model_result` column.
# NOTE(review): assumes `processed_dataframes` returns new frames rather than the
# already-modified ones — confirm.
train_df, test_df = data.processed_dataframes(supervised_datasets)



  no_html = BeautifulSoup(text).get_text()
  no_html = BeautifulSoup(text).get_text()


In [11]:
# Build the scikit-learn pipeline with the helper's default settings
pipeline = sk_naive_bayes.pipeline()

# Fit on the training texts and their labels
pipeline.fit(train_df['text'], train_df['label'])

# Predict on the training split first, then on the test split
train_predictions, test_predictions = (
    pipeline.predict(frame['text']) for frame in (train_df, test_df)
)

### Question 4 (scikit-learn implementation)

In [12]:
# Classification report for the test split
print('Classification report (testing):')
sk_naive_bayes.print_classification_report(test_df, test_predictions)

# Confusion matrix for the test split
print('Confusion matrix (testing):')
sk_naive_bayes.print_confusion_matrix(test_df, test_predictions)
print('\n')

# Accuracy score on both splits, training first
for split_label, frame, predictions in (
    ('Training', train_df, train_predictions),
    ('Testing', test_df, test_predictions),
):
    sk_naive_bayes.print_accuracy_score(frame, predictions, split_label)

Classification report (testing):
              precision    recall  f1-score   support

           0       0.78      0.88      0.83     12500
           1       0.86      0.76      0.80     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

Confusion matrix (testing):
[[10969  1531]
 [ 3052  9448]]


Training accuracy: 90.32%
Testing accuracy: 81.67%


### Question 5 (FIXME)

TODO: Revisit this answer (check the `CountVectorizer` documentation).

The hyperparameter $\alpha$ controls overfitting. If $\alpha$ is too large, the model is over-regularized and can no longer predict the data correctly. If $\alpha$ is too small, the model fits the training data too closely and fails to predict the test data correctly. It is a parameter that can be tuned in the `scikit-learn` implementation, but not in our own implementation.

### Question 6

Accuracy is a sufficient metric to measure the performance of our model. Indeed, the dataset is evenly balanced between the two classes, and the reviews are clearly separated into positive and negative sentiments.

### Question 7 (TODO)

### Question 8

#### i - Highest likelihood

Look at the words with the highest likelihood in each class (if you use scikit-learn, you want to check feature_log_prob_).

In [13]:
# Print, for each class, the words with the highest likelihood according to the
# currently fitted `pipeline` (at this point, the one trained without a stop-word list).
sk_naive_bayes.print_most_likely_words(pipeline)

Class:  0
Most likely words:  ['was', 'that', 'this', 'in', 'it', 'is', 'to', 'of', 'and', 'the']

Class:  1
Most likely words:  ['as', 'this', 'that', 'it', 'in', 'is', 'to', 'of', 'and', 'the']



#### ii - Stopwords removal

##### Download stopwords

In [14]:
# NOTE(review): this import sits mid-notebook and later cells rely on it;
# consider moving it to the first (imports) cell.
import nltk
# Fetch the NLTK 'stopwords' package; the call's return value is the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/francois.soulier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### Training and testing

In [15]:
# Rebuild the pipeline, this time handing it the project's stop-word list
stop_words: list[str] = sk_naive_bayes.get_stopwords()
pipeline = sk_naive_bayes.pipeline(stop_words=stop_words)

# Fit on the training texts and their labels
pipeline.fit(train_df['text'], train_df['label'])

# Predict on the training split first, then on the test split
train_predictions = pipeline.predict(train_df['text'])
test_predictions = pipeline.predict(test_df['text'])

# Accuracy score on both splits, training first
accuracy_inputs = (
    (train_df, train_predictions, 'Training'),
    (test_df, test_predictions, 'Testing'),
)
for frame, predictions, split_label in accuracy_inputs:
    sk_naive_bayes.print_accuracy_score(frame, predictions, split_label)

Training accuracy: 91.91%
Testing accuracy: 82.74%


# Stemming and Lemmatization

### Question 1

#### Download wordnet

In [19]:
# Fetch the NLTK 'wordnet' package; relies on `nltk` having been imported in the
# stop-words download cell.
# NOTE(review): presumably required by the lemmatizing preprocessing — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/francois.soulier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

The new preprocessing function is implemented in './scripts/naive_bayes/scikit_learn.py'. The text preprocessing now includes lemmatization.

### Question 2 (FIXME)

In [20]:
# Apply the lemmatizing preprocessing to the `text` column of each split.
# The result goes into NEW frames: rebinding `train_df` / `test_df` to the Series
# returned by `.apply` would drop the `label` column (and make `train_df.text` fail),
# and writing into new names keeps this cell safe to re-run.
# NOTE(review): assumes `sk_naive_bayes.preprocess` takes one text and returns the
# processed text — confirm against scripts/naive_bayes/scikit_learn.py.
train_lemma_df = train_df.assign(text=train_df.text.apply(sk_naive_bayes.preprocess))
test_lemma_df = test_df.assign(text=test_df.text.apply(sk_naive_bayes.preprocess))

pipeline = sk_naive_bayes.pipeline(stop_words=stop_words)

# Train the model on the lemmatized training set
pipeline.fit(train_lemma_df.text, train_lemma_df.label)

# Make predictions on the lemmatized train and test sets
train_predictions = pipeline.predict(train_lemma_df.text)
test_predictions = pipeline.predict(test_lemma_df.text)

# Print the accuracy score (training)
sk_naive_bayes.print_accuracy_score(train_lemma_df, train_predictions, 'Training')
# Print the accuracy score (testing)
sk_naive_bayes.print_accuracy_score(test_lemma_df, test_predictions, 'Testing')

### Question 3 (TODO)