In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import datasets as ds


## The dataset

### Question 1
How many splits does the dataset have?

In [33]:
# Query the available split names of the IMDB dataset and report how many there are.
splits = ds.get_dataset_split_names('imdb')
split_count = len(splits)
print(splits)
print(f'Number of splits: {split_count}')

['train', 'test', 'unsupervised']
Number of splits: 3


There are 3 splits in the IMDB dataset.

### Question 2
How big are these splits?

In [34]:
# Load the first three splits, in the order reported by `splits`,
# keeping the positional names that later cells rely on.
dataset_0, dataset_1, dataset_2 = (
    ds.load_dataset('imdb', split=split_name) for split_name in splits[:3]
)

Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/bastien/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [35]:
# Report the number of rows of each loaded split, pairing names and datasets by position.
for split_name, dataset in zip(splits, (dataset_0, dataset_1, dataset_2)):
    print(f'{split_name} split size : {dataset.num_rows}')

train split size : 25000
test split size : 25000
unsupervised split size : 50000


### Question 3
What is the proportion of each class on the supervised splits?

In [36]:
# Materialise the two supervised splits (dataset 0: train, dataset 1: test) as DataFrames.
train_data_frame = dataset_0.to_pandas()
test_data_frame = dataset_1.to_pandas()

labelled_frames = {"Train": train_data_frame, "Test": test_data_frame}
for position, (heading, frame) in enumerate(labelled_frames.items()):
    if position:
        # Blank separator between the train report and the test report.
        print("\n")
    print(heading)
    for class_id in (0, 1):
        # where() turns every row of another class into NaN, so count()
        # reports the number of rows belonging to `class_id` for each column.
        print(frame.where(frame['label'] == class_id).count())


Train
text     12500
label    12500
dtype: int64
text     12500
label    12500
dtype: int64


Test
text     12500
label    12500
dtype: int64
text     12500
label    12500
dtype: int64


Hence, each class represents 50% of each supervised split: 12,500 of the 25,000 reviews in both the train and the test split.

## Naive Bayes classifier 

### Question 1
Create an adapted processing function which lowercases the text and replaces punctuation with text:

In [37]:
import re
from bs4 import BeautifulSoup

def clean_html(text: str) -> str:
  """Strip HTML markup from ``text`` and return only its text content.

  Args:
    text: Raw string that may contain HTML tags.

  Returns:
    The text of the parsed document with the tags removed.
  """
  # Name the parser explicitly: when it is omitted BeautifulSoup picks
  # whichever parser is installed and emits GuessedAtParserWarning, so the
  # result could differ between machines. "html.parser" ships with Python.
  return BeautifulSoup(text, "html.parser").get_text()

def text_processing(text: str) -> str:
  """Normalise a raw text before tokenisation.

  The steps are applied in this order: HTML tags are stripped (through
  ``clean_html``), the text is lower-cased, punctuation is removed unless it
  sits between two ASCII letters (so ``u.s.a`` keeps its dots), surrounding
  whitespace is trimmed and every whitespace run is collapsed to one space.

  Args:
    text: Raw text, possibly containing HTML tags.

  Returns:
    The cleaned, lower-cased text.
  """
  result_text = clean_html(text).lower()
  # Matches a non-word, non-space character that is either not preceded or
  # not followed by an ASCII letter, i.e. punctuation is only kept when it
  # has a letter on both sides.
  pattern = r"(?<![a-zA-Z])[^\w\s]|[^\w\s](?![a-zA-Z])"
  result_text = re.sub(pattern, "", result_text).strip()
  # Raw string: "\s" in a plain string literal is an invalid escape sequence.
  return re.sub(r"\s+", " ", result_text)

# Tiny smoke test: HTML-free input, mixed case, repeated spaces and both
# kinds of punctuation (removed at word boundaries, kept between letters).
result = text_processing("Hello,        U.S.A!")
expected = 'hello u.s.a'
# Use the assertion message to report the mismatch instead of `or print(...)`,
# which printed as a side effect and raised an AssertionError without details.
assert result == expected, f"expected {expected!r}, got {result!r}"

Now let's apply the `text_processing` function to the `text` column of our training dataframe.

In [38]:
# Replace every training review with its cleaned version (see `text_processing`).
train_data_frame["text"] = train_data_frame["text"].apply(text_processing)

## Get Vocabulary

### Tokenize function

In [42]:
def tokenize(text: str)-> list:
    """Split ``text`` into word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscore); everything else acts as a separator.

    Args:
        text: The string to tokenize.

    Returns:
        The tokens in order of appearance. The list is empty when ``text``
        contains no word character; no empty-string token is ever produced.
    """
    # findall on word runs instead of split on separators: splitting yields
    # '' tokens when the text is empty or starts/ends with a separator, which
    # would add a bogus '' entry to the vocabulary. Raw string avoids the
    # invalid "\W" escape of the plain literal.
    return re.findall(r"\w+", text)

### Create Dataframe for vocabulary

In [49]:
from collections import Counter

# Count how often each token occurs across all training reviews.
# Start from an empty counter and fold in every row: seeding it with
# `text[0]` was a label-based lookup that fails on an empty frame or on a
# frame whose index does not contain the label 0.
vocabulary = Counter()
for text in train_data_frame.text:
    vocabulary.update(tokenize(text=text))

Counter({'i': 86682,
         'rented': 337,
         'am': 2774,
         'curious': 261,
         'yellow': 106,
         'from': 20434,
         'my': 12419,
         'video': 1721,
         'store': 513,
         'because': 9015,
         'of': 145775,
         'all': 23814,
         'the': 335404,
         'controversy': 51,
         'that': 73078,
         'surrounded': 133,
         'it': 95638,
         'when': 14104,
         'was': 48136,
         'first': 9021,
         'released': 980,
         'in': 93726,
         '1967': 46,
         'also': 9112,
         'heard': 1109,
         'at': 23437,
         'seized': 10,
         'by': 22499,
         'u': 475,
         's': 61171,
         'customs': 34,
         'if': 16625,
         'ever': 5962,
         'tried': 771,
         'to': 135616,
         'enter': 195,
         'this': 75436,
         'country': 930,
         'therefore': 332,
         'being': 6600,
         'a': 162772,
         'fan': 1901,
         'films': 

In [54]:
# Vocabulary size: the number of distinct tokens held by the counter.
len(vocabulary)

90313