In [5]:
from datasets import load_dataset_builder
import datasets

ds_builder = load_dataset_builder("imdb")

The Dataset

How many split does the dataset has ?

In [6]:
nb_splits = len(ds_builder.info.splits)
print("The data set has {} splits !".format(nb_splits))

The data set has 3 splits !


How big are these splits ?

In [7]:
ds_builder.info.splits

{'train': SplitInfo(name='train', num_bytes=33432823, num_examples=25000, shard_lengths=None, dataset_name='imdb'),
 'test': SplitInfo(name='test', num_bytes=32650685, num_examples=25000, shard_lengths=None, dataset_name='imdb'),
 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106794, num_examples=50000, shard_lengths=None, dataset_name='imdb')}

The size of train split is 25000
The size of test split is 25000
The size of unsupervised split is 50000

What is the proportion of each class on the supervised splits ?

In [16]:
dataset = datasets.load_dataset('imdb')

train_dataset = dataset['train']
test_dataset = dataset['test']

train_size = len(train_dataset)
test_size = len(test_dataset)

train_pos = sum(train_dataset['label'])
train_neg = train_size - train_pos
test_pos = sum(test_dataset['label'])
test_neg = test_size - test_pos

# Print the results

print('Train size: {} Train pos: {} Train neg: {}'.format(train_size, train_pos, train_neg))
print('Test size: {} Test pos: {} Test neg: {}'.format(test_size, test_pos, test_neg))


Found cached dataset imdb (C:/Users/$0NT000-3G88GUGFR0B8/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 53.36it/s]

Train size: 25000 Train pos: 12500 Train neg: 12500
Test size: 25000 Test pos: 12500 Test neg: 12500





As we can see the dataset contains the column "text" (that contains string) and the colums "label" (that contains class "neg" or "pos") that indicate if it's a positive or a negative comment

Train split contains 12500 pos (50%) and 12500 neg (50%)
Test split contains 12500 pos (50%) and 12500 neg (50%)

Lower case the text and replace punctuation with space

In [8]:
def lowercase_punctuation(text):
    newtext = text.lower()
    # remove <br /> tags
    newtext = newtext.replace('<br />', '')
    # remove punctuation
    newtext = ''.join([c if c not in "!\"#$%&()*+,./:;<=>?@[\]^_`{|}~" else ' ' for c in newtext])
    return newtext


Implement your own naive Bayes classifier from scratch.

In [9]:
import numpy as np
from collections import Counter

def get_all_words(dataset):
    all_words = []
    for obj in dataset:
        all_words += lowercase_punctuation(obj["text"]).split()
    #remove double
    all_words = list(set(all_words))
    return all_words

def bigdoc(dataset, feature):
    bigdoc = []
    for obj in dataset:
        if (obj["label"] == feature):
            bigdoc += lowercase_punctuation(obj["text"]).split() # pre process and add all words of the document to the bigdoc
    return bigdoc

def train_naive_bayes_classifier_scratch(dataset, features):
    words = get_all_words(dataset)
    nbwords = len(words)
    logprior = np.zeros(len(features))
    loglikelihood = np.zeros((len(features), len(words)))
    ndoc = len(dataset)
    for feature in features:
        nc = dataset["label"].count(feature) # number of documents with class c
        logprior[feature] = np.log(nc/ndoc)
        bigd = bigdoc(dataset, feature)
        lgbigd = len(bigd)
        count = Counter(bigd)
        for i, word in enumerate(words):
            c = count[word]
            loglikelihood[feature, i] = np.log((c + 1)/(lgbigd + nbwords))
    return logprior, loglikelihood, words


train_dataset = dataset['train']
logprior, loglikelihood, words = train_naive_bayes_classifier_scratch(train_dataset, [0, 1])

In [10]:
def predict_string(test_string, logprior, loglikelihood, C, vocab_dict):
    sum = np.zeros(len(C))
    for c in C:
        sum[c] = logprior[c]
        target_word_list = lowercase_punctuation(test_string).split()
        for word in target_word_list:
            if word in vocab_dict:
                sum[c] = sum[c] + loglikelihood[c, vocab_dict[word]]
    return np.argmax(sum)

def test_naive_bayes_classifier_scratch(test_dataset, logprior, loglikelihood, C, V):
    word_dict = {word: i for i, word in enumerate(V)}
    total = len(test_dataset)
    correct = 0
    for obj in test_dataset:
        pred = predict_string(obj["text"], logprior, loglikelihood, C, word_dict)
        if pred == obj["label"]:
            correct += 1
    print("Accuracy: {}".format(correct/total))

test_dataset = dataset['test']
test_naive_bayes_classifier_scratch(test_dataset, logprior, loglikelihood, [0, 1], words)

Accuracy: 0.81504


 Implement a naive Bayes classifier using scikit-learn

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

train_dataset = dataset['train']

def train_naive_bayes_classifier_scikit(train_dataset):
    vectorizer = CountVectorizer(lowercase=True, stop_words="english")
    X_train = vectorizer.fit_transform(train_dataset["text"])
    model = MultinomialNB()
    model.fit(X_train, train_dataset["label"])
    pipeline = make_pipeline(vectorizer, model)
    return pipeline

pipeline = train_naive_bayes_classifier_scikit(train_dataset)



In [12]:
def test_naive_bayes_classifier_scikit(test_dataset, pipeline):
    correct = 0
    total = len(test_dataset)
    for obj in test_dataset:
        pred = pipeline.predict([obj["text"]])
        if pred == obj["label"]:
            correct += 1
    print("Accuracy: {}".format(correct/total))

test_dataset = dataset['test']
test_naive_bayes_classifier_scikit(test_dataset, pipeline)

Accuracy: 0.81968


Naive Bayes Classifier from scratch : accuracy = 0.815 (81.5% correct)
Naive Bayes Classifier SciKit : accuracy = 0.819 (81.9% correct)

The difference can be caused by the pre processing of the text. Scikit for example remove words that are useless like "the" or "and" for example. This can lead to better accuracy. Scikit also have differents features like the CoutnVectorizer that allows to get more context in each text. So Scikit is a bit more accurate.

In [14]:
def find_wrong_prediction_scratch(test_dataset, logprior, loglikelihood, C, V):
    i = 0
    word_dict = {word: i for i, word in enumerate(V)}
    for obj in test_dataset:
        pred = predict_string(obj["text"], logprior, loglikelihood, C, word_dict)
        if pred != obj["label"]:
            print("Text: {}".format(obj["text"]))
            print("Predicted: {}".format(pred))
            print("Actual: {}".format(obj["label"]))
            print("")
            i += 1
            if (i == 2):
                break

test_dataset = dataset['test']
find_wrong_prediction_scratch(test_dataset, logprior, loglikelihood, [0, 1], words)

Text: Isaac Florentine has made some of the best western Martial Arts action movies ever produced. In particular US Seals 2, Cold Harvest, Special Forces and Undisputed 2 are all action classics. You can tell Isaac has a real passion for the genre and his films are always eventful, creative and sharp affairs, with some of the best fight sequences an action fan could hope for. In particular he has found a muse with Scott Adkins, as talented an actor and action performer as you could hope for. This is borne out with Special Forces and Undisputed 2, but unfortunately The Shepherd just doesn't live up to their abilities.<br /><br />There is no doubt that JCVD looks better here fight-wise than he has done in years, especially in the fight he has (for pretty much no reason) in a prison cell, and in the final showdown with Scott, but look in his eyes. JCVD seems to be dead inside. There's nothing in his eyes at all. It's like he just doesn't care about anything throughout the whole film. And 

Both of those comments are quite lengthy and contain both positive and negative words. The Naive Bayes classifier cannot accurately interpret the context of the expressions, so it is normal that it fails.

## Stemming

In [15]:
import re
import nltk
import datasets
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 

nltk.download('punkt')

def stemming(dataset: datasets.arrow_dataset) -> datasets.arrow_dataset:
    re_word = re.compile(r"^\w+$")
    stemmer = SnowballStemmer("english")

    stemmed = [stemmer.stem(word) for word in word_tokenize(dataset['text'].lower()) if re_word.match(word)]
    dataset['text'] = " ".join(stemmed)

    return dataset

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\$0NT000-
[nltk_data]     3G88GUGFR0B8\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
stemmed_train = train_dataset.map(stemming)
stemmed_test = test_dataset.map(stemming)

Loading cached processed dataset at C:\Users\$0NT000-3G88GUGFR0B8\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-ec50cdf0016b7d38.arrow
                                                                  

In [18]:
trained_stem = train_naive_bayes_classifier_scikit(stemmed_train)
test_naive_bayes_classifier_scikit(stemmed_test, trained_stem)

Accuracy: 0.81244


Scikit without stemming : accuracy = 0.819 (81.9%)
Scikit with stemming : accuracy =  0.812 (81.2%)

The stemming is a process that reduce words to their root form by cutting prefixes and suffixes.
In some case stemming can lead to a significant loss of information or context ("found" or "finder" will become "find" for example). So this can decrease the model's accuracy.