In [1]:
from copy import deepcopy
from functools import partial
from typing import Callable, Dict, Generator, List, Tuple

from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

random_seed = 42

  from .autonotebook import tqdm as notebook_tqdm


# FastText


In [2]:
import fasttext
import string
import random

# Datasets and splits

In [3]:
dataset = load_dataset("imdb")
train_dataset = dataset["train"].train_test_split(
    stratify_by_column="label", test_size=0.2, seed=42
)
test_df = dataset["test"]
train_df = train_dataset["train"]
valid_df = train_dataset["test"]
train_df.shape, valid_df.shape, test_df.shape

Found cached dataset imdb (/home/pili/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 850.02it/s]
Loading cached split indices for dataset at /home/pili/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5f37fd0866e4f89f.arrow and /home/pili/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-dd5732a0e6ac784c.arrow


((20000, 2), (5000, 2), (25000, 2))

1. (2 points) Turn the dataset into a dataset compatible with Fastext (see the Tips on using FastText section a bit lower).
For pretreatment, only apply lower casing and punctuation removal.

In [17]:
# turn the data into fasttext format : __label__0/1 + text => __label__positive/negative + text
def format_label(label: int) -> str:
    """
        Return the label in the fasttext format.
    """
    if label == 0:
        return "__label__negative"
    return "__label__positive"

def preprocessingString(text: str) -> str:
    '''
        Preprocess a string. Remove punctuation and lower the text.
    '''
    text = text.lower().replace("<br />", " ")
    for punct in string.punctuation:
        if (not punct in str("-")):
            text = text.replace(punct, " ")
    return text

def fasttext_format(df: pd.DataFrame) -> List[str]:
    """
        Return a list of strings in the fasttext format.
    """
    return [format_label(label) + " " + preprocessingString(text) + "\n" for text, label in zip(df['text'], df['label'])]

def write_fasttext_file(df: pd.DataFrame, filename: str) -> None:
    """
        Write a file in the fasttext format.
    """
    with open(filename, "w") as f:
        l = fasttext_format(df)
        random.shuffle(l)
        # sort the labels to have positive first, then negative
        l.sort(key=lambda x: x.split()[0], reverse=True)
        f.writelines(l)

# write the files
write_fasttext_file(train_df, "train.txt")
write_fasttext_file(valid_df, "valid.txt")
write_fasttext_file(test_df, "test.txt")


# print the first line of the file to check
with open("train.txt", "r") as f:
    print(f.readlines()[0])


__label__positive sterling and younger brother try to survive on land  being squeezed by big cattlemen  when  rogue  brother preston arrives  a moral dilemma ensues  john  drew  barrymore steals the show as the younger  impressionable brother-barrymore shows signs here that he could have been an acting powerhouse  moves at a nice pace to an exciting climax 



2. (2 points) Train a FastText classifier with default parameters on the training data, and evaluate it on the test data using accuracy.


In [5]:
# train the model
model = fasttext.train_supervised(input="train.txt")

# print the accuracy
print(f"Accuracy on the training set: {model.test('train.txt')[1]}")
print(f"Accuracy on the test set: {model.test('test.txt')[1]}")

Read 4M words
Number of words:  86069
Number of labels: 2
Progress: 100.0% words/sec/thread: 3100510 lr:  0.000000 avg.loss:  0.401143 ETA:   0h 0m 0s

Accuracy on the training set: 0.8959
Accuracy on the test set: 0.8696


s


3. (2 points) Use the [hyperparameters search functionality](https://fasttext.cc/docs/en/autotune.html) of FastText and repeat step 2.
   * To do so, you'll need to [split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) your training set into a training and a validation set.
   * Let the model search for 5 minutes (it's the default search time).
   * Don't forget to shuffle (and stratify) your splits. The dataset has its entry ordered by label (0s first, then 1s). Feeding the classifier one class and then the second can mess with its performances.

In [6]:
# train using hyperparameters search
hyper_model = fasttext.train_supervised(input="train.txt", autotuneValidationFile="valid.txt")

# print the accuracy
print(f"Accuracy on the training set: {hyper_model.test('train.txt')[1]}")
print(f"Accuracy on the validation set: {hyper_model.test('valid.txt')[1]}")
print(f"Accuracy on the test set: {hyper_model.test('test.txt')[1]}")


Progress: 100.0% Trials:    9 Best score:  0.895800 ETA:   0h 0m 0s
Training again with best arguments
Read 4M words
Number of words:  86069
Number of labels: 2
Progress: 100.0% words/sec/thread: 1065783 lr:  0.000000 avg.loss:  0.036241 ETA:   0h 0m 0s


Accuracy on the training set: 1.0
Accuracy on the validation set: 0.896
Accuracy on the test set: 0.89584


4. (1 points) Look at the differences between the default model and the attributes found with hyperparameters search. How do the two models differ?
   * Only refer to the attributes you think are interesting.
   * See the _Tips on using FastText_ (just below) for help.

In [7]:
# check model attributes
# pretty print with padding
print("Model attributes vs Second model attributes:")
print("model lr:".ljust(20), str(model.lr).ljust(30), "| hyper_model lr:".ljust(30), hyper_model.lr)
print("model dim:".ljust(20), str(model.dim).ljust(30), "| hyper_model dim:".ljust(30), hyper_model.dim)
print("model epoch:".ljust(20), str(model.epoch).ljust(30), "| hyper_model epoch:".ljust(30), hyper_model.epoch)
print("model minCount:".ljust(20), str(model.minCount).ljust(30), "| hyper_model minCount:".ljust(30), hyper_model.minCount)
print("model minCountLabel:".ljust(20), str(model.minCountLabel).ljust(30), "| hyper_model minCountLabel:".ljust(30), hyper_model.minCountLabel)
print("model minn:".ljust(20), str(model.minn).ljust(30), "| hyper_model minn:".ljust(30), hyper_model.minn)
print("model maxn:".ljust(20), str(model.maxn).ljust(30), "| hyper_model maxn:".ljust(30), hyper_model.maxn)
print("model neg:".ljust(20), str(model.neg).ljust(30), "| hyper_model neg:".ljust(30), hyper_model.neg)
print("model wordNgrams:".ljust(20), str(model.wordNgrams).ljust(30), "| hyper_model wordNgrams:".ljust(30), hyper_model.wordNgrams)
print("model loss:".ljust(20), str(model.loss).ljust(30), "| hyper_model loss:".ljust(30), hyper_model.loss)


Model attributes vs Second model attributes:
model lr:            0.1                            | hyper_model lr:              0.08499425639667486
model dim:           100                            | hyper_model dim:             92
model epoch:         5                              | hyper_model epoch:           100
model minCount:      1                              | hyper_model minCount:        1
model minCountLabel: 0                              | hyper_model minCountLabel:   0
model minn:          0                              | hyper_model minn:            0
model maxn:          0                              | hyper_model maxn:            0
model neg:           5                              | hyper_model neg:             5
model wordNgrams:    1                              | hyper_model wordNgrams:      2
model loss:          loss_name.softmax              | hyper_model loss:            loss_name.softmax


The learning rate (0.1 vs 0.085)  and the dim (100 vs 92) differs a bit, maybe these affined values are best suited for the problem than the default ones although they are still close  
the second model trains on a lot more epoch (5 vs 100), wich increases the accuracy on the train set but it leads to a bit of overtraining

5. (1 point) Using the tuned model, take at least 2 wrongly classified examples from the test set, and try explaining why the model failed.

In [19]:
# take two wrong predictions
wrong_predictions = []
labels = []
with open("test.txt", "r") as f:
    for i, line in enumerate(f.readlines()):
        predicted_label = hyper_model.predict(line.split()[1:])[0][0][0]
        true_label = line.split()[0]
        if predicted_label != true_label:
            print("Line : ", line)
            print("Prediction : ", predicted_label, "\nLabel : ", true_label, "\n")

            wrong_predictions.append(line)
            labels.append(line.split()[0])
            if len(wrong_predictions) == 2:
                break

Line :  __label__positive an interesting companion piece to true documentaries of john c  holmes  unfortunately  it doesn t deal with what ultimately killed holmes  and it certainly could have benefited from doing so  burt reynolds and mark wahlberg got the most praise for this  but i felt the true stars were julianne moore as the cocaine-sniffing mother wannabe  don cheadle as a black man struggling with identity as pornstar stereo-salesman in some wild getups and william h  macy  who s wife is the ultimate slut  not to mention a nearly unrecognizable alfred molina  macy s new year s eve bash and cheadle s chance for a better life after a donut shop robbery gone wildly wrong are probably the two best scenes in the movie  or at least the two best shot  what this movie does best is show how power can easily corrupt in its various forms  however  none of the characters apparently learn anything from their dark downward spiral as they all rebound and return to their normal lives 

Predict

#### Because of the shuffle the results are not reprodictable

### Above we have two examples of wrong predictions :  
#### The first one :  
an interesting companion piece to true documentaries of john c  holmes  unfortunately  it doesn t deal with what ultimately killed holmes  and it certainly could have benefited from doing so  burt reynolds and mark wahlberg got the most praise for this  but i felt the true stars were julianne moore as the cocaine-sniffing mother wannabe  don cheadle as a black man struggling with identity as pornstar stereo-salesman in some wild getups and william h  macy  who s wife is the ultimate slut  not to mention a nearly unrecognizable alfred molina  macy s new year s eve bash and cheadle s chance for a better life after a donut shop robbery gone wildly wrong are probably the two best scenes in the movie  or at least the two best shot  what this movie does best is show how power can easily corrupt in its various forms  however  none of the characters apparently learn anything from their dark downward spiral as they all rebound and return to their normal lives

#### is labelled as negative but our code labelled it as positive, surely because the review is neutral

#### The second one :  
this is another film where the cinematography is the best thing to recommend it  that would be fine if the film were a travelogue  but as a dramatic exercise in cinematic artistry  that is not good enough  the theme of inter-species respect and co-operation ventures timidly into the forbidden world of inter-species love  but its approach is stereotypical  indicating a lack of understanding of the behavior motives of either species  as with many films  one always wonders what could have been achieved by a more innovative director and a more creative screenwriter  alas  we probably will never know 

#### is labelled as negative but our code labelled it as positive, it may be because the review starts with a simingly very positive sentence
￼Votre réponse


6. (Bonus point) Why is it likely that the attributes `minn` and `maxn` are at 0 after an hyperparameter search on our data?
   * Hint: on what language are we working?

In [9]:
print("model minn:".ljust(20), str(model.minn).ljust(30), "| hyper_model minn:".ljust(30), hyper_model.minn)
print("model maxn:".ljust(20), str(model.maxn).ljust(30), "| hyper_model maxn:".ljust(30), hyper_model.maxn)

model minn:          0                              | hyper_model minn:            0
model maxn:          0                              | hyper_model maxn:            0


The little we understood about maxn and minn is that they refers to the char n-grams, a char n-gram is a n-length sequence from a word, since we are working in english woth no defined length for words it may seems useless in our case and it may expalin why they stay at 0