In [1]:
from copy import deepcopy
from functools import partial
from typing import Callable, Dict, Generator, List, Tuple

from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

random_seed = 42

# FastText


In [2]:
import fasttext
import string
import random

In [3]:
dataset = load_dataset("imdb")
train_dataset = dataset["train"].train_test_split(
    stratify_by_column="label", test_size=0.2, seed=42
)
test_df = dataset["test"]
train_df = train_dataset["train"]
valid_df = train_dataset["test"]
train_df.shape, valid_df.shape, test_df.shape

Found cached dataset imdb (/home/pierre/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached split indices for dataset at /home/pierre/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5f37fd0866e4f89f.arrow and /home/pierre/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-dd5732a0e6ac784c.arrow


((20000, 2), (5000, 2), (25000, 2))

1. (2 points) Turn the dataset into a dataset compatible with Fastext (see the Tips on using FastText section a bit lower).
For pretreatment, only apply lower casing and punctuation removal.

In [4]:
# turn the data into fasttext format : __label__0/1 + text => __label__positive/negative + text
def format_label(label: int) -> str:
    """
        Return the label in the fasttext format.
    """
    if label == 0:
        return "__label__negative"
    return "__label__positive"

def preprocessingString(text: str) -> str:
    '''
        Preprocess a string. Remove punctuation and lower the text.
    '''
    text = text.lower().replace("<br />", " ")
    for punct in string.punctuation:
        if (not punct in str("-")):
            text = text.replace(punct, " ")
    return text

def fasttext_format(df: pd.DataFrame) -> List[str]:
    """
        Return a list of strings in the fasttext format.
    """
    return [format_label(label) + " " + preprocessingString(text) + "\n" for text, label in zip(df['text'], df['label'])]

def write_fasttext_file(df: pd.DataFrame, filename: str) -> None:
    """
        Write a file in the fasttext format.
    """
    with open(filename, "w") as f:
        l = fasttext_format(df)
        random.shuffle(l)
        # sort the labels to have positive first, then negative
        l.sort(key=lambda x: x.split()[0], reverse=True)
        f.writelines(l)

# write the files
write_fasttext_file(train_df, "train.txt")
write_fasttext_file(valid_df, "valid.txt")
write_fasttext_file(test_df, "test.txt")


# print the first line of the file to check
with open("train.txt", "r") as f:
    print(f.readlines()[0])


__label__positive after reading the other tepid reviews and comments  i felt i had to come to bat for this movie   roeg s films tend to have little to do with one another  and expecting this one to be like one of his you liked is probably off the mark   what this film is is a thoughtful and unabashed look at religious faith  the only other film like it-in terms of its religious message-would have to be tolkin s  the rapture    i am astonished that anyone could say the story is muddled or supernatural  it is a simple movie about catholic faith  miracles  and redemption--though you would never guess it till the end  it is also the only movie i can think of whose resolution turns  literally  on a pun   as a  happily  fallen catholic myself  i know what the movie is about  and i find a sort of fondness in its ultimate innocence about the relation between god and man  but if you are not familiar with the kind of theology on which the film is based  then it will go right over you head   as a

2. (2 points) Train a FastText classifier with default parameters on the training data, and evaluate it on the test data using accuracy.


In [5]:
# train the model
model = fasttext.train_supervised(input="train.txt")

# print the accuracy
print(f"Accuracy on the training set: {model.test('train.txt')[1]}")
print(f"Accuracy on the test set: {model.test('test.txt')[1]}")

Read 4M words
Number of words:  86069
Number of labels: 2
Progress: 100.0% words/sec/thread: 2654061 lr:  0.000000 avg.loss:  0.359210 ETA:   0h 0m 0s


Accuracy on the training set: 0.9028
Accuracy on the test set: 0.8756


3. (2 points) Use the [hyperparameters search functionality](https://fasttext.cc/docs/en/autotune.html) of FastText and repeat step 2.
   * To do so, you'll need to [split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) your training set into a training and a validation set.
   * Let the model search for 5 minutes (it's the default search time).
   * Don't forget to shuffle (and stratify) your splits. The dataset has its entry ordered by label (0s first, then 1s). Feeding the classifier one class and then the second can mess with its performances.

In [6]:
# train using hyperparameters search
hyper_model = fasttext.train_supervised(input="train.txt", autotuneValidationFile="valid.txt")

# print the accuracy
print(f"Accuracy on the training set: {hyper_model.test('train.txt')[1]}")
print(f"Accuracy on the validation set: {hyper_model.test('valid.txt')[1]}")
print(f"Accuracy on the test set: {hyper_model.test('test.txt')[1]}")


Progress: 100.0% Trials:    9 Best score:  0.895800 ETA:   0h 0m 0s
Training again with best arguments
Read 4M words
Number of words:  86069
Number of labels: 2
Progress: 100.0% words/sec/thread: 1215143 lr:  0.000000 avg.loss:  0.042706 ETA:   0h 0m 0s


Accuracy on the training set: 1.0
Accuracy on the validation set: 0.8972
Accuracy on the test set: 0.8936


4. (1 points) Look at the differences between the default model and the attributes found with hyperparameters search. How do the two models differ?
   * Only refer to the attributes you think are interesting.
   * See the _Tips on using FastText_ (just below) for help.

In [7]:
# check model attributes
# pretty print with padding
print("Model attributes vs Second model attributes:")
print("model lr:".ljust(20), str(model.lr).ljust(30), "| hyper_model lr:".ljust(30), hyper_model.lr)
print("model dim:".ljust(20), str(model.dim).ljust(30), "| hyper_model dim:".ljust(30), hyper_model.dim)
print("model epoch:".ljust(20), str(model.epoch).ljust(30), "| hyper_model epoch:".ljust(30), hyper_model.epoch)
print("model minCount:".ljust(20), str(model.minCount).ljust(30), "| hyper_model minCount:".ljust(30), hyper_model.minCount)
print("model minCountLabel:".ljust(20), str(model.minCountLabel).ljust(30), "| hyper_model minCountLabel:".ljust(30), hyper_model.minCountLabel)
print("model minn:".ljust(20), str(model.minn).ljust(30), "| hyper_model minn:".ljust(30), hyper_model.minn)
print("model maxn:".ljust(20), str(model.maxn).ljust(30), "| hyper_model maxn:".ljust(30), hyper_model.maxn)
print("model neg:".ljust(20), str(model.neg).ljust(30), "| hyper_model neg:".ljust(30), hyper_model.neg)
print("model wordNgrams:".ljust(20), str(model.wordNgrams).ljust(30), "| hyper_model wordNgrams:".ljust(30), hyper_model.wordNgrams)
print("model loss:".ljust(20), str(model.loss).ljust(30), "| hyper_model loss:".ljust(30), hyper_model.loss)


Model attributes vs Second model attributes:
model lr:            0.1                            | hyper_model lr:              0.08499425639667486
model dim:           100                            | hyper_model dim:             92
model epoch:         5                              | hyper_model epoch:           100
model minCount:      1                              | hyper_model minCount:        1
model minCountLabel: 0                              | hyper_model minCountLabel:   0
model minn:          0                              | hyper_model minn:            0
model maxn:          0                              | hyper_model maxn:            0
model neg:           5                              | hyper_model neg:             5
model wordNgrams:    1                              | hyper_model wordNgrams:      2
model loss:          loss_name.softmax              | hyper_model loss:            loss_name.softmax


The learning rate (0.1 vs 0.085)  and the dim (100 vs 92) differs a bit, maybe these affined values are best suited for the problem than the default ones although they are still close  
the second model trains on a lot more epoch (5 vs 100), wich increases the accuracy on the train set but it leads to a bit of overtraining

5. (1 point) Using the tuned model, take at least 2 wrongly classified examples from the test set, and try explaining why the model failed.

In [8]:
# take two wrong predictions
wrong_predictions = []
labels = []
with open("test.txt", "r") as f:
    for i, line in enumerate(f.readlines()):
        predicted_label = format_label(hyper_model.predict(line.split()[1:])[0][0])
        true_label = line.split()[0]
        if predicted_label != true_label:
            print("Line : ", line)
            print("Prediction : ", predicted_label, "\nLabel : ", true_label, "\n")

            wrong_predictions.append(line)
            labels.append(line.split()[0])
            if len(wrong_predictions) == 2:
                break

Line :  __label__negative to truly appreciate this film you had to be there  acting   or have been a crew member   yes  i am  selena   and at the ripe old age of 42  have serious doubts about what we were doing did   it all started out to be like a  john waters  type thing  friends acting badly in bad films  somewhere along the line the fun discontinued  people who were supposed to be friends didn t speak anymore  and bad became worse   i regret the bad image i might have projected  try to fit in size one gold spandex pants    other than that  the film sucks so badly  i would not even make my mama watch it   to my director  cast and crew i say    why can t we just all get along    it s been over twenty years  people                    

Prediction :  __label__positive 
Label :  __label__negative 

Line :  __label__negative i couldn t even sit through the whole thing  this movie was a piece of crap  i had more fun watching  dont  tell mom the babysitter s dead   it was just too painful 

# TODO : expliquer pk c'est des mauvaises predictions

6. (Bonus point) Why is it likely that the attributes `minn` and `maxn` are at 0 after an hyperparameter search on our data?
   * Hint: on what language are we working?

In [9]:
print("model minn:".ljust(20), str(model.minn).ljust(30), "| hyper_model minn:".ljust(30), hyper_model.minn)
print("model maxn:".ljust(20), str(model.maxn).ljust(30), "| hyper_model maxn:".ljust(30), hyper_model.maxn)

model minn:          0                              | hyper_model minn:            0
model maxn:          0                              | hyper_model maxn:            0
