In [1]:
from copy import deepcopy
from functools import partial
from typing import Callable, Dict, Generator, List, Tuple
from sklearn.model_selection import train_test_split

from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import torch
from torch import nn
from torchtext import vocab
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer

import math

from tqdm.auto import tqdm

dataset = load_dataset("imdb")


  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset imdb (/Users/nanditraore/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


# FastText

In [2]:
import fasttext as ft

1- Turn the dataset into a dataset compatible with Fastext

In [3]:
import string

def preprocess(data : dict) -> dict:
    '''Takes a document from a dataset, lowers the letters and
    replace all punctuations by spaces'''
    text = data["text"]
    for character in string.punctuation:
        if character != "-":
          text = text.replace(character, ' ')
    data["text"] = text.lower()
    return data

In [4]:
updated_dataset = dataset.map(preprocess)


Loading cached processed dataset at /Users/nanditraore/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-ab3a7119bbf33038.arrow
Loading cached processed dataset at /Users/nanditraore/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-4b4bf0f1f7f30d24.arrow
Loading cached processed dataset at /Users/nanditraore/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-ed3318b52d5380e8.arrow


In [5]:
def to_ft_dataset(dataset, file):
    with open(file, 'w') as f:
        for d in dataset:
            label = '__label__' + str(d['label'])
            text = d['text']
            f.write(label + ' ' + text + '\n')
    return file

2- Train a FastText classifier with default parameters

In [6]:
train_ft_file = to_ft_dataset(updated_dataset["train"], "ft_train_dataset.txt")
test_ft_file = to_ft_dataset(updated_dataset["test"], "ft_test_dataset.txt")

In [7]:
import random

with open("ft_train_dataset.txt", 'r') as f:
    train_lines = f.readlines()

# Shuffle the training data
shuffle(train_lines)
with open("ft_train_dataset.txt", 'w') as f:
    f.writelines(train_lines)

In [8]:
model = ft.train_supervised(input="ft_train_dataset.txt")

Read 6M words
Number of words:  96171
Number of labels: 2
Progress: 100.0% words/sec/thread: 2808106 lr:  0.000000 avg.loss:  0.327647 ETA:   0h 0m 0s


In [9]:
_, p, _ = model.test(test_ft_file)
print("L'accuracy est de:", p, "%")

L'accuracy est de: 0.8736 %


3- Use the hyperparameters search functionality of FastText

In [10]:

with open("ft_train_dataset.txt", 'r') as f:
    data = []
    labels = []
    for line in f:
        label, text = line.split(maxsplit=1)
        data.append(text)
        labels.append(label)

X_train, X_valid, Y_train, Y_valid = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

# Combine the data and labels into the format expected by FastText
train = [f"{label} {text.strip()}\n" for label, text in zip(Y_train, X_train)]
val = [f"{label} {text.strip()}\n" for label, text in zip(Y_valid, X_valid)]

# Save the training and validation data to separate files
with open('ft_train_dataset.txt', 'w') as f:
    f.writelines(train)
with open('ft_valid_dataset.txt', 'w') as f:
    f.writelines(val)


In [11]:
model_hyperparameters = ft.train_supervised(input='ft_train_dataset.txt', autotuneValidationFile='ft_valid_dataset.txt', autotuneDuration=300)


Aborting autotune...

Training again with best arguments
Read 4M words
Number of words:  86168
Number of labels: 2
Progress: 100.0% words/sec/thread: 2352775 lr:  0.000000 avg.loss:  0.163811 ETA:   0h 0m 0s


In [15]:
_, p, _ = model_hyperparameters.test("ft_valid_dataset.txt")
print("L'accuracy est de:", p, "%")

L'accuracy est de: 0.8962 %


4-  Look at the differences between the default model and the attributes found with hyperparameters search. How do the two models differ ?

5- Using the tuned model, take at least 2 wrongly classified examples from the test set, and try explaining why the model failed.

In [22]:
test_data = "ft_test_dataset.txt"
nb = 0
with open(test_data, 'r', encoding='utf-8') as f:
    for line in f:
        if (nb == 2):
            break
        line = line.strip().split('\t')
        text = line[0]
        label = line[1]
        prediction = model.predict(text)[0][0]
        if prediction != label:
            print("Texte:", text)
            print("Vrai label:", label)
            print("Prédiction:", prediction)
        nb += 1

IndexError: list index out of range