Fastai with HuggingFace 🤗 Transformers (BERT, RoBERTa, XLNet, XLM, DistilBERT)

Fastai is a deep learning library built on top of PyTorch that aims to simplify the process of training models while providing high-level abstractions and powerful tools. It includes a variety of features for training models efficiently with minimal code. Below, I'll explain how to use Fastai for the same text classification task you've been working on with BERT, and then compare it with the Hugging Face Transformers approach you've already explored.

In [1]:
import fastai
import transformers
# Report the installed library versions so results can be tied to an environment.
for _label, _module in (
    ('fastai version :', fastai),
    ('transformers version :', transformers),
):
    print(_label, _module.__version__)

  from .autonotebook import tqdm as notebook_tqdm


fastai version : 2.7.18
transformers version : 4.45.2


In [2]:
import pandas as pd
import numpy as np
# Load the reviews CSV (relative path) into the DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer='../datasets/Reviews.csv')


In [3]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# Notebook-wide parameters.
seed = 42          # presumably the random seed for reproducible splits/training
bs = 16            # presumably the training batch size
use_fp16 = False   # presumably toggles mixed-precision (fp16) training

# Active checkpoint: architecture family and the matching pretrained weights.
model_type, pretrained_model_name = 'bert', 'bert-base-uncased'

# Alternative (model_type -> pretrained_model_name) pairs that can be swapped in:
#   'roberta'    -> 'roberta-base'
#   'distilbert' -> 'distilbert-base-uncased'
#   'xlm'        -> 'xlm-clm-enfr-1024'
#   'xlnet'      -> 'xlnet-base-cased'

In [6]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
import pandas as pd  # Assuming your data is in a pandas DataFrame

# Assumes the reviews DataFrame has already been loaded under the name `data`
# (for example: data = pd.read_csv('your_data.csv')).

# 1. Split the reviews into training and test sets (80% / 20%).
#    The split is driven by the notebook-level `seed` parameter so that
#    changing it in the parameters cell actually changes the split.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Text'], data['Score'], test_size=0.2, random_state=seed
)

# 2. Convert the pandas Series of texts to plain Python lists for the tokenizer.
train_texts = train_texts.tolist()
test_texts = test_texts.tolist()

# 3. Shift the labels down by one so they start at 0
#    (Score appears to run 1-5, matching num_labels=5 below; verify on the data).
train_labels = np.array(train_labels) - 1
test_labels = np.array(test_labels) - 1

# 4. Load the tokenizer and the BERT classifier with a 5-class head.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
model = BertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=5)

# 5. Tokenize both splits up front. Every review is truncated to at most
#    512 tokens (stated explicitly rather than relying on the tokenizer's
#    default) and padded to the longest sequence in its split.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# 6. Custom dataset pairing the tokenizer output with the labels.
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized reviews.

    `encodings` is a mapping from field name to an indexable container
    (one entry per review); `labels` is an indexable sequence of integer
    class ids of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick row `idx` out of every encoding field (values are indexed
        # as-is, no re-wrapping), then attach the label as a long tensor
        # under the 'labels' key.
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap the tokenized splits so the Trainer can index them sample by sample.
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

# 7. Configure the training arguments.
#    Batch size, seed and mixed precision come from the notebook-level
#    parameters (`bs`, `seed`, `use_fp16`) instead of being hard-coded, so
#    editing the parameters cell is enough to change the run.
training_args = TrainingArguments(
    output_dir='./results',            # Where to save the results
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=bs,    # Batch size for training
    per_device_eval_batch_size=64,     # Batch size for evaluation
    warmup_steps=500,                  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                 # Strength of weight decay
    logging_dir='./logs',              # Directory for storing logs
    logging_steps=10,                  # Log every 10 steps
    seed=seed,                         # Random seed for the training run
    fp16=use_fp16,                     # Mixed-precision training toggle
)

# 8. Initialise the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# 9. Train the model.
trainer.train()

# 10. Evaluate the model on the test split (last expression, so the notebook
#     displays the returned metrics).
trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 6/85269 [03:55<877:02:03, 37.03s/it] 

KeyboardInterrupt: 

In [24]:
from sklearn.metrics import accuracy_score, classification_report

# Run the trained model over the test split; `.predictions` holds its raw outputs.
predictions = trainer.predict(test_dataset)
# Predicted class = position of the largest output along axis 1.
pred_labels = np.asarray(predictions.predictions).argmax(axis=1)

# Overall accuracy against the true test labels.
accuracy = accuracy_score(y_true=test_labels, y_pred=pred_labels)
print(f'Accuracy: {accuracy:.4f}')

# Per-class precision, recall and F1.
print(classification_report(y_true=test_labels, y_pred=pred_labels))



100%|██████████| 1/1 [00:00<00:00, 1003.18it/s]

Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1






Pour intégrer Fastai avec Hugging Face dans un pipeline d'entraînement d'un modèle de classification de texte, on s'appuie sur l'API de Fastai et sur ses abstractions, comme les DataBlock, les DataLoaders et le Learner. Fastai offre des fonctionnalités avancées, telles que le fine-tuning progressif (gel puis dégel des couches), un meilleur contrôle de la gestion des données et une gestion simplifiée de l'entraînement avec des taux d'apprentissage ajustables (p. ex. fit_one_cycle). La cellule suivante montre comment combiner Fastai avec le modèle BERT de Hugging Face ; chaque étape y est commentée afin de mettre en évidence la valeur ajoutée de Fastai.

In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from fastai.text.all import *
import torch
import numpy as np
import pandas as pd

# Assumes the reviews DataFrame has already been loaded under the name `data`
# (for example: data = pd.read_csv('your_data.csv')).

# 1. Split the reviews into training and test sets (80% / 20%), seeded from
#    the notebook-level `seed` parameter rather than a hard-coded constant.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Text'], data['Score'], test_size=0.2, random_state=seed
)

# 2. Convert the pandas Series of texts to plain Python lists for the tokenizer.
train_texts = train_texts.tolist()
test_texts = test_texts.tolist()

# 3. Shift the labels down by one so they start at 0
#    (Score appears to run 1-5, matching num_labels=5 below; verify on the data).
train_labels = np.array(train_labels) - 1
test_labels = np.array(test_labels) - 1

# 4. Load the tokenizer and the BERT classifier with a 5-class head.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
model = BertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=5)

# 5. Tokenize both splits up front: truncate to at most 512 tokens and pad
#    each split to its longest sequence, returning PyTorch tensors.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt", max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt", max_length=512)

# 6. Expose the encodings as a standard PyTorch dataset.
class ReviewDataset(torch.utils.data.Dataset):
    """Indexable view over tokenized reviews and their labels.

    Each sample is a dict holding row `idx` of every field in `encodings`
    plus a 'labels' entry containing the class id as a long tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __getitem__(self, idx):
        # Slice every encoding field first, then add the target.
        sample = dict((name, column[idx]) for name, column in self.encodings.items())
        sample.update(labels=torch.tensor(self.labels[idx], dtype=torch.long))
        return sample

    def __len__(self):
        # One sample per label.
        return len(self.labels)

train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

# 7. Batch-assembly function that reshapes the dict samples for fastai.
def collate_fn(batch):
    """Stack a list of sample dicts into ``((input_ids, attention_mask), labels)``.

    Each element of `batch` must provide 'input_ids', 'attention_mask' and
    'labels' tensors; tensors of the same field must share a shape so they
    can be stacked along a new leading batch dimension.
    """
    def _stack(field):
        # Gather one field across all samples into a single tensor.
        return torch.stack([sample[field] for sample in batch])

    inputs = (_stack('input_ids'), _stack('attention_mask'))
    return inputs, _stack('labels')

# 8. Build fastai DataLoaders from the two PyTorch datasets.
#    - The default dl_type is used rather than SortedDL: SortedDL's default
#      length function indexes each sample positionally, which the dict
#      samples produced by ReviewDataset do not support, and the sequences
#      are already padded to a common length, so sorting would gain nothing.
#    - The batch size comes from the notebook-level `bs` parameter.
#    - The device is passed explicitly so batches (and, through the Learner,
#      the model) are placed on the GPU when one is available.
dls = DataLoaders.from_dsets(
    train_dataset,
    test_dataset,
    bs=bs,
    create_batch=collate_fn,
    device=default_device(),
)


class BertLogits(torch.nn.Module):
    """Adapter that lets a fastai Learner drive a Hugging Face classifier.

    The Learner calls the model with the batch inputs positionally and
    expects a plain tensor back. This wrapper unpacks the
    ``(input_ids, attention_mask)`` pair built by `collate_fn`, forwards it
    to the wrapped model by keyword, and returns only the logits.
    """

    def __init__(self, hf_model):
        super().__init__()
        self.hf_model = hf_model

    def forward(self, *inputs):
        # Accept both model((ids, mask)) and model(ids, mask).
        if len(inputs) == 1 and isinstance(inputs[0], (tuple, list)):
            inputs = inputs[0]
        input_ids, attention_mask = inputs
        output = self.hf_model(input_ids=input_ids, attention_mask=attention_mask)
        return output.logits


def _split_encoder_head(wrapped):
    """Return two parameter groups: the pretrained encoder, then the classification head."""
    hf_model = wrapped.hf_model
    return [
        list(hf_model.base_model.parameters()),
        list(hf_model.classifier.parameters()),
    ]


# 9. Create the fastai Learner around the wrapped BERT model.
#    The splitter gives freeze()/unfreeze() and the sliced learning rates
#    two groups to act on; without it there is a single group and freeze()
#    leaves every layer trainable. train_bn=False is meant to keep the
#    encoder's normalisation layers frozen as well (see fastai's Learner
#    documentation for `train_bn`).
learn = Learner(
    dls,
    BertLogits(model),
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy,
    splitter=_split_encoder_head,
    train_bn=False,
)
if use_fp16:
    # Honour the notebook-level mixed-precision switch.
    learn = learn.to_fp16()

# 10. First stage: freeze the encoder and train only the classification head.
learn.freeze()
learn.fit_one_cycle(3, 2e-5)

# 11. Second stage: unfreeze everything and fine-tune all layers, with the
#     lower learning rate on the encoder and the higher one on the head.
learn.unfreeze()
learn.fit_one_cycle(3, slice(1e-5, 2e-5))

# 12. Evaluate on the test set (last expression, so the notebook displays
#     the validation loss and metrics).
learn.validate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
