In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: CUDA when a GPU is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the annotated tweets for one match and build the train/eval splits.
path_to_data = "../../challenge_data/"
path_to_training_tweets = path_to_data + "train_tweets/AustraliaNetherlands29.csv"
df = pd.read_csv(path_to_training_tweets)
# Keep only the tweet text and its EventType target.
df = df[['Tweet', 'EventType']]
# A fixed random_state keeps the 80/20 split reproducible across runs.
train_data, eval_data = train_test_split(df, test_size=0.2, random_state=42)
# preserve_index=False: after the shuffled split the pandas index is no longer
# a plain RangeIndex, and from_pandas would otherwise store it as an extra
# `__index_level_0__` column in each Dataset.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_data, preserve_index=False)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')


def tokenize_function(examples):
    """Tokenize the `Tweet` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples have the same length.
    """
    tweets = examples['Tweet']
    return tokenizer(
        tweets,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Tokenize both splits batch by batch.
train_dataset = train_dataset.map(function=tokenize_function, batched=True)
eval_dataset = eval_dataset.map(function=tokenize_function, batched=True)

Map: 100%|██████████| 77467/77467 [00:12<00:00, 5995.56 examples/s]
Map: 100%|██████████| 19367/19367 [00:03<00:00, 5963.09 examples/s]


In [5]:
def _event_type_as_label(example):
    """Return a `label` field holding the example's `EventType` value."""
    return {'label': example['EventType']}


# Add a `label` column (a copy of `EventType`) to both splits.
train_dataset = train_dataset.map(_event_type_as_label)
eval_dataset = eval_dataset.map(_event_type_as_label)

Map: 100%|██████████| 77467/77467 [00:10<00:00, 7295.66 examples/s] 
Map: 100%|██████████| 19367/19367 [00:02<00:00, 7687.86 examples/s]


In [6]:
# Convert the `label` column of each split into class-label ids.
# NOTE(review): the two splits are encoded independently; confirm that both
# contain the same set of EventType values so they share one label-to-id mapping.
label_column = "label"
train_dataset = train_dataset.class_encode_column(label_column)
eval_dataset = eval_dataset.class_encode_column(label_column)

Stringifying the column: 100%|██████████| 77467/77467 [00:00<00:00, 244892.10 examples/s]
Casting to class labels: 100%|██████████| 77467/77467 [00:00<00:00, 237425.17 examples/s]
Stringifying the column: 100%|██████████| 19367/19367 [00:00<00:00, 260265.12 examples/s]
Casting to class labels: 100%|██████████| 19367/19367 [00:00<00:00, 244372.37 examples/s]


In [7]:
# Load BERTweet with a sequence-classification head that has one output per
# distinct EventType value found in the training split.
num_event_types = len(train_data['EventType'].unique())
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base',
    num_labels=num_event_types,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Move the model to the selected device (the GPU when one is available).
# As the last expression of the cell, the returned module is displayed as output.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [9]:
# Define the training arguments.
training_args = TrainingArguments(
    # Enable mixed-precision training only when a CUDA device is present, so
    # the configuration is still valid on the CPU fallback chosen earlier in
    # the notebook (requesting fp16 without a supported accelerator is rejected).
    fp16=torch.cuda.is_available(),
    save_steps=1000,          # write a checkpoint every 1000 optimizer steps
    output_dir='./results',   # checkpoints and trainer state are written here
    eval_strategy="epoch",    # evaluate on the eval split at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [10]:
# Build the Trainer from the model, the training configuration and both splits.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
}
trainer = Trainer(**trainer_inputs)

In [11]:
# Fine-tune the model; the value returned by train() is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4922,0.491635


TrainOutput(global_step=4842, training_loss=0.5042790314817369, metrics={'train_runtime': 679.4467, 'train_samples_per_second': 114.015, 'train_steps_per_second': 7.126, 'total_flos': 5095606031393280.0, 'train_loss': 0.5042790314817369, 'epoch': 1.0})

In [15]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
finetuned_dir = "./finetuned_bertweet"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

('./finetuned_bertweet/tokenizer_config.json',
 './finetuned_bertweet/special_tokens_map.json',
 './finetuned_bertweet/vocab.txt',
 './finetuned_bertweet/bpe.codes',
 './finetuned_bertweet/added_tokens.json')