python -m venv venv_hackeurope
python -m ipykernel install --user --name=venv_hackeurope --display-name "Python (hackeurope)"

In [3]:
# !pip install --upgrade pip

# !pip install datasets evaluate transformers accelerate
# !pip install torch
# !pip install transformers[torch]
# !pip install scikit-learn
# !pip install matplotlib

# Processing the data (PyTorch)

In [4]:
from datasets import load_dataset

# Load the three pre-split JSON files (paths are relative to the working
# directory) into one DatasetDict with "train", "validation" and "test" splits.
raw_datasets = load_dataset(
    "json", 
    data_files={
        "train": "train.json", 
        "validation": "valid.json", 
        "test": "test.json"
    }
)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 8278 examples [00:03, 2412.63 examples/s]
Generating validation split: 2365 examples [00:00, 2511.37 examples/s]
Generating test split: 1183 examples [00:00, 1941.31 examples/s]


In [5]:
def preprocess_data(example):
    """Reduce one raw example to the ``text`` / ``label`` pair used for training.

    Text selection from ``example['tweet']``:
      * missing, ``None`` or otherwise falsy (empty list / empty string) -> ``""``
      * a string -> used unchanged
      * a list -> its first element converted with ``str()``, or ``""`` when
        that first element is ``None`` (later elements are ignored)
      * any other truthy value -> ``str(value)`` as a fallback

    Label handling: ``example['label']`` is converted with ``int()``; a missing
    or ``None`` label becomes ``0``.

    Returns:
        dict with keys ``'text'`` (str) and ``'label'`` (int).
    """
    tweet_field = example.get('tweet')

    if isinstance(tweet_field, list) and tweet_field:
        # Non-empty list: only the first entry is kept.
        head = tweet_field[0]
        text = "" if head is None else str(head)
    elif isinstance(tweet_field, str) and tweet_field:
        # Already a single non-empty string.
        text = tweet_field
    elif tweet_field:
        # Unexpected but truthy type: fall back to its string form.
        text = str(tweet_field)
    else:
        # None, empty list, empty string or any other falsy value.
        text = ""

    label_value = example.get('label', 0)
    label = 0 if label_value is None else int(label_value)

    return dict(text=text, label=label)

# Apply the preprocessing to every split. The function returns 'text' and
# 'label', so map() adds/overwrites those two columns on each example.
processed_datasets = raw_datasets.map(preprocess_data)

print(processed_datasets)

Map:   0%|          | 0/8278 [00:00<?, ? examples/s]

Map: 100%|██████████| 8278/8278 [00:05<00:00, 1557.12 examples/s]
Map: 100%|██████████| 2365/2365 [00:01<00:00, 1634.46 examples/s]
Map: 100%|██████████| 1183/1183 [00:00<00:00, 1367.15 examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'profile', 'tweet', 'neighbor', 'domain', 'label', 'text'],
        num_rows: 8278
    })
    validation: Dataset({
        features: ['ID', 'profile', 'tweet', 'neighbor', 'domain', 'label', 'text'],
        num_rows: 2365
    })
    test: Dataset({
        features: ['ID', 'profile', 'tweet', 'neighbor', 'domain', 'label', 'text'],
        num_rows: 1183
    })
})





In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint; the tokenizer is loaded from it so that its
# vocabulary matches any model later loaded from the same checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the ``text`` field of an example (or of a batch of examples).

    Sequences longer than 512 tokens are truncated; no padding is requested
    here, so the returned encodings keep their individual lengths.
    """
    encoding = tokenizer(example["text"], max_length=512, truncation=True)
    return encoding

# batched=True passes groups of examples to tokenize_function in a single call
# instead of one example at a time.
tokenized_datasets = processed_datasets.map(tokenize_function, batched=True)

# The tokenization above requests no padding; this collator is what pads the
# examples when they are assembled into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 8278/8278 [00:00<00:00, 11795.77 examples/s]
Map: 100%|██████████| 2365/2365 [00:00<00:00, 6288.80 examples/s]
Map: 100%|██████████| 1183/1183 [00:00<00:00, 4821.43 examples/s]


# FINE TUNING

In [7]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback

metric = evaluate.load("accuracy")


def compute_metrics(eval_preds):
    """Turn the ``(logits, labels)`` pair from evaluation into an accuracy score.

    The predicted class of each example is the index of its largest logit
    along the last axis; the result is whatever ``metric.compute`` returns for
    those predictions against the reference labels.
    """
    scores, gold_labels = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

from transformers import set_seed

# Seed Python, NumPy and PyTorch *before* the model is built. The sequence
# classification head does not exist in the pretrained checkpoint, so it is
# randomly initialised inside from_pretrained(); without a seed set at this
# point every kernel run starts from a different head and results are not
# reproducible.
SEED = 42
set_seed(SEED)

training_args = TrainingArguments(
    output_dir="twitter-bert-finetuned",
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Reload the best checkpoint once training finishes. No
    # metric_for_best_model is given, so the library default is used
    # (validation loss, per the TrainingArguments documentation).
    load_best_model_at_end=True,
    # Same seed as above, made explicit so the training-time shuffling/dropout
    # seed is visible next to the other hyper-parameters.
    seed=SEED,
)

# Update num_labels to match your new dataset (e.g., 2 for binary classification)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when the tracked metric has not improved for 2 consecutive
    # evaluations. With only 3 epochs this can fire at the last epoch at the
    # earliest, so it has little effect unless num_train_epochs is raised.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Start training
trainer.train()

Downloading builder script: 4.20kB [00:00, 6.36MB/s]
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 660.49it/s, Materializing param=bert.pooler.dense.weight]                               
[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/a

Epoch,Training Loss,Validation Loss,Accuracy
1,0.656236,0.629363,0.658351
2,0.601702,0.627048,0.675264
3,0.522889,0.647357,0.673996


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.39it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerN

TrainOutput(global_step=1554, training_loss=0.5901016814674956, metrics={'train_runtime': 464.0995, 'train_samples_per_second': 53.51, 'train_steps_per_second': 3.348, 'total_flos': 1133211093215040.0, 'train_loss': 0.5901016814674956, 'epoch': 3.0})

In [8]:
# NOTE(review): this is a second train() call on the same Trainer object. It
# appears to continue from the already fine-tuned weights rather than from the
# pretrained checkpoint (the recorded output shows training loss starting lower
# while validation loss climbs every epoch). Confirm this extra pass is
# intended; if not, this cell should not be executed before the test evaluation.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.542942,0.663769,0.652854
2,0.396394,0.823328,0.648203
3,0.26308,0.993744,0.65666


Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerN

TrainOutput(global_step=1554, training_loss=0.39444071721846535, metrics={'train_runtime': 484.9816, 'train_samples_per_second': 51.206, 'train_steps_per_second': 3.204, 'total_flos': 1133211093215040.0, 'train_loss': 0.39444071721846535, 'epoch': 3.0})

In [9]:
# Final evaluation on the held-out test split (the banner text is French for
# "Final evaluation on the TEST set").
print("--- Évaluation finale sur le jeu de TEST ---")

# evaluate() scores whatever weights the trainer currently holds, so the
# numbers depend on which training cells were executed beforehand.
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)

--- Évaluation finale sur le jeu de TEST ---


{'eval_loss': 0.7268869876861572, 'eval_accuracy': 0.6153846153846154, 'eval_runtime': 6.8239, 'eval_samples_per_second': 173.36, 'eval_steps_per_second': 10.844, 'epoch': 3.0}


In [None]:
# Write the trainer's current model and the tokenizer into the same directory
# so both can later be reloaded from a single path.
trainer.save_model("twitter-bert-finetuned-final")
tokenizer.save_pretrained("twitter-bert-finetuned-final")
# Confirmation message (French for "Final model saved successfully.").
print("Modèle final sauvegardé avec succès.")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]

Modèle final sauvegardé avec succès.



