python -m venv venv_hackeurope
python -m ipykernel install --user --name=venv_hackeurope --display-name "Python (hackeurope)"

In [4]:
# !pip install --upgrade pip

# !pip install datasets evaluate transformers accelerate
# !pip install torch
# !pip install transformers[torch]
# !pip install scikit-learn
# !pip install matplotlib

# Processing the data (PyTorch)

In [5]:
from datasets import load_dataset

# Read the three local JSON files as the train / validation / test splits
# of a single dataset object.
raw_datasets = load_dataset(
    "json",
    data_files=dict(train="train.json", validation="dev.json", test="test.json"),
)

Generating train split: 8278 examples [00:02, 3453.13 examples/s]
Generating validation split: 2365 examples [00:00, 3419.26 examples/s]
Generating test split: 1183 examples [00:00, 3195.01 examples/s]


In [None]:
from datasets import DatasetDict

def explode_tweets(examples, max_tweets_per_user=50):
    """Flatten a batch of user records into one row per tweet.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` is a dict of
    equal-length columns, and the returned dict may contain a different number
    of rows than the input batch.

    Args:
        examples: Batch with the columns "ID", "tweet" and "label". Each
            "tweet" entry may be a list of tweets, a single string, or an
            empty / None value (such users are skipped entirely).
        max_tweets_per_user: Keep at most this many leading tweets per user to
            bound the size of the exploded dataset. ``None`` keeps every tweet.
            Defaults to 50.

    Returns:
        dict with the columns "ID", "text" and "label"; every kept tweet
        becomes one row carrying its author's ID and label.

    Raises:
        ValueError: If ``max_tweets_per_user`` is negative.
    """
    if max_tweets_per_user is not None and max_tweets_per_user < 0:
        # A negative bound would silently drop tweets from the end of the list.
        raise ValueError("max_tweets_per_user must be None or a non-negative integer")

    new_examples = {"ID": [], "text": [], "label": []}

    for user_id, tweets, label in zip(examples["ID"], examples["tweet"], examples["label"]):
        # Users without any tweet contribute no rows.
        if not tweets:
            continue

        # A lone string is treated as a one-tweet list (otherwise we would
        # iterate over its characters).
        if isinstance(tweets, str):
            tweets = [tweets]

        # NOTE: a missing label is mapped to class 0, so unlabeled users are
        # indistinguishable from class-0 users downstream.
        row_label = int(label) if label is not None else 0

        # The cap is applied before empty entries are filtered out, so a user
        # may end up with fewer than `max_tweets_per_user` rows.
        for tweet in tweets[:max_tweets_per_user]:
            if tweet:
                new_examples["ID"].append(user_id)
                new_examples["text"].append(str(tweet))
                new_examples["label"].append(row_label)

    return new_examples

# --- THIS IS WHERE THE DATA REDUCTION HAPPENS ---

# 1. Keep only the leading rows of each original split. The test split is
#    shrunk as well so that the final evaluation stays quick.
# NOTE(review): select(range(n)) takes the first n rows without shuffling --
# confirm the JSON files are not ordered in a way that biases the subset.
small_train, small_valid, small_test = (
    raw_datasets[split_name].select(range(row_count))
    for split_name, row_count in (("train", 4000), ("validation", 500), ("test", 500))
)

# 2. Bundle the subsets in a DatasetDict so the tokenization cell can treat
#    them exactly like the full dataset.
small_raw_datasets = DatasetDict(
    dict(train=small_train, validation=small_valid, test=small_test)
)

# 3. Explode users into per-tweet rows on the reduced dataset only; the
#    original per-user columns are dropped in the same pass.
processed_datasets = small_raw_datasets.map(
    explode_tweets,
    remove_columns=raw_datasets["train"].column_names,
    batched=True,
)

print(processed_datasets)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 4388.87 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 3629.04 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 4465.09 examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'label', 'text'],
        num_rows: 23371
    })
    validation: Dataset({
        features: ['ID', 'label', 'text'],
        num_rows: 4832
    })
    test: Dataset({
        features: ['ID', 'label', 'text'],
        num_rows: 4565
    })
})





In [7]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pretrained checkpoint whose tokenizer is loaded here; the same name is
# reused when the classification model is instantiated.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "text" column of a batch, truncating over-long inputs.

    No padding is requested here; sequences keep their own length.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)


tokenized_datasets = processed_datasets.map(tokenize_function, batched=True)

# Collator built from the same tokenizer; it is handed to the Trainer to
# assemble (and pad) the batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 23371/23371 [00:00<00:00, 26800.20 examples/s]
Map: 100%|██████████| 4832/4832 [00:00<00:00, 7599.36 examples/s]
Map: 100%|██████████| 4565/4565 [00:00<00:00, 24266.53 examples/s]


# FINE TUNING

In [None]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback

# Accuracy metric object shared by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_preds):
    """Turn a (logits, labels) pair into the accuracy metric result.

    The predicted class of each row is the index of its largest logit along
    the last axis.
    """
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

from transformers import set_seed

# Evaluate and checkpoint once per epoch so that the best epoch can be
# restored at the end and early stopping has a metric to watch.
# NOTE(review): one checkpoint is written per epoch and none is pruned;
# consider `save_total_limit` if disk space is a concern.
training_args = TrainingArguments(
    output_dir="twitter-bert-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Spelled out explicitly: this is the value the library falls back to when
    # load_best_model_at_end=True, and it is the metric that
    # EarlyStoppingCallback monitors.
    metric_for_best_model="loss",
    greater_is_better=False,
)

# The classification head is freshly (randomly) initialised by
# from_pretrained, so the RNGs must be seeded *before* the model is built for
# runs to be reproducible.
set_seed(training_args.seed)

# Two output classes: the labels produced by explode_tweets are binary.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 2 consecutive
    # evaluations (i.e. 2 epochs with the settings above).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

Downloading builder script: 4.20kB [00:00, 4.21MB/s]
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 684.57it/s, Materializing param=bert.pooler.dense.weight]                               
[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/a

Epoch,Training Loss,Validation Loss,Accuracy
1,0.527411,0.561895,0.716887


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer

TrainOutput(global_step=1461, training_loss=0.5332816957529892, metrics={'train_runtime': 666.2721, 'train_samples_per_second': 35.077, 'train_steps_per_second': 2.193, 'total_flos': 1109917028706420.0, 'train_loss': 0.5332816957529892, 'epoch': 1.0})

In [9]:
# NOTE(review): this is a second trainer.train() call on the same Trainer
# object as the previous cell. Presumably it continues from the weights left
# in `model` by the first run rather than from the pretrained checkpoint --
# confirm this is intended; otherwise the reported metrics depend on how many
# times this cell was executed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.410854,0.596709,0.721233


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.30it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer

TrainOutput(global_step=1461, training_loss=0.4238181822435404, metrics={'train_runtime': 664.1029, 'train_samples_per_second': 35.192, 'train_steps_per_second': 2.2, 'total_flos': 1109917028706420.0, 'train_loss': 0.4238181822435404, 'epoch': 1.0})

In [10]:
import pandas as pd
import numpy as np

print("--- Génération des prédictions sur le jeu de TEST ---")
# 1. Predict a class for every individual tweet of the test split.
predictions_output = trainer.predict(tokenized_datasets["test"])
tweet_predictions = np.argmax(predictions_output.predictions, axis=-1)

# 2. Collect author ID, per-tweet prediction and gold label in one DataFrame
#    so they can be aggregated per user.
df_test = pd.DataFrame({
    "ID": tokenized_datasets["test"]["ID"],
    "pred_tweet": tweet_predictions,
    "true_label": tokenized_datasets["test"]["label"],
})

# 3. Group by user ID and take a majority vote over that user's tweets.
user_predictions = df_test.groupby("ID").agg(
    # mode() returns the most frequent value(s) in sorted order; taking the
    # first one means a tie resolves to the smallest class id.
    majority_pred=("pred_tweet", lambda votes: votes.mode().iloc[0]),
    # Every tweet of a user carries the same gold label, so the first is enough.
    true_label=("true_label", "first"),
)

# 4. User-level accuracy: share of users whose majority vote matches the label.
correct_predictions = (user_predictions["majority_pred"] == user_predictions["true_label"]).sum()
total_users = len(user_predictions)
final_accuracy = correct_predictions / total_users

# A real newline: the original "\\n" printed a literal backslash-n.
print("\n✅ ÉVALUATION FINALE (Niveau Utilisateur - Vote Majoritaire)")
print(f"Total d'utilisateurs testés : {total_users}")
print(f"Précision (Accuracy) : {final_accuracy * 100:.2f}%")

--- Génération des prédictions sur le jeu de TEST ---


\n✅ ÉVALUATION FINALE (Niveau Utilisateur - Vote Majoritaire)
Total d'utilisateurs testés : 198
Précision (Accuracy) : 64.65%


In [11]:
# Write the model held by the trainer and the tokenizer into the same
# directory, then report success.
trainer.save_model("twitter-bert-finetuned-final")
tokenizer.save_pretrained("twitter-bert-finetuned-final")
print("Modèle final sauvegardé avec succès.")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.47it/s]

Modèle final sauvegardé avec succès.





In [None]:
# Force saving in the safetensors format into a dedicated folder.
dossier_sauvegarde = "mon_modele_safetensors_big"

# Model and tokenizer go to the same folder.
model.save_pretrained(dossier_sauvegarde, safe_serialization=True)
tokenizer.save_pretrained(dossier_sauvegarde)

print(f"✅ Modèle et Tokenizer sauvegardés proprement dans le dossier : {dossier_sauvegarde}")

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.86it/s]

✅ Modèle et Tokenizer sauvegardés proprement dans le dossier : mon_modele_safetensors





Pour prédire, utiliser :

In [None]:
# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# # Hugging Face automatically detects and loads the .safetensors file.
# # The folder name must match the one used when saving ("mon_modele_safetensors_big").
# loaded_model = AutoModelForSequenceClassification.from_pretrained("mon_modele_safetensors_big")
# loaded_tokenizer = AutoTokenizer.from_pretrained("mon_modele_safetensors_big")