python -m venv venv_hackeurope
python -m ipykernel install --user --name=venv_hackeurope --display-name "Python (hackeurope)"

In [16]:
# !pip install --upgrade pip

# !pip install datasets evaluate transformers accelerate
# !pip install torch
# !pip install transformers[torch]
# !pip install scikit-learn
# !pip install matplotlib

# Processing the data (PyTorch)

In [17]:
from datasets import load_dataset

# Local JSON file backing each split; `raw_datasets` is indexed by these
# split names ("train", "validation", "test") in the cells below.
split_files = {
    "train": "train.json",
    "validation": "dev.json",
    "test": "test.json",
}
raw_datasets = load_dataset("json", data_files=split_files)

In [None]:
from datasets import DatasetDict

def explode_tweets(examples, max_tweets_per_user=25):
    """Flatten a batch of users into one row per tweet.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` is a dict of
    equal-length columns, and the returned dict may contain a different
    number of rows than the input batch.

    Args:
        examples: Batch with the columns ``"ID"``, ``"tweet"`` and ``"label"``.
            A ``"tweet"`` entry may be a list of tweets, a single string, or
            empty/``None`` (that user is then skipped entirely).
        max_tweets_per_user: Keep at most this many leading tweets per user,
            to bound training time. ``None`` keeps every tweet.

    Returns:
        A dict with the columns ``"ID"``, ``"text"`` and ``"label"``; every
        tweet row repeats its author's ID and integer label.

    Raises:
        ValueError: If ``max_tweets_per_user`` is negative.
    """
    if max_tweets_per_user is not None and max_tweets_per_user < 0:
        raise ValueError("max_tweets_per_user must be >= 0 or None")

    new_examples = {"ID": [], "text": [], "label": []}

    for user_id, tweets, label in zip(examples["ID"], examples["tweet"], examples["label"]):
        if not tweets:
            continue

        # A lone string is treated as a one-tweet list so it is not iterated
        # character by character.
        if isinstance(tweets, str):
            tweets = [tweets]

        # The cap is applied before blank tweets are dropped, so blank entries
        # among the first N still count towards the limit.
        for tweet in tweets[:max_tweets_per_user]:
            if tweet:
                new_examples["ID"].append(user_id)
                new_examples["text"].append(str(tweet))
                # A missing label falls back to class 0.
                new_examples["label"].append(int(label) if label is not None else 0)

    return new_examples

# --- Dataset reduction happens here ---

def _first_rows(dataset, max_rows):
    """Return the first ``max_rows`` rows of ``dataset`` (all rows if it is shorter)."""
    # Clamping avoids an out-of-range selection when a split holds fewer
    # rows than the requested subset size.
    return dataset.select(range(min(max_rows, len(dataset))))

# 1. Shrink the original splits (counts are users, before tweets are exploded).
small_train = _first_rows(raw_datasets["train"], 1000)
small_valid = _first_rows(raw_datasets["validation"], 200)
small_test = _first_rows(raw_datasets["test"], 200)  # test is reduced too, to keep evaluation fast

# 2. Regroup them in a DatasetDict so the tokenization cell works unchanged.
small_raw_datasets = DatasetDict({
    "train": small_train,
    "validation": small_valid,
    "test": small_test
})

# 3. Apply explode_tweets to the reduced dataset only. The original columns
#    are removed because the function returns a different number of rows.
processed_datasets = small_raw_datasets.map(
    explode_tweets,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

print(processed_datasets)

Map: 100%|██████████| 500/500 [00:00<00:00, 3574.59 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3663.40 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 4165.52 examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'label', 'text'],
        num_rows: 2422
    })
    validation: Dataset({
        features: ['ID', 'label', 'text'],
        num_rows: 496
    })
    test: Dataset({
        features: ['ID', 'label', 'text'],
        num_rows: 480
    })
})





In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pretrained checkpoint name; reused further down to load the classifier.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``"text"`` column of a batch with truncation enabled."""
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# Batched map over every split of the exploded dataset.
tokenized_datasets = processed_datasets.map(tokenize_function, batched=True)

# No padding is requested at tokenization time; the collator receives the
# tokenizer so it can pad when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 2422/2422 [00:00<00:00, 22316.89 examples/s]
Map: 100%|██████████| 496/496 [00:00<00:00, 19663.28 examples/s]
Map: 100%|██████████| 480/480 [00:00<00:00, 15168.93 examples/s]


# FINE TUNING

In [20]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback

metric = evaluate.load("accuracy")


def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    ``eval_preds`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis of the logits.
    """
    class_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

from transformers import set_seed

# Single seed shared by the model initialisation and the Trainer.
SEED = 42

training_args = TrainingArguments(
    output_dir="twitter-bert-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    seed=SEED,
)

# Seed the RNGs *before* building the model: the classification head is not
# part of the pretrained checkpoint (classifier.weight / classifier.bias are
# reported as MISSING on load), so it is randomly initialised here, before
# the Trainer exists. Without this call each run starts from a different head.
set_seed(SEED)

# Update num_labels to match your new dataset (e.g., 2 for binary classification)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): with num_train_epochs=1 and per-epoch evaluation there is
    # only one evaluation, so a patience of 2 can never trigger; raise
    # num_train_epochs for early stopping to have an effect.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 715.85it/s, Materializing param=bert.pooler.dense.weight]                               
[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- M

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.472939,0.814516


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.97it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer

TrainOutput(global_step=152, training_loss=0.5929929833663138, metrics={'train_runtime': 75.4802, 'train_samples_per_second': 32.088, 'train_steps_per_second': 2.014, 'total_flos': 122399057397960.0, 'train_loss': 0.5929929833663138, 'epoch': 1.0})

In [21]:
# NOTE(review): second call on the same `trainer`. The previous cell already
# ran trainer.train(), so this trains the already fine-tuned model again and
# every later cell (test predictions, saved model) reflects both passes.
# Presumably meant as an extra epoch; if so, raising num_train_epochs in
# TrainingArguments would make that explicit. TODO confirm intent.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.44668,0.804435


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer

TrainOutput(global_step=152, training_loss=0.5368878213982833, metrics={'train_runtime': 76.8534, 'train_samples_per_second': 31.515, 'train_steps_per_second': 1.978, 'total_flos': 122399057397960.0, 'train_loss': 0.5368878213982833, 'epoch': 1.0})

In [22]:
import pandas as pd
import numpy as np

print("--- Génération des prédictions sur le jeu de TEST ---")
# 1. Get a class prediction for every tweet (arg-max over the logits).
predictions_output = trainer.predict(tokenized_datasets["test"])
tweet_predictions = np.argmax(predictions_output.predictions, axis=-1)

# 2. Collect the results in a pandas DataFrame, one row per tweet.
df_test = pd.DataFrame({
    "ID": tokenized_datasets["test"]["ID"],
    "pred_tweet": tweet_predictions,
    "true_label": tokenized_datasets["test"]["label"]
})

# 3. Group by user ID and take a majority vote over that user's tweets.
user_predictions = df_test.groupby("ID").agg(
    # mode() returns the most frequent value(s) in sorted order; taking the
    # first one positionally means ties resolve to the smallest class id.
    majority_pred=("pred_tweet", lambda votes: votes.mode().iloc[0]),
    # Every tweet row of a user carries that user's label, so "first" suffices.
    true_label=("true_label", "first")
)

# 4. Final accuracy at user level.
correct_predictions = (user_predictions["majority_pred"] == user_predictions["true_label"]).sum()
total_users = len(user_predictions)
final_accuracy = correct_predictions / total_users

# The original literal was "\\n", which printed a backslash followed by "n"
# instead of a blank line.
print("\n✅ ÉVALUATION FINALE (Niveau Utilisateur - Vote Majoritaire)")
print(f"Total d'utilisateurs testés : {total_users}")
print(f"Précision (Accuracy) : {final_accuracy * 100:.2f}%")

--- Génération des prédictions sur le jeu de TEST ---


\n✅ ÉVALUATION FINALE (Niveau Utilisateur - Vote Majoritaire)
Total d'utilisateurs testés : 100
Précision (Accuracy) : 69.00%


In [23]:
# Write the model (via the trainer) and the tokenizer to the same directory.
final_model_dir = "twitter-bert-finetuned-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print("Modèle final sauvegardé avec succès.")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]

Modèle final sauvegardé avec succès.



