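Environment setup (run in a terminal). After creating the virtual environment, activate it and install `ipykernel` in it before running the second command, so that the registered kernel uses the venv's interpreter:

python -m venv venv_hackeurope
python -m ipykernel install --user --name=venv_hackeurope --display-name "Python (hackeurope)"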

In [None]:
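# One-time dependency setup: uncomment and run these on a fresh environment.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install --upgrade pip

# %pip install datasets evaluate transformers accelerate
# %pip install torch
# %pip install transformers[torch]
# %pip install scikit-learn
# %pip install matplotlib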



# Processing the data (PyTorch)

In [2]:
from datasets import load_dataset

# Split name -> JSON file; the paths are relative to the notebook's working directory.
split_files = {
    "train": "train.json",
    "validation": "valid.json",
    "test": "test.json",
}

# Load the three JSON files into one DatasetDict keyed by split name.
raw_datasets = load_dataset("json", data_files=split_files)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def _coerce_label(raw_label):
    """Convert a raw label value to ``int``; a missing (``None``) label becomes 0."""
    if raw_label is None:
        return 0
    try:
        return int(raw_label)
    except ValueError:
        # int() rejects float-formatted strings such as "1.0"; accept them only
        # when they denote a whole number so nothing is silently truncated.
        if not isinstance(raw_label, str):
            raise
        as_float = float(raw_label)
        if not as_float.is_integer():
            raise ValueError(f"label is not an integer value: {raw_label!r}")
        return int(as_float)


def preprocess_data(example):
    """Build the model input text and an integer label for one example.

    Args:
        example: Mapping with an optional ``'tweet'`` entry (``None``, a single
            string, or a list/tuple of tweets) and an optional ``'label'`` entry.

    Returns:
        dict with two keys:
            ``'text'``  - all tweets joined with single spaces (``""`` when there
                          are none; ``None`` items inside the sequence are skipped).
            ``'label'`` - the label as ``int``; a missing or ``None`` label is
                          mapped to 0.

    Raises:
        ValueError: if the label cannot be interpreted as an integer.
    """
    tweets = example.get('tweet')

    # 1. Normalise the tweets field to a single string.
    if tweets is None:
        combined_tweets = ""
    elif isinstance(tweets, str):
        # Already a single string: use it unchanged.
        combined_tweets = tweets
    elif isinstance(tweets, (list, tuple)):
        # A sequence of tweets: drop None entries, stringify the rest, join.
        combined_tweets = " ".join(str(t) for t in tweets if t is not None)
    else:
        # Fallback for any other data type.
        combined_tweets = str(tweets)

    # 2. Normalise the label (0 when the key is absent or its value is None).
    integer_label = _coerce_label(example.get('label', 0))

    return {
        'text': combined_tweets,
        'label': integer_label
    }

# Run preprocess_data over every example of every split; the result gains a
# 'text' column and carries the label produced by preprocess_data.
processed_datasets = raw_datasets.map(function=preprocess_data)

print(processed_datasets)

DatasetDict({
    train: Dataset({
        features: ['ID', 'profile', 'tweet', 'neighbor', 'domain', 'label', 'text'],
        num_rows: 8278
    })
    validation: Dataset({
        features: ['ID', 'profile', 'tweet', 'neighbor', 'domain', 'label', 'text'],
        num_rows: 2365
    })
    test: Dataset({
        features: ['ID', 'profile', 'tweet', 'neighbor', 'domain', 'label', 'text'],
        num_rows: 1183
    })
})


In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch, truncating to at most 512 tokens."""
    texts = example["text"]
    return tokenizer(texts, max_length=512, truncation=True)


# batched=True hands tokenize_function a batch of examples per call.
tokenized_datasets = processed_datasets.map(tokenize_function, batched=True)

# No padding is applied at tokenization time; the collator pads when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 2365/2365 [00:07<00:00, 319.24 examples/s]


# FINE TUNING

In [5]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback

metric = evaluate.load("accuracy")


def compute_metrics(eval_preds):
    """Compute accuracy from an ``(logits, labels)`` pair.

    The predicted class of each row is the index of its largest logit
    (argmax over the last axis).
    """
    scores, gold_labels = eval_preds
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

from transformers import set_seed

# Seed the RNGs *before* the model is built: the classification head
# (classifier.weight / classifier.bias) is not in the pretrained checkpoint and is
# freshly initialised, so without a seed every Restart & Run All starts from
# different weights and produces different metrics.
SEED = 42
set_seed(SEED)

training_args = TrainingArguments(
    output_dir="twitter-bert-finetuned",
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # two strategies to match.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    seed=SEED,
)

# num_labels must equal the number of distinct classes in the 'label' column
# (2 for binary classification).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# NOTE(review): with num_train_epochs=3, a patience of 2 can only be exhausted at
# the third (final) evaluation, so early stopping never shortens this run. Raise
# num_train_epochs if early stopping is meant to have an effect.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Start training
trainer.train()

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 507.27it/s, Materializing param=bert.pooler.dense.weight]                               
[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- M

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import os

from transformers.trainer_utils import get_last_checkpoint

# This cell re-enters training (e.g. after the training cell was interrupted).
# A bare trainer.train() here would run a second full training pass on top of an
# already fine-tuned model whenever the notebook is executed top to bottom.
# Resuming from the newest checkpoint in output_dir avoids that: after a completed
# run there is nothing left to train, and after an interrupted run training
# continues from the last saved epoch. With no checkpoint on disk this is a plain
# trainer.train(). Delete output_dir to force a fresh run.
checkpoint_root = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Epoch,Training Loss,Validation Loss,Accuracy
1,1.730288,1.706298,0.240654
2,1.637833,1.68507,0.271807
3,1.526402,1.760041,0.266355


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerN

TrainOutput(global_step=1926, training_loss=1.5817555460231698, metrics={'train_runtime': 321.3653, 'train_samples_per_second': 95.863, 'train_steps_per_second': 5.993, 'total_flos': 701483891047236.0, 'train_loss': 1.5817555460231698, 'epoch': 3.0})

In [None]:
print("--- Évaluation finale sur le jeu de TEST ---")

# Score the held-out test split with the trainer's current model and show the
# resulting metrics dict.
test_split = tokenized_datasets["test"]
test_results = trainer.evaluate(eval_dataset=test_split)
print(test_results)

--- Évaluation finale sur le jeu de TEST ---


{'eval_loss': 1.657340168952942, 'eval_accuracy': 0.2868277474668745, 'eval_runtime': 3.8723, 'eval_samples_per_second': 331.323, 'eval_steps_per_second': 20.918, 'epoch': 3.0}


In [None]:
# Save the model and its tokenizer side by side so the directory can be reloaded
# on its own.
# NOTE(review): this directory is named "liar-..." while training checkpoints go to
# "twitter-bert-finetuned" — confirm the intended name.
final_model_dir = "liar-bert-finetuned-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print("Modèle final sauvegardé avec succès.")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]

Modèle final sauvegardé avec succès.



