In [1]:
import os

hf = open("env.txt", "r", encoding="utf-8").read()

os.environ["HF_TOKEN"] = hf
os.environ["HF_USERNAME"] = "smarcq"

FileNotFoundError: [Errno 2] No such file or directory: 'env.txt'

In [None]:
# install the libraries
%pip install -U transformers 
%pip install -U accelerate 
%pip install -U datasets
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 

Collecting transformers
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.9.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tq

In [70]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
print(torch.cuda.memory_allocated())

True
NVIDIA A100-SXM4-40GB
25430362112


In [71]:
# del model  # reset model
# torch.cuda.empty_cache()

In [72]:
# import the modules
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [73]:
# Build dataset

In [74]:
import pandas as pd
df = pd.read_csv("20250708_filtered.csv")
df.head()

Unnamed: 0,id,label,reference_1,reference_2
0,287056,Distinct records,"{'book': None, 'page': 'None-None', 'issue': {...","{'book': None, 'page': None, 'issue': {'date':..."
1,287057,Distinct records,"{'book': None, 'page': '94 - 102', 'issue': {'...","{'book': None, 'page': None, 'issue': None, 'i..."
2,287059,Duplicates or equivalents,"{'book': None, 'page': '653-658', 'issue': Non...","{'book': None, 'page': '653-658', 'issue': {'d..."
3,287060,Duplicates or equivalents,"{'book': None, 'page': None, 'issue': None, 'i...","{'book': None, 'page': 'None-None', 'issue': N..."
4,287062,Distinct records,"{'book': {'title': 'Virtual retrospect', 'isbn...","{'book': None, 'page': '11-17', 'issue': None,..."


In [75]:
# make the df into a 5500 record dataset
from datasets import Dataset

dataset = Dataset.from_pandas(df)

# shuffle and select 5500 lines
dataset = dataset.shuffle(seed=85).select(range(5500))


In [76]:
# format prompts
def format_chat_template(row):
    user_content = f"""
You are an expert in detecting duplicate research publications. Below are the metadata (in JSON) for two records collected from research data platforms. Your task is to analyze and classify them as one of:
- "Duplicates or equivalents" — if they represent the same publication.
- "Distinct records" — if they are two different records.
- "Insufficient information" — if there is not enough evidence to decide.
Do not explain your answer, only give one of the three labels listed above.

Records:

Record 1 metadata : {row['reference_1']}

___________________________________________________

Record 2 metadata : {row['reference_2']}

Classification (Duplicates or equivalents / Distinct records / Insufficient information): 
""".strip()

    if "label" in row and row["label"] is not None:
        assistant_content = row["label"]
    else:
        assistant_content = ""

    row["messages"] = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content}
    ]
    return row

dataset = dataset.map(format_chat_template)


Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

In [77]:
# Split to train/test (test_size=0.2) --
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)

# split test to eval/test -> 80/10/10
eval_test_split = dataset_split["test"].train_test_split(test_size=0.5, seed=42)

final_splits = {
    "train": dataset_split["train"],
    "eval": eval_test_split["train"],
    "test": eval_test_split["test"]
}

# print firtst prompt
print(final_splits["train"]["messages"][0]) 

[{'content': 'You are an expert in detecting duplicate research publications. Below are the metadata (in JSON) for two records collected from research data platforms. Your task is to analyze and classify them as one of:\n- "Duplicates or equivalents" — if they represent the same publication.\n- "Distinct records" — if they are two different records.\n- "Insufficient information" — if there is not enough evidence to decide.\nDo not explain your answer, only give one of the three labels listed above.\n\nRecords:\n\nRecord 1 metadata : {\'book\': None, \'page\': \'2370-2377\', \'issue\': {\'date\': None, \'number\': [], \'rights\': None, \'volume\': \'17\', \'journal\': {\'issn\': [\'1068-9265\'], \'eissn\': [\'1534-4681\'], \'titles\': [\'Annals of Surgical Oncology\'], \'publisher\': \'Springer Verlag\'}}, \'issued\': \'2010-01-01 00:00:00\', \'titles\': [{\'value\': \'Peritoneal Carcinomatosis from Gastric Cancer: A Multi-Institutional Study of 159 Patients Treated by Cytoreductive Sur

In [78]:
eval_test_split.save_to_disk("dataset_eval_test")
dataset_split.save_to_disk("dataset_train")

Saving the dataset (0/1 shards):   0%|          | 0/550 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/550 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

In [79]:
# print label distribution
from collections import Counter

def display_label_distribution(splits_dict, title):
    print(f"\n🔎 {title}")
    for split_name, split_data in splits_dict.items():
        label_counts = Counter(split_data["label"])
        print(f"\nLabel distribution in {split_name} set:")
        total = sum(label_counts.values())
        for label, count in label_counts.items():
            pct = (count / total) * 100 if total > 0 else 0
            print(f"  {label}: {count} ({pct:.2f}%)")

display_label_distribution(final_splits, "Label distribution")





🔎 Label distribution BEFORE filtering

Label distribution in train set:
  Distinct records: 3299 (74.98%)
  Duplicates or equivalents: 639 (14.52%)
  Insufficient information: 462 (10.50%)

Label distribution in eval set:
  Distinct records: 396 (72.00%)
  Insufficient information: 66 (12.00%)
  Duplicates or equivalents: 88 (16.00%)

Label distribution in test set:
  Distinct records: 412 (74.91%)
  Insufficient information: 58 (10.55%)
  Duplicates or equivalents: 80 (14.55%)


In [80]:
# Model

In [81]:
# load the model
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
output_dir="./llama-3-crisalid-drd"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, # quantization 8 bits
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="bfloat16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [82]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

In [83]:
import numpy as np

def count_tokens(messages):
    full_prompt = ""
    for message in messages:
        full_prompt += message["role"] + ": " + message["content"] + "\n"
    tokens = tokenizer.tokenize(full_prompt)
    return len(tokens)

for split_name, split_data in final_splits.items():
    token_lengths = [count_tokens(example["messages"]) for example in split_data]
    print(f"\n📊 Stats for split: {split_name}")
    print(f"Min tokens: {np.min(token_lengths)}")
    print(f"Max tokens: {np.max(token_lengths)}")
    print(f"Mean tokens: {np.mean(token_lengths):.2f}")


📊 Stats for split: train
Min tokens: 611
Max tokens: 18852
Mean tokens: 3069.54

📊 Stats for split: eval
Min tokens: 729
Max tokens: 17528
Mean tokens: 3292.44

📊 Stats for split: test
Min tokens: 647
Max tokens: 17389
Mean tokens: 3092.37


In [84]:
# Fune tuning config
peft_config = LoraConfig(
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    num_train_epochs=2,
    eval_steps=0.5,
    logging_steps=10,
    warmup_steps=5,
    logging_strategy="steps",
    learning_rate=3e-4,
    fp16=False,
    bf16=True,
    group_by_length=True,
    label_names=["Duplicates or equivalents", "Distinct records", "Insufficient information"],
    logging_dir="./logs"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=final_splits["train"],
    eval_dataset=final_splits["eval"],
    peft_config=peft_config,
    args=training_arguments,
)

Tokenizing train dataset:   0%|          | 0/4400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/4400 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/550 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/550 [00:00<?, ? examples/s]

In [None]:
trainer.train()



Step,Training Loss
10,0.9012
20,0.5587
30,0.5049
40,0.4947
50,0.351
60,0.4761
70,0.4468
80,0.4532
90,0.4617
100,0.3172


In [None]:
# Save trained model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
trainer.state.log_history

In [None]:
# evaluation
results = trainer.evaluate()
print(results)

In [None]:
!pip install tensorboard

In [None]:
%load_ext tensorboard
%tensorboard --logdir=./logs

In [None]:
# to do : regarder quand se créent les backslash