In [1]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [2]:
import os
cache_dir = os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
print(cache_dir)
os.chdir("/")
new_cache_dir = "/Data/zakaria.abboud/.cache/huggingface"
if not os.path.exists(new_cache_dir):
    os.makedirs(new_cache_dir, exist_ok=True)

# Set the new cache directory
os.environ["HF_HOME"] = new_cache_dir
print(f"New cache directory: {new_cache_dir}")
print(f"Path from root: {os.path.abspath(new_cache_dir)}")

/users/eleves-a/2022/zakaria.abboud/.cache/huggingface
New cache directory: /Data/zakaria.abboud/.cache/huggingface
Path from root: /Data/zakaria.abboud/.cache/huggingface


In [38]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from trl import DPOConfig, DPOTrainer

from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [4]:
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# MODEL_NAME = "unsloth/Llama-3.2-1B" # Try Llama if you want

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [6]:
def print_trainable_parameters(model):

    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules = ["gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

model = PeftModel(model, peft_config=config)
print_trainable_parameters(model)

trainable params: 13271040 || all params: 328390528 || trainable%: 4.041237145548851


In [20]:
train_dataset = json.load(open("/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/data/train.json"))
val_dataset = json.load(open("/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/data/val.json"))
test_dataset = json.load(open("/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/data/test.json"))

train_dataset = Dataset.from_list(train_dataset)
val_dataset = Dataset.from_list(val_dataset)
test_dataset = Dataset.from_list(test_dataset)

train_dataset


Dataset({
    features: ['title', 'content', 'chosen', 'rejected'],
    num_rows: 3938
})

In [None]:
def generate_prompt(data_point):

    prompt = f"<human>: Summarize the following text :\n\n {data_point['content']}\n\nYou should respond in French, with 5 sentences maximum.\n\n<assistant>:"

    return prompt

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)+tokenizer.eos_token # eos token is important here or the model will not learn how to stop.
    tokenized_full_prompt = tokenizer(full_prompt, return_tensors='pt')
    ## FILL THE GAP: create the labels first by cloning input_ids
    labels = tokenized_full_prompt.input_ids.clone()

    prompt = full_prompt[:full_prompt.find("<assistant>")] + "<assistant>:"
    end_prompt_idx = prompt.find("<assistant>:")

    labels[:, :end_prompt_idx] = -100

    return {
        'input_ids': tokenized_full_prompt.input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': tokenized_full_prompt.attention_mask.flatten(),
    }



Map:   0%|          | 0/3938 [00:00<?, ? examples/s]

In [30]:
def preprocess_data_dpo(data_point):

    system = "You are a helpfull assistant, and you are asked to summarize the following text in French. You should respond with 5 sentences maximum:\n\n"

    return {
            "chosen": [
                {'role': 'user', 'content': system +"\n"+ data_point["content"]},
                {'role': 'assistant', 'content': data_point["chosen"]}
            ],
            "rejected": [
                {'role': 'user', 'content': system +"\n"+ data_point["content"]},
                {'role': 'assistant', 'content': data_point["rejected"]}
            ],
        }

data_dpo = train_dataset.shuffle(seed=42).map(preprocess_data_dpo)
val_dpo = val_dataset.shuffle(seed=42).map(preprocess_data_dpo)
test_dpo = test_dataset.shuffle(seed=42).map(preprocess_data_dpo)

Map:   0%|          | 0/3938 [00:00<?, ? examples/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Map:   0%|          | 0/492 [00:00<?, ? examples/s]

In [24]:
print(data_dpo)
print(data_dpo[0])

Dataset({
    features: ['title', 'content', 'chosen', 'rejected'],
    num_rows: 3938
})
{'title': 'Bouctouche (Part 2)', 'content': "é Michaud (Bouctouche, 1912 - ?, 1978), agriculteur, marchand et homme politique;\nJoseph Michaud (1841-1903), prêtre, mort à Bouctouche ;\nMarguerite Michaud (Bouctouche, 1903 - 1982), enseignante, administratrice, conférencière et écrivaine;\nAuguste Renaud (Bordeaux, 1835 - Bouctouche, 1897), agriculteur et homme politique.\nPaul Dwayne (Bouctouche, 1964 - 2024), chanteur de musique country, né à Bouctouche.\nL’Hon. Stephen J. Doucet (Bouctouche, 1967-?), Juge de la Cour du Banc du Roi du Nouveau-Brunswick.\n\n\n=== Architecture et monuments ===\n\nBouctouche possède une architecture variée mais dominée par le style traditionnel acadien. Toutefois, l'usine Fantech possède l'un des meilleurs exemples de mur-rideau de la province.\nL'ancien bureau de poste est situé au 59, boulevard Irving. C'est un édifice en briques de deux étages. Il fut conçu et co

In [27]:
tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}<human>: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}<assistant>: {{ message['content'] }}{{ eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}<assistant>: {% endif %}"

messages_example =  [
                      {'role': 'user', 'content': 'what is the capital of France?'},
                      {'role': 'assistant', 'content': 'Paris'}
                    ]

print(tokenizer.apply_chat_template(messages_example, tokenize=False))

<human>: what is the capital of France?
<assistant>: Paris<|im_end|>


In [32]:
OUTPUT_DIR = "/Data/zakaria.abboud/dpo_output"

training_args = DPOConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    learning_rate=1e-5,
    bf16=True,
    save_total_limit=3,
    logging_steps=20,
    output_dir=OUTPUT_DIR,
    max_steps=200,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
    beta=0.1,
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=data_dpo,
    tokenizer=tokenizer,
    # Data collator is not needed for DPOTrainer as it internally manages it
)

model.config.use_cache = False
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = DPOTrainer(


Extracting prompt in train dataset:   0%|          | 0/3938 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3938 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3938 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
20,0.5132
40,0.1768
60,0.1013
80,0.0467
100,0.0408
120,0.0279
140,0.0271
160,0.0223
180,0.0224
200,0.0154


TrainOutput(global_step=200, training_loss=0.09938076943159103, metrics={'train_runtime': 880.3569, 'train_samples_per_second': 3.635, 'train_steps_per_second': 0.227, 'total_flos': 0.0, 'train_loss': 0.09938076943159103, 'epoch': 0.8125952260030472})

In [34]:
def summarize_document(document):
    # Tokenize the input document
    prompt = f"<human>: Summarize the following text :\n\n{document}\n\nYou should respond in French, with 5 sentences maximum.\n\n<assistant>:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate summary
    summary_ids = model.generate(**inputs, max_new_tokens=128)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    assistant_index = summary.find("<assistant>:")
    if assistant_index != -1:
        summary = summary[assistant_index + len("<assistant>:"):].strip()

    return summary

In [37]:
articles = json.load(open("/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/wikipedia_articles.json"))

4923

In [None]:
# File to save the summarized articles
# output_file = '/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/summarized_wikipedia_articles.json'
output_file = '/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/finetuned_summarized_wikipedia_articles.json'

# Function to save results progressively
def save_progress(summarized_articles, output_file):
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    else:
        existing_data = []

    existing_data.extend(summarized_articles)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)

# Summarize articles in parallel
summarized_articles = []
with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system
    futures = {executor.submit(summarize_document, article['content']): article for article in articles}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing articles"):
        result = future.result()
        summarized_articles.append({
            'title': futures[future]['title'],
            'content': futures[future]['content'],
            'summary': result
        })

        # Save progress every 10 summaries
        if len(summarized_articles) % 10 == 0:
            save_progress(summarized_articles, output_file)
            summarized_articles = []  # Clear the list after saving

# Save any remaining summaries
if summarized_articles:
    save_progress(summarized_articles, output_file)

print(f"Summarized articles saved to '{output_file}'")

Summarizing articles:   0%|          | 21/4923 [00:52<2:30:15,  1.84s/it]