In [1]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [2]:
!nvidia-smi

Wed Mar  5 12:14:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.04             Driver Version: 570.124.04     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX 4000 Ada Gene...    Off |   00000000:01:00.0  On |                  Off |
| 30%   31C    P2             29W /  130W |    1142MiB /  20475MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os
cache_dir = os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
print(cache_dir)
os.chdir("/")
new_cache_dir = "/Data/zakaria.abboud/.cache/huggingface"
if not os.path.exists(new_cache_dir):
    os.makedirs(new_cache_dir, exist_ok=True)

# Set the new cache directory
os.environ["HF_HOME"] = new_cache_dir
print(f"New cache directory: {new_cache_dir}")
print(f"Path from root: {os.path.abspath(new_cache_dir)}")

/users/eleves-a/2022/zakaria.abboud/.cache/huggingface
New cache directory: /Data/zakaria.abboud/.cache/huggingface
Path from root: /Data/zakaria.abboud/.cache/huggingface


In [4]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from trl import DPOConfig, DPOTrainer

from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

### Loading the model to fine-tune

In [11]:
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
def print_trainable_parameters(model):

    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules = ["gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

model = PeftModel(model, peft_config=config)
print_trainable_parameters(model)

trainable params: 13271040 || all params: 328390528 || trainable%: 4.041237145548851


### Loading the Dataset

In [7]:
train_dataset = json.load(open("/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/data_2k_tokens/train.json"))
test_dataset = json.load(open("/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/data_2k_tokens/test.json"))

train_dataset = Dataset.from_list(train_dataset)
test_dataset = Dataset.from_list(test_dataset)


In [8]:
def generate_prompt(data_point):

    prompt = f"<human>: Summarize the following text :\n\n {data_point['content']}\n\nYou should respond in French, with 5 sentences maximum.\n\n<assistant>:"

    return prompt

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)+tokenizer.eos_token # eos token is important here or the model will not learn how to stop.
    tokenized_full_prompt = tokenizer(full_prompt, return_tensors='pt')
    ## FILL THE GAP: create the labels first by cloning input_ids
    labels = tokenized_full_prompt.input_ids.clone()

    prompt = full_prompt[:full_prompt.find("<assistant>")] + "<assistant>:"
    
    assistant_idx = tokenizer.encode(prompt, return_tensors='pt').shape[1]
    labels[:, :assistant_idx] = -100

    return {
        'input_ids': tokenized_full_prompt.input_ids.flatten(),
        'labels': labels.flatten(),
        'attention_mask': tokenized_full_prompt.attention_mask.flatten(),
    }



### Preprocessing the data for the DPO fine-tuning

In [9]:
def preprocess_data_dpo(data_point):

    system = "You are a helpfull assistant, and you are asked to summarize the following text in French. You should respond with 5 sentences maximum:\n\n"

    return {
            "chosen": [
                {'role': 'user', 'content': system +"\n"+ data_point["content"]},
                {'role': 'assistant', 'content': data_point["chosen"]}
            ],
            "rejected": [
                {'role': 'user', 'content': system +"\n"+ data_point["content"]},
                {'role': 'assistant', 'content': data_point["rejected"]}
            ],
        }

data_dpo = train_dataset.shuffle(seed=42).map(preprocess_data_dpo)
# val_dpo = val_dataset.shuffle(seed=42).map(preprocess_data_dpo)
test_dpo = test_dataset.shuffle(seed=42).map(preprocess_data_dpo)

Map:   0%|          | 0/4589 [00:00<?, ? examples/s]

Map:   0%|          | 0/1967 [00:00<?, ? examples/s]

In [10]:
print(data_dpo)
print(data_dpo[0])

Dataset({
    features: ['title', 'content', 'chosen', 'rejected'],
    num_rows: 4589
})
{'title': 'Córrego do Bom Jesus (Part 1)', 'content': "Córrego do Bom Jesus est une municipalité brésilienne de l'État du Minas Gerais et la microrégion de Pouso Alegre.\n\n\n== Notes et références ==\n\n Portail du Minas Gerais", 'chosen': [{'content': "You are a helpfull assistant, and you are asked to summarize the following text in French. You should respond with 5 sentences maximum:\n\n\nCórrego do Bom Jesus est une municipalité brésilienne de l'État du Minas Gerais et la microrégion de Pouso Alegre.\n\n\n== Notes et références ==\n\n Portail du Minas Gerais", 'role': 'user'}, {'content': "Córrego do Bom Jesus est une municipalité située dans l'État du Minas Gerais au Brésil. Elle fait partie de la microrégion de Pouso Alegre. Pour plus d'informations, consultez le Portail du Minas Gerais. Cette municipalité est donc un territoire administratif et géographique bien définis dans cette région b

In [11]:
tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}<human>: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}<assistant>: {{ message['content'] }}{{ eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}<assistant>: {% endif %}"

messages_example =  [
                      {'role': 'user', 'content': 'what is the capital of France?'},
                      {'role': 'assistant', 'content': 'Paris'}
                    ]

print(tokenizer.apply_chat_template(messages_example, tokenize=False))

<human>: what is the capital of France?
<assistant>: Paris<|im_end|>


In [None]:
OUTPUT_DIR = "/Data/zakaria.abboud/dpo_output_2k_tokens"

training_args = DPOConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    learning_rate=1e-5,
    bf16=True,
    save_total_limit=3,
    logging_steps=20,
    output_dir=OUTPUT_DIR,
    max_steps=200,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
    beta=0.1,
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=data_dpo,
    tokenizer=tokenizer,
    # Data collator is not needed for DPOTrainer as it internally manages it
)

model.config.use_cache = False
trainer.train()

  trainer = DPOTrainer(


Extracting prompt in train dataset:   0%|          | 0/4589 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/4589 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/4589 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


In [38]:
model

PeftModel(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
              (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
              (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
              (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
            )
            (mlp): Qwen2MLP(
              (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
              (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
              (down_proj): Linear4bit(in_features=4864, out_features=896, bias=False)
              (act_fn): SiLU()
            )
            (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
            

### Saving the fine-tuned model

In [40]:
# Save the model weights
merged_model = model.merge_and_unload()
merged_model.save_pretrained("users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/models/2k_tokens")
tokenizer.save_pretrained("users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/models/2k_tokens")
model.config.save_pretrained("users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/models/2k_tokens")
print("Model saved successfully")


Model saved successfully


### Load Model and Computing the new summaries

In [5]:
# Load the model
model = AutoModelForCausalLM.from_pretrained("users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/models/2k_tokens")
tokenizer = AutoTokenizer.from_pretrained("users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/models/2k_tokens")
model

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
          (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear4bit(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (

In [6]:
def summarize_document(document):
    # Tokenize the input document
    prompt = f"<human>: Summarize the following text :\n\n{document}\n\nYou should respond in French, with 5 sentences maximum.\n\n<assistant>:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate summary
    summary_ids = model.generate(**inputs, max_new_tokens=128)

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    assistant_index = summary.find("<assistant>:")
    if assistant_index != -1:
        summary = summary[assistant_index + len("<assistant>:"):].strip()

    return summary

In [7]:
articles = json.load(open("/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/2k_wikipedia_articles.json"))

### Testing the model before computing the summaries

In [None]:
test_text = articles[0]["content"]
print(summarize_document(test_text))

### Computing the summaries

In [8]:
# File to save the summarized articles
# output_file = '/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/summarized_wikipedia_articles.json'
output_file = '/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/2k_finetuned_dpo_summarized_wikipedia_articles.json'

# Function to save results progressively
def save_progress(summarized_articles, output_file):
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    else:
        existing_data = []

    existing_data.extend(summarized_articles)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)

# Summarize articles in parallel
summarized_articles = []
with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system
    futures = {executor.submit(summarize_document, article['content']): article for article in articles}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing articles"):
        result = future.result()
        summarized_articles.append({
            'title': futures[future]['title'],
            'content': futures[future]['content'],
            'summary': result
        })

        # Save progress every 10 summaries
        if len(summarized_articles) % 10 == 0:
            save_progress(summarized_articles, output_file)
            summarized_articles = []  # Clear the list after saving

# Save any remaining summaries
if summarized_articles:
    save_progress(summarized_articles, output_file)

print(f"Summarized articles saved to '{output_file}'")

Summarizing articles:   0%|          | 0/6498 [00:00<?, ?it/s]

Summarizing articles: 100%|██████████| 6498/6498 [3:05:53<00:00,  1.72s/it]  


Summarized articles saved to '/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/2k_finetuned_dpo_summarized_wikipedia_articles.json'
