In [1]:
import torch
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
import os
# Show the cache location currently in effect: HF_HOME if set, else the default under the home directory.
cache_dir = os.getenv(
    "HF_HOME",
    os.path.expanduser("~/.cache/huggingface"),
)
print(cache_dir)

# Switch the working directory to the filesystem root.
os.chdir("/")

# Create the replacement cache directory when it is missing.
new_cache_dir = "/Data/zakaria.abboud/.cache/huggingface"
if not os.path.exists(new_cache_dir):
    os.makedirs(new_cache_dir, exist_ok=True)

# Point HF_HOME at the new location.
# NOTE(review): presumably this has to run before the Hugging Face libraries are imported — confirm.
os.environ["HF_HOME"] = new_cache_dir

print(f"New cache directory: {new_cache_dir}")
print(f"Path from root: {os.path.abspath(new_cache_dir)}")

/users/eleves-a/2022/zakaria.abboud/.cache/huggingface
New cache directory: /Data/zakaria.abboud/.cache/huggingface
Path from root: /Data/zakaria.abboud/.cache/huggingface


In [3]:
import json
import os

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from trl import DPOConfig, DPOTrainer

from peft import (
    LoraConfig,
    PeftModel,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

### Loading the model to fine-tune

In [5]:
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

# Tokenizer: reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the causal LM with the quantization settings above; device placement is left to "auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [6]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters, the total number of parameters,
    and the trainable share as a percentage.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and a ``requires_grad`` attribute.

    Returns:
        None. The summary is written to standard output.

    Notes:
        A model without any parameters reports ``trainable%: 0.0`` instead of
        raising ``ZeroDivisionError``.
        NOTE(review): counts come straight from ``numel()``; for 4-bit quantized
        layers this presumably reflects the packed storage size rather than the
        logical weight count — confirm before quoting the totals.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Avoid dividing by zero when the model has no parameters at all.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# LoRA adapters on the three MLP projections only (rank 32, alpha 64, 5% dropout, no bias training).
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["gate_proj", "up_proj", "down_proj"],
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the quantized base model with the adapters, then report how much of it is trainable.
model = PeftModel(model, peft_config=config)
print_trainable_parameters(model)

trainable params: 13271040 || all params: 328390528 || trainable%: 4.041237145548851


### Loading the Dataset

In [None]:
token_size = "2k" # dataset variant; inserted into the data_{token_size}_tokens directory name when loading. The other option is "8k".

In [7]:
def _read_json_split(split_name):
    """
    Load one JSON split of the dataset variant selected by `token_size`.

    Args:
        split_name: File stem of the split, e.g. "train" or "test".

    Returns:
        A `Dataset` built from the parsed JSON content
        (assumed to be a list of records — TODO confirm against the files).
    """
    split_path = f"/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/data_{token_size}_tokens/{split_name}.json"
    # The context manager closes the file handle deterministically; the encoding is
    # explicit so reading does not depend on the machine's locale.
    with open(split_path, encoding="utf-8") as split_file:
        return Dataset.from_list(json.load(split_file))


train_dataset = _read_json_split("train")
test_dataset = _read_json_split("test")

### Preprocessing the data for the DPO fine-tuning

In [9]:
def preprocess_data_dpo(data_point):
    """
    Build the "chosen" and "rejected" conversations for one preference example.

    Args:
        data_point: Mapping with the keys "content" (text to summarize),
            "chosen" (preferred answer) and "rejected" (dispreferred answer).

    Returns:
        Dict with the keys "chosen" and "rejected". Each value is a two-message
        conversation: a user turn holding the instruction followed by the text,
        and an assistant turn holding the corresponding answer.
    """
    instruction = "You are a helpfull assistant, and you are asked to summarize the following text in French. You should respond with 5 sentences maximum:\n\n"
    user_text = instruction + "\n" + data_point["content"]

    def conversation(answer_key):
        # A fresh pair of message dicts per call, so the two conversations share no objects.
        return [
            {'role': 'user', 'content': user_text},
            {'role': 'assistant', 'content': data_point[answer_key]},
        ]

    return {answer_key: conversation(answer_key) for answer_key in ("chosen", "rejected")}

# Shuffle each split with a fixed seed, then apply the DPO preprocessing to every example.
train_shuffled = train_dataset.shuffle(seed=42)
test_shuffled = test_dataset.shuffle(seed=42)

data_dpo = train_shuffled.map(preprocess_data_dpo)
test_dpo = test_shuffled.map(preprocess_data_dpo)

Map:   0%|          | 0/4589 [00:00<?, ? examples/s]

Map: 100%|██████████| 4589/4589 [00:00<00:00, 15391.52 examples/s]
Map: 100%|██████████| 1967/1967 [00:00<00:00, 16166.24 examples/s]


In [11]:
# Sanity check: dataset summary, then the "chosen" conversation of the first example.
first_example = data_dpo[0]
print(data_dpo)
print(first_example["chosen"])

Dataset({
    features: ['title', 'content', 'chosen', 'rejected'],
    num_rows: 4589
})
[{'content': "You are a helpfull assistant, and you are asked to summarize the following text in French. You should respond with 5 sentences maximum:\n\n\nCórrego do Bom Jesus est une municipalité brésilienne de l'État du Minas Gerais et la microrégion de Pouso Alegre.\n\n\n== Notes et références ==\n\n Portail du Minas Gerais", 'role': 'user'}, {'content': "Córrego do Bom Jesus est une municipalité située dans l'État du Minas Gerais au Brésil. Elle fait partie de la microrégion de Pouso Alegre. Pour plus d'informations, consultez le Portail du Minas Gerais. Cette municipalité est donc un territoire administratif et géographique bien définis dans cette région brésilienne. <sentence-break>", 'role': 'assistant'}]


In [13]:
# Custom chat template, split over adjacent string literals for readability
# (the concatenated value is the same single-line Jinja template):
#   user turns      -> "<human>: ...\n"
#   assistant turns -> "<assistant>: ..." followed by the EOS token
#   generation prompt (optional) -> trailing "<assistant>: "
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "<human>: {{ message['content'] }}\n"
    "{% elif message['role'] == 'assistant' %}"
    "<assistant>: {{ message['content'] }}{{ eos_token }}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}<assistant>: {% endif %}"
)

# Render the first example's "rejected" conversation as text to eyeball the template.
messages_example = data_dpo[0]["rejected"]
rendered_example = tokenizer.apply_chat_template(messages_example, tokenize=False)
print(rendered_example)

<human>: You are a helpfull assistant, and you are asked to summarize the following text in French. You should respond with 5 sentences maximum:


Córrego do Bom Jesus est une municipalité brésilienne de l'État du Minas Gerais et la microrégion de Pouso Alegre.


== Notes et références ==

 Portail du Minas Gerais
<assistant>: Ce terme d'île ou de île en général désigne un petit bateau qui a été déplacé pour y survivre. Dans le cas de l'île Corrêa, elle était principalement utilisée par les peuples indigènes au fil des siècles. L'île est maintenant considérée comme une propriété privée. Elle est située dans l'est du Minas Gerais, à environ 180 km au nord-est de Lisbonne. 

<assistant>: Córrego do Bom Jesus est une municipalité brésilienne située dans l<|im_end|>


In [18]:
# Derive the checkpoint directory from `token_size` so a run on another dataset
# variant does not write into the "2k" run's directory.
OUTPUT_DIR = f"/Data/zakaria.abboud/dpo_output_{token_size}_tokens"

training_args = DPOConfig(
    output_dir=OUTPUT_DIR,
    # Effective batch size: 1 sample per device x 32 accumulation steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    # Training length is set by max_steps alone; a positive max_steps overrides
    # num_train_epochs, so the epoch setting is not specified here.
    max_steps=100,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    bf16=True,
    beta=0.1,
    logging_steps=20,
    save_total_limit=3,
    report_to="tensorboard",
    label_names=["chosen", "rejected"],
)

trainer = DPOTrainer(
    model=model,
    # NOTE(review): no explicit reference model is passed; with a PEFT-wrapped policy TRL
    # presumably uses the adapter-disabled base model as the reference — confirm.
    ref_model=None,
    args=training_args,
    train_dataset=data_dpo,
    # `processing_class` replaces the deprecated `tokenizer` keyword of DPOTrainer.
    processing_class=tokenizer,
    # No data collator is passed; DPOTrainer supplies its own.
)

# Disable the generation KV cache on the model config before training.
model.config.use_cache = False
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = DPOTrainer(
Extracting prompt in train dataset: 100%|██████████| 4589/4589 [00:00<00:00, 9136.60 examples/s]
Applying chat template to train dataset: 100%|██████████| 4589/4589 [00:00<00:00, 7383.34 examples/s]
Tokenizing train dataset: 100%|██████████| 4589/4589 [00:05<00:00, 768.07 examples/s]


Step,Training Loss
20,0.0143
40,0.0085
60,0.0079
80,0.0054
100,0.0034


TrainOutput(global_step=100, training_loss=0.007930982336401939, metrics={'train_runtime': 1058.6599, 'train_samples_per_second': 3.023, 'train_steps_per_second': 0.094, 'total_flos': 0.0, 'train_loss': 0.007930982336401939, 'epoch': 0.6973196774896492})

In [38]:
# Bare expression: the notebook displays the model's repr (its module tree).
model

PeftModel(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
              (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
              (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
              (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
            )
            (mlp): Qwen2MLP(
              (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
              (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
              (down_proj): Linear4bit(in_features=4864, out_features=896, bias=False)
              (act_fn): SiLU()
            )
            (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
            

### Saving the fine-tuned model

In [19]:
# Save the merged model and the tokenizer side by side.
# The destination is absolute, so it does not depend on the current working directory,
# and it follows `token_size` so different dataset variants do not overwrite each other.
SAVE_DIR = f"/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/models/{token_size}_tokens_finetuned"

# Fold the LoRA adapter weights into the base model and drop the PEFT wrappers.
# NOTE(review): the base model was loaded in 4-bit; merging into quantized weights may
# introduce rounding differences — consider merging into a full-precision copy if quality matters.
merged_model = model.merge_and_unload()
merged_model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("Model saved successfully")




Model saved successfully
