In [1]:
!pip install -q -U accelerate datasets peft transformers trl wandb bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.6/336.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.9/313.9 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [2]:

from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

# Specify the checkpoint for SmolLM2 and set the device.
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Load the tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# For multi-GPU setups, consider using device_map="auto":
model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",  # {"": PartialState().process_index}
        )

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
tokenizer

In [None]:
model

# Dataset

Json structure output: https://huggingface.co/datasets/ChristianAzinn/json-training

In [22]:
from datasets import load_dataset

dataset_name = "Khmarigou/Begue"
ds = load_dataset("Khmarigou/alpace_begue_fr")
# Perform Train-Test Split
split_ds = ds["train"].train_test_split(test_size=0.2, seed=42)

# Access train and test splits
train_dataset = split_ds["train"].select(range(50))
test_dataset = split_ds["test"].select(range(1000))

In [20]:
train_dataset
test_dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 1000
})

In [4]:
# Set the response template to match the chat format.
# (Ensure this string exactly matches the beginning of the assistant's response as output by apply_chat_template.)
response_template = "<|im_start|>assistant\n"
instruction_template = "<|im_start|>user\n"
PROMPT_TEMPLATE = """Query: {query}

schema:
{schema}"""


def formatting_prompts_func(example):
    """
    Converts each example into a conversation string using the tokenizer's chat template.
    Assumes each example contains lists under "instruction" and "output".
    """
    output_texts = []
    for i in range(len(example["instruction"])):
        # Build a conversation with a user message and an assistant reply.
        messages = [
            {
                "role":    "system",
                "content": "You are a person who stutter."
                },
            {"role": "user", "content": example['instruction'][i]},
            # Note: It is important that the assistant message content here does not
            # include the assistant marker, because the chat template will insert it.
            {"role": "assistant", "content": example["output"][i]}
            ]
        # Use the chat template to generate the formatted text.
        text = tokenizer.apply_chat_template(messages, tokenize=False)
        output_texts.append(text)
    return output_texts


# Create the data collator.
# It will search for the response_template (here "Assistant:") in the formatted text
# and ensure that only tokens after that marker contribute to the loss.
collator = DataCollatorForCompletionOnlyLM(response_template=response_template,
                                           instruction_template=instruction_template,
                                           tokenizer=tokenizer,
                                           mlm=False)



In [5]:
tokenizer.apply_chat_template([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I am good, thank you."}
    ], tokenize=False)

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI am good, thank you.<|im_end|>\n'

# Lora Config

In [5]:
from peft import LoraConfig

# Note that r, in the figure above, is a hyperparameter here that we can use to specify the rank of the low-rank matrices used for adaptation.
# A smaller r leads to a simpler low-rank matrix, which results in fewer parameters to learn during adaptation.
# This can lead to faster training and potentially reduced computational requirements.
# However, with a smaller r, the capacity of the low-rank matrix to capture task-specific information decreases.
# This may result in lower adaptation quality, and the model might not perform as well on the new task compared to a higher r.
lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        target_modules=['o_proj', 'k_proj', 'q_proj', "v_proj"],
        bias="none",
        task_type="CAUSAL_LM",
        )

# Wandb

Creat token and account: https://wandb.ai/home

In [6]:
import wandb
import getpass

token = getpass.getpass()
wandb.login(key=token)

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33memilien-boitouzet-cours[0m ([33memilien-boitouzet-cours-usmb[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
hub_model_id = "Khmarigou/Begue"

# SFT Trainer config

In [23]:
OUTPUT_DIR = checkpoint.split("/")[-1] + "-structure-output"

# setup the trainer
trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        args=SFTConfig(
                per_device_train_batch_size=2,
                gradient_accumulation_steps=4,
                warmup_steps=100,
                num_train_epochs=3,
                learning_rate=0.00002,
                lr_scheduler_type="cosine",
                eval_strategy="steps",
                eval_steps=500,
                weight_decay=0.01,
                bf16=True,
                logging_strategy="steps",
                logging_steps=10,
                output_dir="./" + OUTPUT_DIR,
                optim="paged_adamw_8bit",
                seed=42,
                run_name=f"train-{OUTPUT_DIR}",
                report_to="wandb",
                save_steps=31,
                push_to_hub=True,
                hub_model_id=hub_model_id,
                save_total_limit=4
                ),
        peft_config=lora_config,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
        )

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [24]:
import os
from transformers import is_torch_xpu_available, is_torch_npu_available
import torch

# Lancement du processus d'entraînement du modèle.
# Ici, 'trainer.train()' déclenche la phase de fine-tuning,
# dans laquelle les paramètres du modèle sont ajustés sur une tâche spécifique
# en utilisant des données d'entraînement pertinentes.
trainer.train()

# Une fois l'entraînement terminé, on sauvegarde l'adaptateur LoRA (fine-tuning léger).
# LoRA (Low-Rank Adaptation) est une technique destinée à fine-tuner les grands
# modèles en modifiant uniquement un sous-ensemble restreint de paramètres.
final_checkpoint_dir = os.path.join(OUTPUT_DIR, "final_checkpoint")
trainer.save_model(final_checkpoint_dir)

trainer.push_to_hub(dataset_name=dataset_name)

# Nettoyage des ressources mémoire pour libérer l'espace GPU ou autres accélérateurs,
# ce qui est utile avant de fusionner l'adaptateur LoRA avec le modèle de base.
del model  # Suppression explicite du modèle de la mémoire.

# Vider les caches des accélérateurs (XPU, NPU ou GPU en fonction de la disponibilité).
# Cela optimise l'utilisation future des ressources.
if is_torch_xpu_available():
    torch.xpu.empty_cache()  # Vide les caches spécifiques pour XPU.
elif is_torch_npu_available():
    torch.npu.empty_cache()  # Vide les caches spécifiques pour NPU.
else:
    torch.cuda.empty_cache()  # Vide les caches GPU standard.

# Chargement du modèle adapté (en incluant l'adaptateur LoRA) pour effectuer une fusion
# avec le modèle de base. Cela permet de sauvegarder un modèle autonome optimisé.
from peft import AutoPeftModelForCausalLM

# Chargement du modèle préalablement sauvegardé depuis le répertoire OUTPUT_DIR.
# Les paramètres 'device_map' et 'torch_dtype' permettent d'optimiser le chargement :
# - 'device_map="auto"' ajuste automatiquement le placement sur le GPU, CPU ou autre.
# - 'torch_dtype=torch.bfloat16' utilise un format numérique bfloat16, qui réduit
#    la mémoire nécessaire tout en maintenant des performances stables.
model = AutoPeftModelForCausalLM.from_pretrained(
        OUTPUT_DIR,
        device_map="auto",
        torch_dtype=torch.bfloat16
        )

# Fusion de l'adaptateur LoRA directement dans le modèle de base,
# afin de produire un modèle final unique tout en réduisant ses redondances.
model = model.merge_and_unload()

# Sauvegarde du modèle fusionné dans un répertoire spécifique.
# 'safe_serialization=True' garantit que le modèle est stocké au format sûr,
# pour une compatibilité future et une intégrité des données.
output_merged_dir = os.path.join(OUTPUT_DIR, "final_merged_checkpoint")
model.save_pretrained(output_merged_dir, safe_serialization=True)

Step,Training Loss,Validation Loss




In [11]:
model.push_to_hub(hub_model_id)

README.md:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Khmarigou/Begue/commit/8096f0a7ecd9bb1999b0a7c5072c5f342b0c2f9d', commit_message='Upload LlamaForCausalLM', commit_description='', oid='8096f0a7ecd9bb1999b0a7c5072c5f342b0c2f9d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Khmarigou/Begue', endpoint='https://huggingface.co', repo_type='model', repo_id='Khmarigou/Begue'), pr_revision=None, pr_num=None)

# inference

In [25]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"  # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR).to(device)

In [15]:
messages = [
            {
                "role":    "system",
                "content": "You are a person who stutter."
                },
    {
        "role":    "user",
        "content": "Tell me a story."
        },
    ]

In [26]:
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
print(input_text)
print("----------------- Generated text -----------------")
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs,
                         max_new_tokens=1000,
                         temperature=0.2,
                         top_p=0.45,
                         eos_token_id=tokenizer.eos_token_id,   # <-- crucial
                         pad_token_id=tokenizer.eos_token_id,    # often set pad = eos
                         do_sample=True)
print(tokenizer.decode(outputs[0]))

<|im_start|>system
You are a person who stutter.<|im_end|>
<|im_start|>user
Tell me a story.<|im_end|>

----------------- Generated text -----------------
<|im_start|>system
You are a person who stutter.<|im_end|>
<|im_start|>user
Tell me a story.<|im_end|>
<|im_start|>assistant
I've got a story brewing. It's a tale of a young woman named Maya, who's been living in a small, isolated town on the outskirts of the city. She's a talented artist, but her work is often dismissed by the locals as "talentless" or "unrealistic." Despite her talents, she's been struggling to make ends meet, and the town's poverty is taking a toll on her mental health.

One day, a mysterious stranger arrives in town, dressed in a long coat and carrying a small, ornate box. The stranger introduces himself as "The Architect," and he's a master of the art of architecture. He's been working on a project for years, but he's been unable to complete it due to a series of unforeseen circumstances.

The Architect arrives 