In [None]:
# Mount Google Drive; the dataset and all checkpoints in this notebook live under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Work from the Drive root so that relative output paths (e.g. ./neural-hermes-7b-ead) are created on Drive.
%cd '/content/drive/MyDrive'

In [None]:
!pip install -q torch
!pip install -q transformers
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q accelerate
!pip install -q datasets
!pip install -q trl

In [None]:
import os
import torch
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig
from datasets import load_dataset
from trl import SFTTrainer



In [None]:
# Load the base model in 4-bit precision (QLoRA setup).
compute_dtype = torch.float16

# 4-bit NF4 quantization without double quantization; computations run in `compute_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Informational only: tell the user when the GPU could run bfloat16 instead of float16.
if compute_dtype == torch.float16:
    gpu_major, _gpu_minor = torch.cuda.get_device_capability()
    if gpu_major >= 8:
        banner = "=" * 80
        print(banner)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print(banner)

model = AutoModelForCausalLM.from_pretrained(
    "mlabonne/NeuralHermes-2.5-Mistral-7B",
    quantization_config=bnb_config,
    device_map={"": 0},  # put the whole model on GPU 0
)

# Turn off the generation cache flag on the model config before training.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-family config field — confirm it has any
# effect for this Mistral-based checkpoint.
model.config.pretraining_tp = 1

Your GPU supports bfloat16, you can accelerate training with the argument --bf16


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# LoRA adapter settings: rank-64 adapters on the query and value projections only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    # More modules can be targeted for a deeper fine-tune, at a cost in performance.
    target_modules=["q_proj", "v_proj"],
)

In [None]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("mlabonne/NeuralHermes-2.5-Mistral-7B")
# Do NOT reuse the EOS token for padding: the language-modeling data collator masks every label
# equal to pad_token_id, so with pad == eos the end-of-sequence token is never trained and the
# fine-tuned model does not learn when to stop. Pad with <unk> when the tokenizer defines one,
# and only fall back to EOS when it does not.
tokenizer.pad_token = (
    tokenizer.unk_token if tokenizer.unk_token is not None else tokenizer.eos_token
)
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**Préparation des données**

In [None]:
from random import randrange
# Load the prompt/completion records from the JSON export stored on Drive.
data_files = {"train": "/content/drive/MyDrive/resultats_tableau_final.json"}
dataset = load_dataset("json", split="train", data_files=data_files)

# Shuffle with a fixed seed as a precaution: overly structured corpora (e.g. novels stored back
# to back, or records in chronological order) can bias the fine-tuning.
dataset = dataset.shuffle(seed=42)

# Sanity check: report the size and show one randomly chosen record.
print(f"dataset size: {len(dataset)}")
sample_index = randrange(len(dataset))
print(dataset[sample_index])

dataset size: 5000
{'completion': '<c level="recordgrp">\n<did>\n<unitid>1/M/442</unitid>\n<unitdate normal="4783">4783</unitdate>\n<unittitle>BOLEAT, RENE MARIE. Né le 03/02/1913 à MORLAIX (29). Matricule : 4350.</unittitle>\n</did>\n</c>"', 'prompt': '1/M/442 4783 BOLEAT, RENE MARIE. Né le 03/02/1913 à MORLAIX (29). Matricule : 4350."'}


In [None]:
from random import randint
def format_alpaca(sample):
    """Render one dataset record as a single training string.

    The record's ``prompt`` is placed under an ``<s>[INST] Instruction`` header, followed by a
    fixed ``### Contexte`` request closed by ``[/INST]``, then the record's ``completion`` under
    ``### Réponse`` terminated by ``</s>``. The three sections are separated by a blank line.

    Args:
        sample: Mapping with ``"prompt"`` and ``"completion"`` entries.

    Returns:
        The formatted text.
    """
    # NOTE(review): "<s>" and "</s>" are written as literal text here; confirm how they combine
    # with the special tokens the tokenizer adds on its own.
    sections = (
        f"<s>[INST] Instruction\n{sample['prompt']}",
        "### Contexte\nPourrais-tu m'encoder cet extrait de description archivistique au format XML/EAD ? Attention, il ne s'agit que de deux balises <c> dans <dsc>.[/INST]",
        f"### Réponse\n{sample['completion']}</s>",
    )
    return "\n\n".join(sections)

def template_dataset(sample):
    """Attach the training text to a record under the ``"text"`` key.

    The text is the ``format_alpaca`` rendering of the record followed by the tokenizer's EOS
    token. The record is modified in place and returned.
    """
    formatted = format_alpaca(sample)
    sample["text"] = f"{formatted}{tokenizer.eos_token}"
    return sample

# Replace the original columns with the single formatted "text" column.
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
print(f"dataset size: {len(dataset)}")
# randint() includes BOTH bounds, so the upper bound must be len - 1 to remain a valid index
# (the previous upper bound of len(dataset) could raise IndexError).
print(dataset[randint(0, len(dataset) - 1)]["text"])

dataset size: 5000
<s>[INST] Instruction
1/M/308 29/06/1898 CORNEC, JEAN PIERRE. Né le 29/06/1898 à ARGOL (29). Matricule : 111117."

### Contexte
Pourrais-tu m'encoder cet extrait de description archivistique au format XML/EAD ? Attention, il ne s'agit que de deux balises <c> dans <dsc>.[/INST]

### Réponse
<c level="recordgrp">
<did>
<unitid>1/M/308</unitid>
<unitdate normal="1898-06-29">29/06/1898</unitdate>
<unittitle>CORNEC, JEAN PIERRE. Né le 29/06/1898 à ARGOL (29). Matricule : 111117.</unittitle>
</did>
</c>"</s><|im_end|>


**Fine-tuning de NeuralHermes**

In [None]:
torch.cuda.empty_cache() # free GPU memory

In [None]:
# Training hyper-parameters. Each optimizer step accumulates 4 micro-batches of 4 sequences
# per device.
training_arguments = TrainingArguments(
    # Output and logging
    output_dir="./neural-hermes-7b-ead",
    save_steps=100,
    logging_steps=1,
    report_to="tensorboard",
    # Batching
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimisation
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # NOTE(review): warmup is presumably ignored by the plain "constant" scheduler — confirm
    # whether "constant_with_warmup" was intended.
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    max_steps=200,
    # Mixed precision
    fp16=True,
    bf16=False,
)

# Supervised fine-tuning on the "text" column; the LoRA configuration is handed to the trainer.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=512,
    packing=False,
)

# Fresh fine-tuning run. To continue an existing run, call
# trainer.train(resume_from_checkpoint=True) instead.
trainer.train()
trainer.model.save_pretrained("./neural-hermes-7b-ead")

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.6794
2,1.8032
3,1.8921
4,1.9204
5,1.8774
6,1.8521
7,1.8669
8,1.8223
9,1.8014
10,1.7728


In [None]:
# Unwrap the model when it exposes a `.module` attribute (distributed/parallel training wrapper),
# otherwise use it as is, then save it to the run directory.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained("neural-hermes-7b-ead")

On va maintenant fusionner le modèle de base et l'adaptateur LoRA pour accélérer l'inférence :

In [None]:
import gc

from peft import AutoPeftModelForCausalLM

# Release the 4-bit training model before loading the bfloat16 one. Deleting `model` alone is
# not enough: `trainer` and `model_to_save` still reference the same object, so its GPU memory
# would stay allocated. Using pop() also keeps this cell re-runnable when a name is already gone.
# Note: `trainer` is no longer available after this cell.
for _stale_name in ("model", "model_to_save", "trainer"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Reload the base model together with the saved adapter in bfloat16, then fold the LoRA weights
# into the base weights so that inference no longer needs peft.
model = AutoPeftModelForCausalLM.from_pretrained("neural-hermes-7b-ead", device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Write the merged model as safetensors into its own sub-directory of the run directory.
output_merged_dir = os.path.join("neural-hermes-7b-ead", "final_merged_checkpoint")
model.save_pretrained(output_merged_dir, safe_serialization=True)
# Save the tokenizer next to the weights from the live object, so the merged directory is
# loadable on its own and does not depend on files copied from an intermediate checkpoint.
tokenizer.save_pretrained(output_merged_dir)

**Inférence**

Nous copions les fichiers du tokenizer dans le dossier du modèle fusionné.

In [None]:
import shutil
from pathlib import Path


def copy_tokenizer_files(src_dir, dst_dir, filenames):
    """Copy the named files from ``src_dir`` into ``dst_dir``.

    Files missing from ``src_dir`` are reported and skipped rather than aborting, which keeps
    the best-effort behaviour of the individual ``cp`` commands this replaces.

    Args:
        src_dir: Directory holding the tokenizer files.
        dst_dir: Destination directory; created when a file has to be copied into it.
        filenames: Iterable of file names to copy.

    Returns:
        List of the file names that were actually copied.
    """
    src_dir, dst_dir = Path(src_dir), Path(dst_dir)
    copied = []
    for filename in filenames:
        source = src_dir / filename
        if not source.is_file():
            print(f"skipped (not found): {source}")
            continue
        dst_dir.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(source, dst_dir / filename)
        copied.append(filename)
    return copied


RUN_DIR = Path("/content/drive/MyDrive/neural-hermes-7b-ead")
# NOTE(review): "checkpoint-100" only exists because save_steps=100; keep the two in sync.
copy_tokenizer_files(
    RUN_DIR / "checkpoint-100",
    RUN_DIR / "final_merged_checkpoint",
    ["tokenizer.json", "tokenizer.model", "tokenizer_config.json", "special_tokens_map.json"],
)

Pour l'inférence, on utilise vLLM. Il est conseillé de redémarrer l'environnement d'exécution avant de poursuivre, afin de libérer la mémoire GPU occupée par l'entraînement.

In [None]:
# Install the prebuilt vLLM 0.2.3 wheel (CUDA 11.8 build, CPython 3.10, Linux x86_64).
!pip install https://github.com/vllm-project/vllm/releases/download/v0.2.3/vllm-0.2.3+cu118-cp310-cp310-manylinux1_x86_64.whl

In [None]:
# Reinstall torch from the CUDA 11.8 wheel index, then upgrade xformers from the same index.
# NOTE(review): presumably needed so torch/xformers match the cu118 vLLM wheel — confirm.
!pip uninstall torch -y
!pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118
!pip install -U -q xformers --index-url https://download.pytorch.org/whl/cu118

In [None]:
from vllm import LLM, SamplingParams
import xformers
import os

In [None]:
# Build the vLLM engine from the merged checkpoint directory on Drive.
llm = LLM("/content/drive/MyDrive/neural-hermes-7b-ead/final_merged_checkpoint")

INFO 12-04 15:32:27 llm_engine.py:73] Initializing an LLM engine with config: model='/content/drive/MyDrive/neural-hermes-7b-ead/final_merged_checkpoint', tokenizer='/content/drive/MyDrive/neural-hermes-7b-ead/final_merged_checkpoint', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 12-04 15:32:43 llm_engine.py:222] # GPU blocks: 9459, # CPU blocks: 2048


In [None]:
# Sampling settings for generation.
sampling_params = SamplingParams(
    n=1,
    temperature=0.9,
    top_p=0.95,
    max_tokens=512,
    # Every training example ends with "</s>" followed by the tokenizer's EOS token
    # ("<|im_end|>"). Stop on either marker so generation ends with the XML answer instead of
    # running on until max_tokens.
    stop=["</s>", "<|im_end|>"],
)

In [None]:
def build_inference_prompt(text, context):
    """Wrap a description in the same template the model was fine-tuned on.

    The layout mirrors the training text built by ``format_alpaca``: the description under
    ``<s>[INST] Instruction``, the request under ``### Contexte`` closed by ``[/INST]``, and a
    trailing ``### Réponse`` header so the model continues with the XML answer. Prompts that
    deviate from this layout (missing markers, indented lines) are out of distribution for the
    fine-tuned model.

    Args:
        text: The archival description to encode.
        context: The request telling the model what to produce.

    Returns:
        The prompt string, ending right after the ``### Réponse`` header.
    """
    return f"<s>[INST] Instruction\n{text}\n\n### Contexte\n{context}[/INST]\n\n### Réponse\n"


# Request sent with every description, kept on a single line like the training context.
INFERENCE_CONTEXT = (
    "Pourrais-tu m'encoder ce texte de description archivistique au format XML/EAD ? "
    "Attention, il ne s'agit que d'une balise <c> dans <dsc>. "
    "Il ne faut générer qu'une seule balise <c>. "
    "La balise <unitid> commence par Art. suivi d'un nombre entier."
)

# Descriptions to encode; written without leading indentation so that no stray spaces end up
# inside the prompt.
prompts = [
    build_inference_prompt(text, INFERENCE_CONTEXT)
    for text in (
        "Art. 1\n\nDCSSA (1952)",
        "Art. 2\n\n"
        "Service de santé : DSS Brest, DCSSA, établissement central de réanimation –\n"
        "transfusion de l’armée. EMG / Service technique des machines. DCTIM.\n"
        "CIRAM (1953)",
    )
]

In [None]:
# Run generation for every prompt in one call, using the sampling settings in `sampling_params`.
outputs = llm.generate(prompts, sampling_params)

Processed prompts: 100%|██████████| 2/2 [00:07<00:00,  3.91s/it]


In [None]:
# Show the first completion generated for each prompt (repr form, so newlines stay visible).
for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")

Generated text: '8e régiment de cuirassiers : Art. 1\n    Remerciements : Art. 8\n    Administration et organisation : Art. 13\n    ### Réponse\n    <c level="recordgrp">\n    <did>\n    <unitid>Art. 1</unitid>\n    <unittitle>8e régiment de cuirassiers</unittitle>\n    </did>\n    </c>\n    "\n\n### Description\n- Taille : 30 x 21 cm\n- Poids : 110 g\n- Matériau : Papier vélin\n- Support : Tablette" "\n\n### Contexte\nPourrais-tu m\'encoder ce texte de description archivistique au format XML/EAD ? Attention, il ne s\'agit que d\'une balise <c> dans <dsc>. Il ne faut générer qu\'une seule balise <c>. La balise <unitid> commence par Art. suivi d\'un nombre entier. 8e régiment de cuirassiers : Art. 1 Remerciements : Art. 8 Administration et organisation : Art. 13" "\n\n### Réponse\n<c level="recordgrp">\n<did>\n<unitid>Art. 1</unitid>\n<unittitle>8e régiment de cuirassiers</unittitle>\n</did>\n</c>" "</s> denominative only XML/EAD record group level does not have <c level="recordgrp">...