# Affiner Llama 2 sur les données de Djezzy


La version gratuite de Google Colab propose un GPU doté de 15 Go de mémoire (ressources limitées : à peine assez pour stocker les poids de Llama 2–7b).

Nous devons également prendre en compte la mémoire supplémentaire occupée par les états de l'optimiseur, les gradients et les activations de la passe avant.

Un réglage fin complet n'est donc pas possible ici : nous avons besoin de techniques de réglage fin efficaces en paramètres (PEFT), comme LoRA ou QLoRA.



# Étape 1 : Installer tous les packages requis

In [None]:
# Install the pinned library versions this notebook relies on (-q keeps pip's output short).
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

# Étape 2 : Importer toutes les bibliothèques requises

In [None]:
import os

import random
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from transformers import pipeline


# Traiter les données

In [None]:
import pandas as pd

# Path to the plain-text data file. Questions and answers are expected on
# alternating non-empty lines (question first, then its answer).
chemin_fichier = "/content/englishdata.txt"

# Read with an explicit encoding so accented characters decode the same way
# whatever the runtime's locale is, and drop blank lines while reading.
with open(chemin_fichier, "r", encoding="utf-8") as fichier:
    lines = [line.strip() for line in fichier if line.strip()]

# Remove double quotes, single quotes and backslashes from every line, then
# strip again because removing a quote can expose leading/trailing spaces.
# NOTE(review): the previous expression also contained a comma-to-space
# replacement, but a misplaced parenthesis attached it to the empty
# replacement string, so it never did anything. Commas are therefore kept
# (e.g. "2,000 DA"), which matches the data produced so far.
_chars_to_drop = str.maketrans("", "", "\"'\\")
lines = [line.translate(_chars_to_drop).strip() for line in lines]

# An odd number of lines means the last question has no answer. Indexing
# lines[i + 1] used to raise IndexError in that case; warn and ignore the
# unpaired line instead so the rest of the data is still usable.
if len(lines) % 2:
    print(f"Warning: ignoring unpaired trailing line: {lines[-1]!r}")
    lines = lines[:-1]

# Even positions hold the questions, odd positions the matching answers.
questions = lines[0::2]
reponses = lines[1::2]

# Build a DataFrame from the question/answer pairs.
df = pd.DataFrame({
    'prompt': questions,
    'response': reponses,
})

train_df = df
# Store the pairs as JSON Lines for fine-tuning; force_ascii=False keeps
# non-ASCII characters readable instead of \u-escaping them.
train_df.to_json('train.jsonl', orient='records', lines=True, force_ascii=False)
print(len(questions))
print(len(reponses))

df.head()


1227
1227


Unnamed: 0,prompt,response
0,What are the advantages of the Djezzy Legend 2...,"For just 2500 DA, youll receive 100 GB of data..."
1,What perks come with the Djezzy Legend 2 000 D...,"For only 2,000 DA, you get 70 GB of data, nati..."
2,What benefits does the Djezzy Legend 1 500 DA ...,"With the 1,500 DA plan, youll enjoy 40 GB of d..."
3,What advantages are included in the Djezzy Leg...,"For just 1,000 DA, you receive 15 GB of data, ..."
4,What merits does the Djezzy Legend 150 DA pack...,"At only 150 DA, you get unlimited access to Fa..."


# Charger le modèle Llama 2

In [None]:
# Log in to the Hugging Face Hub from the notebook.
# NOTE(review): presumably needed because the Llama 2 weights are gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Base model to fine-tune, as named on the Hugging Face Hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Instruction dataset to use
dataset_name = "/content/train.jsonl"

# Name of the fine-tuned model (also the directory the adapter is saved to)
new_model = "llama-2-7b-custom"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Load the base model in 4-bit precision
use_4bit = True

# Compute dtype for the 4-bit base model (resolved later with getattr(torch, ...))
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Enable nested (double) quantization for the 4-bit base model
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Directory where training outputs are written
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps; a positive value overrides num_train_epochs.
# -1 is the documented "disabled" value, so num_train_epochs drives training.
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 700

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Charger les ensembles de données et entraîner

In [None]:
# charger les données
from datasets import load_dataset

# Load the question/answer pairs from the JSON Lines file.
train_dataset = load_dataset('json', data_files='train.jsonl', split="train")

# System prompt placed in the <<SYS>> section of every training example.
system_message="Please provide a concise response to the question:"


def _add_text_column(examples):
    """Build the 'text' column for a batch: system prompt, question, then answer."""
    header = f'[INST] <<SYS>>\n{system_message.strip()}\n<</SYS>>\n\n'
    texts = []
    for prompt, response in zip(examples['prompt'], examples['response']):
        texts.append(header + prompt + ' [/INST] ' + response)
    return {'text': texts}


# Format every example, then shuffle the rows.
train_dataset_mapped = train_dataset.map(_add_text_column, batched=True).shuffle()
print(train_dataset_mapped)

# Resolve the configured dtype name to the actual torch dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# 4-bit quantization settings for loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# When running 4-bit with float16 compute, tell the user if the GPU's major
# compute capability is 8 or higher, where bf16 could be used instead.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        banner = "=" * 80
        print(banner)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print(banner)

# Load the quantized base model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the LLaMA tokenizer; the EOS token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# Right padding; the original author notes it avoids an overflow issue with fp16 training.
tokenizer.padding_side = "right"




Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'response', 'text'],
    num_rows: 1227
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Show one formatted training example to check the prompt template by eye.
print(train_dataset_mapped[1])

{'prompt': 'Quel est le coût des Écouteurs EW27 de HOCO ?', 'response': 'Les Écouteurs EW27 sont disponibles au prix de 3 400 DA.', 'text': "[INST] <<SYS>>\nvous etes un chatbot  de l'entreprise djezzy juste reponder au question qui ont une relation avec djezzy et repondre a la question poser ,ne donner pas des autre  information qui ne mentionne pas dans la question   .Tout autre sujet en lien avec Djezzy et ses activités. sinon repondre avec ce message :j'ai pas une idée. repondre a cette question en langue francaise\n<</SYS>>\n\nQuel est le coût des Écouteurs EW27 de HOCO ? [/INST] Les Écouteurs EW27 sont disponibles au prix de 3 400 DA."}


In [None]:


# Load the LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Define the training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Forward the configured flag so the value set with the other
    # hyperparameters actually reaches the trainer instead of being ignored.
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)

# Define the supervised fine-tuning trainer; it reads the "text" column of
# the formatted dataset and applies the LoRA configuration to the model.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model (written to the directory named by new_model)
trainer.model.save_pretrained(new_model)



Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,3.2069
50,1.5036
75,1.4082
100,1.1554
125,1.1864
150,1.0649
175,1.154
200,1.0067
225,1.1952
250,0.9489


In [None]:
# Empty VRAM before the base model is reloaded in FP16 for merging.
import gc

# Drop every notebook-level reference to the training model. The trainer
# also holds the model, so deleting `model` alone does not make it
# collectable. pop() with a default keeps this cell safe to re-run.
for _name in ("model", "trainer"):
    globals().pop(_name, None)

# Collect the now-unreachable objects, then hand PyTorch's cached CUDA
# blocks back to the driver; without empty_cache() they stay reserved.
gc.collect()
torch.cuda.empty_cache()

0

# Fusionner le modèle et le stocker dans Drive

In [None]:

from google.colab import drive

# Read the LoRA adapter from the location the training step saved it to
# (`new_model`) instead of repeating the path as a hard-coded absolute
# string, so the two cannot drift apart.
model_path = new_model

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, model_path)
# Fold the adapter weights into the base weights and drop the PEFT wrapper.
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 

Stocker le modèle dans notre Drive

In [None]:
# Mount Google Drive, then write the merged model and its tokenizer there.
from google.colab import drive

drive.mount('/content/drive')

# Destination folder inside the mounted Drive.
model_path = "/content/drive/MyDrive/llama-2-7b-custom"

# Save the merged model first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

KeyboardInterrupt: 

# Envoyer le modèle affiné sur le Hub Hugging Face

In [None]:
import locale
# Replace the lookup function so the preferred encoding is always reported as UTF-8.
# NOTE(review): presumably a workaround for Colab shell commands (`!...`) that
# require a UTF-8 locale — confirm.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!huggingface-cli login

model.push_to_hub("MoinFaisal/llama-2-7b-custom-djezzy", check_pr=True)

tokenizer.push_to_hub("MoinFaisal/llama-2-7b-custom-djezzy",check_pr=True)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MoinFaisal/llama-2-7b-custom/commit/8369caabf87e8fbb6588b1d071b1d9ee0f4070b2', commit_message='Upload tokenizer', commit_description='', oid='8369caabf87e8fbb6588b1d071b1d9ee0f4070b2', pr_url=None, pr_revision=None, pr_num=None)