### Preparation du dataset


In [1]:
import pandas as pd
from datasets import Dataset

# Charger le dataset CSV
csv_path = "data/train.csv" 
df = pd.read_csv(csv_path)

# Convertir en Dataset Hugging Face
dataset = Dataset.from_pandas(df)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(dataset[0])  # Affiche le premier exemple


{'answer': 'Yes, you can format the output text in Bash to make it bold. Bash allows you to use special escape sequences for text decoration. To make some text bold in bash, you would use the escape sequence `\\033[1m`, and to reset the formatting, you would use `\\033[0m`. \r\n\r\nHere\'s how you can update your `echo` statement to print bold text:\r\n\r\n```bash\r\necho -e "\\033[1mSome Text\\033[0m"\r\n```\r\n\r\nIn this code:\r\n\r\n- The `-e` option of `echo` allows the interpretation of backslash escapes.\r\n- The `\\033[1m` sequence sets the text to be bold.\r\n- The `Some Text` part is the actual text that will be printed in bold.\r\n- The `\\033[0m` sequence resets the text formatting to the default, so anything printed afterwards will be in the default format.\r\n\r\nRemember that these escape sequences may not work in all terminals and circumstances, but they should work in most common situations.', 'question': 'How can I output bold text in Bash? I have a Bash script that p

### Préparer la tokenisation

In [3]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Charger le dataset (remplacez par votre méthode de chargement si nécessaire)
# Exemple : Dataset avec les colonnes "question" et "answer"
dataset = load_dataset("csv", data_files="./data/train.csv")["train"]

# Charger le tokenizer du modèle LLaMA
tokenizer = AutoTokenizer.from_pretrained("C:\\Users\\DELL\\Desktop\\nlp\\llama-model", use_fast=False)

# Vérifiez si le tokenizer a un token de padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Utiliser le token eos comme pad_token

# Fonction de prétraitement pour la tokenisation
def preprocess_function(examples):
    # Validation des questions et réponses
    inputs = [
        f"Question: {q} Contexte: Réponse:" if q else "Question: [VIDE] Contexte: Réponse:"
        for q in examples["question"]
    ]
    targets = [a if a else "[VIDE]" for a in examples["answer"]]  # Placeholder pour réponses vides

    # Tokenisation des entrées
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenisation des cibles
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")["input_ids"]

    # Ajouter les labels
    model_inputs["labels"] = labels
    return model_inputs

# Supprimer les exemples invalides (questions ou réponses vides)
dataset = dataset.filter(lambda x: x["question"] and x["answer"])

# Appliquer la tokenisation au dataset complet
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Afficher un exemple tokenisé pour vérification
print(tokenized_datasets[0])


{'answer': 'Yes, you can format the output text in Bash to make it bold. Bash allows you to use special escape sequences for text decoration. To make some text bold in bash, you would use the escape sequence `\\033[1m`, and to reset the formatting, you would use `\\033[0m`. \r\n\r\nHere\'s how you can update your `echo` statement to print bold text:\r\n\r\n```bash\r\necho -e "\\033[1mSome Text\\033[0m"\r\n```\r\n\r\nIn this code:\r\n\r\n- The `-e` option of `echo` allows the interpretation of backslash escapes.\r\n- The `\\033[1m` sequence sets the text to be bold.\r\n- The `Some Text` part is the actual text that will be printed in bold.\r\n- The `\\033[0m` sequence resets the text formatting to the default, so anything printed afterwards will be in the default format.\r\n\r\nRemember that these escape sequences may not work in all terminals and circumstances, but they should work in most common situations.', 'question': 'How can I output bold text in Bash? I have a Bash script that p

### Charger le modéle 

In [4]:
from transformers import AutoModelForCausalLM

# Charger le modèle
model = AutoModelForCausalLM.from_pretrained("./llama-model")


Loading checkpoint shards: 100%|██████████| 2/2 [00:34<00:00, 17.41s/it]


### 4. Configuration des paramètres d'entraînement


In [5]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./llama_qa_finetuned",  # Répertoire où enregistrer les checkpoints
    evaluation_strategy="steps",       # Évaluer après un certain nombre d'étapes
    eval_steps=500,                    
    learning_rate=5e-5,                # Taux d'apprentissage
    per_device_train_batch_size=4,     # Taille des lots d'entraînement
    per_device_eval_batch_size=4,      # Taille des lots d'évaluation
    num_train_epochs=3,                # Nombre d'époques
    save_steps=1000,                   # Sauvegarder le modèle tous les 1000 steps
    save_total_limit=2,                # Limiter à 2 sauvegardes
    logging_dir="./logs",              # Répertoire pour les logs
    logging_steps=10,                  # Fréquence de logging
    fp16=True,                         # Précision mixte pour les GPU
)







### Configurer le trainer 

In [6]:
from transformers import Trainer, TrainingArguments
from datasets import DatasetDict

# Diviser le dataset en train et validation
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

# Renommer "test" en "validation" pour Trainer
tokenized_datasets = DatasetDict({
    "train": tokenized_datasets["train"],
    "validation": tokenized_datasets["test"]
})

# Supprimer les colonnes inutiles si elles existent
tokenized_datasets = tokenized_datasets.map(lambda examples: {k: examples[k] for k in ["input_ids", "attention_mask", "labels"]})

# Définir les arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500
)

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # Dataset d'entraînement
    eval_dataset=tokenized_datasets["validation"],  # Dataset d'évaluation
    tokenizer=tokenizer,
)


Map: 100%|██████████| 108886/108886 [02:17<00:00, 792.10 examples/s] 
Map: 100%|██████████| 27222/27222 [00:29<00:00, 935.91 examples/s] 
  trainer = Trainer(


In [7]:
#AFFICHER LES DATASETS
print(tokenized_datasets["train"][0])  # Vérifiez les colonnes
print(tokenized_datasets["validation"][0])  # Vérifiez les colonnes


{'answer': "Sure! Here's an example code that solves your problem:\r\n\r\n```python\r\ndef largestFibonacciNumber(n):\r\n    # Initialize the starting two fibonacci numbers\r\n    a = 0\r\n    b = 1\r\n    \r\n    # Iterate over every index\r\n    while b <  n:\r\n        temp = a\r\n        a = b\r\n        b = temp + b \r\n    \r\n    return a\r\n\r\n# Test Cases \r\nprint(largestFibonacciNumber(10)) \r\n# output: 8 \r\n\r\nprint(largestFibonacciNumber(50)) \r\n# output: 34\r\n```\r\n\r\nIn this code, we define a function called `largestFibonacciNumber` that takes an integer `n` as input. It initializes two variables `a` and `b` to represent the first and second Fibonacci numbers respectively, which are 0 and 1.\r\n\r\nThen, using a `while` loop, we iterate while `b` is less than the given input `n`. Inside the loop, we update the values of `a` and `b` to be the next Fibonacci number by swapping `a` with `b` and updating `b` to the sum of the previous `a` and `b`.\r\n\r\nFinally, whe

: 

In [8]:
#lancer l'entrainement 
model.to("cpu")

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33manisbenini[0m ([33manisbenini-universit-mouloud-mammeri-de-tizi-ouzou[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/40833 [00:00<?, ?it/s]

### Saving

In [None]:
trainer.save_model("./llama_qa_finetuned")

### Testing the modele

In [None]:
from transformers import pipeline

# Charger le modèle fine-tuné
qa_pipeline = pipeline("text2text-generation", model="./llama_qa_finetuned", tokenizer=tokenizer)

# Tester avec une question
question = "Comment formater du texte en gras dans Bash ?"
context = ""  # Ajoutez un contexte si nécessaire
input_text = f"Question: {question} Contexte: Réponse:"
response = qa_pipeline(input_text, max_length=100)

print("Réponse générée:", response[0]["generated_text"])
