* #### **Hay que usar GPU**

* #### **datasets: para convertir los datos al formato `Dataset` de Hugging Face**

* #### **accelerate: acelera el entrenamiento de los modelos**

Datos de Kaggle:

https://www.kaggle.com/datasets/gondimalladeepesh/nvidia-documentation-question-and-answer-pairs

In [None]:
!pip install datasets accelerate --quiet

In [None]:
# Turn off Weights & Biases (WandB) logging, which Trainer would otherwise enable.

import os
os.environ.update(WANDB_DISABLED="true")


In [None]:
from huggingface_hub import login
from google.colab import userdata

In [None]:
# Log in to the Hugging Face Hub with the token stored in Colab's user data
# under the name 'miHuggingFace', so the token itself never appears in the notebook.
login(token=userdata.get('miHuggingFace'))

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
import torch
import re

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [None]:
# Load the Q&A pairs and keep only the two text columns used for fine-tuning.
data = pd.read_csv('NvidiaDocumentationQandApairs.csv').loc[:, ["question", "answer"]]

# Sanity check: print the dimensions, then show the first rows as the cell output.
print(data.shape)
data.head()

(7108, 2)


Unnamed: 0,question,answer
0,What is Hybridizer?,Hybridizer is a compiler from Altimesh that en...
1,How does Hybridizer generate optimized code?,Hybridizer uses decorated symbols to express p...
2,What are some parallelization patterns mention...,The text mentions using parallelization patter...
3,How can you benefit from accelerators without ...,You can benefit from accelerators' compute hor...
4,What is an example of using Hybridizer?,An example in the text demonstrates using Para...


In [None]:
# Light clean-up of the text data:

def clean_text(text):
    """Normalize a string: lowercase it, strip URLs, and reduce punctuation to spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The lowercased text with ``http...``/``www...`` tokens removed and every
        run of non-alphanumeric characters replaced by a single space. Leading
        or trailing spaces produced by that replacement are kept.
    """
    cleaned = text.lower()
    # Remove URL-like tokens first so their pieces do not survive as stray words.
    for url_pattern in (r"http\S+", r"www\S+"):
        cleaned = re.sub(url_pattern, "", cleaned)
    # Whatever is not a letter or digit (punctuation, whitespace runs) becomes one space.
    return re.sub("[^A-Za-z0-9]+", " ", cleaned)

# Clean both text columns in place with the same normalization.
for column in ("question", "answer"):
    data[column] = data[column].apply(clean_text)

In [None]:
data.head()

Unnamed: 0,question,answer
0,what is hybridizer,hybridizer is a compiler from altimesh that en...
1,how does hybridizer generate optimized code,hybridizer uses decorated symbols to express p...
2,what are some parallelization patterns mention...,the text mentions using parallelization patter...
3,how can you benefit from accelerators without ...,you can benefit from accelerators compute hors...
4,what is an example of using hybridizer,an example in the text demonstrates using para...


In [None]:
# Split into Train (70%), Val (15%) and Test (15%):

train = data.sample(frac=0.7, random_state=17)  # 70% of the rows go to Train

# The rows not drawn for Train are shared evenly between Val and Test.
holdout = data.drop(train.index)
val = holdout.sample(frac=0.5, random_state=17)  # half of the remainder -> Val
test = holdout.drop(val.index)                   # what is left -> Test

print('Dimensiones de la partición:')
for split_name, split_frame in (('Train:', train), ('Val', val), ('Test', test)):
    print(split_name, split_frame.shape)

Dimensiones de la partición:
Train: (4976, 2)
Val (1066, 2)
Test (1066, 2)


In [None]:
# Checkpoint to fine-tune.
llm_model = "google/flan-t5-base"

# Tokenizer and seq2seq model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(llm_model)

# Weights are loaded in bfloat16 (torch.float32 is the other option considered).
# NOTE(review): fine-tuning directly on bfloat16 weights can lose precision in the
# optimizer updates — confirm this is intended rather than float32 weights.
auto_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model, torch_dtype=torch.bfloat16)

In [None]:
def tokenize_function(example):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Meant to be used with ``Dataset.map(..., batched=True)``, so
    ``example["question"]`` and ``example["answer"]`` are lists of strings.
    Each question is wrapped in a fixed prompt before tokenization.

    Args:
        example: Batch with ``"question"`` and ``"answer"`` columns.

    Returns:
        The same batch with three columns added, each padded/truncated to 200 tokens:
        ``input_ids`` and ``attention_mask`` for the prompt, and ``labels`` for
        the answer with padding positions set to -100.
    """
    start_prompt = 'For this question\n\n'
    end_prompt = '\nThe answer is:\n\n'

    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["question"]]

    # Plain Python lists are enough here; ``Dataset.map`` stores them as columns.
    model_inputs = tokenizer(prompt, padding="max_length", truncation=True, max_length=200)
    targets = tokenizer(example["answer"], padding="max_length", truncation=True, max_length=200)

    example['input_ids'] = model_inputs["input_ids"]
    # Without the mask the model would attend to the padding that fills every
    # input up to max_length.
    example['attention_mask'] = model_inputs["attention_mask"]

    # Padding in the labels must be -100 so the loss ignores it; otherwise the
    # model is mostly trained (and evaluated) on predicting pad tokens.
    pad_id = tokenizer.pad_token_id
    example['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]

    return example

In [None]:
# Scratch run: tokenize only the training split so the result can be inspected
# in the following cells before the train/val/test datasets are built.
tmp = Dataset.from_pandas(train)
tt = tmp.map(tokenize_function, batched=True)

Map:   0%|          | 0/4976 [00:00<?, ? examples/s]

In [None]:
# Display the dataset summary (column names and row count) after tokenization.
tt

Dataset({
    features: ['question', 'answer', '__index_level_0__', 'input_ids', 'labels'],
    num_rows: 4976
})

In [None]:
# First question of the training split, as cleaned text.
tt['question'][0]

'how does the performance of different reduction algorithms compare on the kepler gpu architecture '

In [None]:
# First seven input token ids of example 0; the sequence begins with the fixed
# prompt prefix that tokenize_function prepends to every question.
tt['input_ids'][0][0:7]

[2150, 12, 8, 826, 822, 149, 405]

In [None]:
# Same slice for example 10, to compare against example 0.
tt['input_ids'][10][0:7]

[2150, 12, 8, 826, 822, 149, 54]

In [None]:
def _to_tokenized_dataset(frame):
    """Convert a pandas split into a Dataset and its tokenized, text-free counterpart.

    Args:
        frame: DataFrame holding the ``question`` and ``answer`` columns of one split.

    Returns:
        A ``(dataset, tokenized)`` pair: the Dataset built from ``frame``, and the
        result of mapping ``tokenize_function`` over it with the raw text columns
        and the pandas index column removed.
    """
    dataset = Dataset.from_pandas(frame)
    tokenized = dataset.map(tokenize_function, batched=True)
    return dataset, tokenized.remove_columns(['question', 'answer', '__index_level_0__'])


# Build the three splits with the same pipeline.
train_data, train_tokenized_datasets = _to_tokenized_dataset(train)
val_data, val_tokenized_datasets = _to_tokenized_dataset(val)
test_data, test_tokenized_datasets = _to_tokenized_dataset(test)

Map:   0%|          | 0/4976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
# Display the training dataset after the raw text and index columns were removed.
train_tokenized_datasets

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 4976
})

### **Entrenamiento (Training)**

In [None]:
# Hyperparameters of the fine-tuning run.
EPOCHS = 2        # number of training epochs
LR = 1e-3         # learning rate
BATCH_SIZE = 2    # per-device batch size for both training and evaluation

# Output locations: training checkpoints and the final exported model.
training_path = './training_nvidia_chatbot'
model_path = "./nvidia-chatbot-final-model"

training_args = TrainingArguments(
    output_dir=training_path,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",   # run validation at the end of every epoch
    save_total_limit=2,      # total number of checkpoints to keep
    report_to="none",        # keeps Hugging Face from using WandB
)

trainer = Trainer(
    args=training_args,
    model=auto_model,
    train_dataset=train_tokenized_datasets,
    eval_dataset=val_tokenized_datasets,
)

trainer.train()

# Save the fine-tuned model and the tokenizer into the same folder.
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Epoch,Training Loss,Validation Loss
1,0.4829,0.429522


In [None]:
# Evaluate the fine-tuned model on the held-out test split and keep the results.
eval_results= trainer.evaluate(eval_dataset = test_tokenized_datasets)

In [None]:
# Show the evaluation results obtained on the test split.
print(eval_results)