In [None]:
!pip install datasets peft bitsandbytes
!pip install -U bitsandbytes

In [2]:
import pandas as pd
from datasets import load_dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer
from transformers import LlamaTokenizer, LlamaForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig

In [3]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)

In [4]:
from huggingface_hub import login

# Token de huggingface
login('')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
MODEL_NAME = 'meta-llama/Llama-3.2-3B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=quantization_config, device_map='auto')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Módulos a los que se aplicará LoRA
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

In [7]:
tokenizer.pad_token = tokenizer.eos_token

In [8]:
dataset = load_dataset('csv', data_files='q_a_db.csv')

In [29]:
dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'question', 'answer', 'category'],
        num_rows: 1143
    })
})

In [9]:
def tokenize_function(example):
    # Concatenar prompt y respuesta
    full_text = example['question'] + example['answer']

    # Tokenizar
    tokenized_example = tokenizer(
        full_text,
        truncation=True,
        padding='max_length',
        max_length=500,
    )

    # Crear etiquetas
    labels = tokenized_example['input_ids'].copy()

    # Calcular la longitud del prompt
    prompt_length = len(tokenizer(
        example['question'],
        add_special_tokens=False
    )['input_ids'])

    # Enmascarar los tokens del prompt
    labels[:prompt_length] = [-100] * prompt_length

    tokenized_example['labels'] = labels
    return tokenized_example


tokenized_dataset = dataset.map(tokenize_function, batched=False)


In [10]:
training_args = TrainingArguments(
    output_dir='./resultado_lora',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    num_train_epochs=5,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
10,3.2017
20,0.1154
30,0.0934
40,0.0864
50,0.0833
60,0.0825
70,0.0773
80,0.081


TrainOutput(global_step=85, training_loss=0.45400116724126477, metrics={'train_runtime': 1482.6649, 'train_samples_per_second': 3.855, 'train_steps_per_second': 0.057, 'total_flos': 4.6005539106816e+16, 'train_loss': 0.45400116724126477, 'epoch': 4.755244755244755})

In [12]:
model.save_pretrained('llama-3.2-3b-fine-tunnig')
tokenizer.save_pretrained('llama-3.2-3b-fine-tunnig')

('llama-3.2-3b-fine-tunnig/tokenizer_config.json',
 'llama-3.2-3b-fine-tunnig/special_tokens_map.json',
 'llama-3.2-3b-fine-tunnig/tokenizer.json')

### Evaluación

In [13]:
from peft import PeftModel
import torch

In [18]:
model_name = './llama-3.2-3b-fine-tunnig/' #'meta-llama/Llama-3.2-3B-Instruct' #

# Cargar el tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configuración de cuantización
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)

# Cargar el modelo con cuantización
model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map='auto'
)

# # Carga
# model = PeftModel.from_pretrained(model, model_name)

model.eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear8bitLt(
            (base_layer): Linear8bitLt(in_features=3072, out_features=3072, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=3072, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=3072, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear8bitLt(in_features=3072, out_features=1024, bias=False)
          (v_proj): lora.Linear8bitLt(
            (base_layer): Linear8bitLt(in_

In [22]:
# Prompt de entrada
prompt = """Qué necesito para un sistema de pagos en México?"""

# Tokenizar
inputs = tokenizer(prompt, return_tensors='pt')

# Mover los tensores
inputs = {key: value.to(model.device) for key, value in inputs.items()}

# Generar la respuesta
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=1000,    # Número máximo de tokens a generar
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2
    )

# Decodificar
respuesta = tokenizer.decode(output[0], skip_special_tokens=True)

# Extraer
respuesta_generada = respuesta[len(prompt):].strip()

print("Respuesta del modelo:")
print(respuesta_generada)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Respuesta del modelo:
Licencia del Banco de México y autorización de la Comisión Nacional Bancaria.
