## Notebook Login 

In [1]:
import getpass
import os

from huggingface_hub import login

# Take the Hugging Face token from the HF_TOKEN environment variable, falling
# back to an interactive prompt, so the secret is never stored in the notebook.
# `hf_token` stays a notebook-level name because later cells pass it to
# `from_pretrained(..., token=hf_token)`.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm


## Inference on pretrained model 

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint used for the baseline (pre-fine-tuning) generation below.
# Alternative checkpoint left by the author: "google/gemma-2b".
model_name = "meta-llama/Llama-2-7b-hf"

# No torch_dtype is passed, so both objects are created with the library defaults.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:53<00:00, 26.69s/it]


In [4]:
# Prompt in the "Instruction / Input" layout; the model is expected to continue it.
input_text = "Instruction:\nEres un asistente virtual de cobranza, llamas de parte de SMARTIA por encargo del Banco Falabela para gestionar cobranza\n\nInput:\nusuario: inicio_llamada"

# NOTE: despite its name, `input_ids` holds everything the tokenizer returns
# (it is unpacked as keyword arguments below), not only the id tensor.
input_ids = tokenizer(input_text, return_tensors="pt")
# NOTE(review): in transformers, max_length is understood to count the prompt
# tokens as well as the generated ones — confirm against the generation docs.
outputs = model.generate(**input_ids, max_length=128)
# Decode the first (only) returned sequence, special tokens included.
print(tokenizer.decode(outputs[0]))

<s> Instruction:
Eres un asistente virtual de cobranza, llamas de parte de SMARTIA por encargo del Banco Falabela para gestionar cobranza

Input:
usuario: inicio_llamada

Output:


</s>


## PEFT

### Imports

In [5]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer


### Create Dataset

In [8]:
from datasets import load_dataset

# Load the "train" split of the Dolly-15k dataset.
dataset_name = "databricks/databricks-dolly-15k"
dataset = load_dataset(dataset_name, split="train")

In [9]:
from collections import defaultdict

# Count how many examples fall into each category.
# The dataset is iterated directly: no index is needed, and binding one to
# `__` would shadow the name IPython uses for its output history.
categries_count = defaultdict(int)
for data in dataset:
    categries_count[data['category']] += 1
print(categries_count)

defaultdict(<class 'int'>, {'closed_qa': 1773, 'classification': 2136, 'open_qa': 3742, 'information_extraction': 1506, 'brainstorming': 1766, 'general_qa': 2191, 'summarization': 1188, 'creative_writing': 709})


In [10]:
# Keep only the examples that have NO context and render each of them as a
# single "Instruction / Response" training string under the "text" key.
filtered_dataset = [
    {"text": f"Instruction:\n{data['instruction']}\n\nResponse:\n{data['response']}"}
    for data in dataset
    if not data["context"]
]

print(filtered_dataset[0:2])

[{'text': 'Instruction:\nWhich is a species of fish? Tope or Rope\n\nResponse:\nTope'}, {'text': 'Instruction:\nWhy can camels survive for long without water?\n\nResponse:\nCamels use the fat in their humps to keep them filled with energy and hydration for long periods of time.'}]


In [11]:
# Persist the filtered examples as JSON Lines: one {"text": ...} record per line.
import jsonlines as jl

with jl.open('dolly-mini-train.jsonl', mode='w') as writer:
    writer.write_all(filtered_dataset)

In [12]:
from datasets import load_dataset

# Load the first 1000 rows of the "train" split of "ai-bites/databricks-mini".
# NOTE: the next cell rebinds `dataset` to a local JSONL file.
dataset_name = "ai-bites/databricks-mini"
dataset = load_dataset(dataset_name, split="train[0:1000]")
# Bare expression: the notebook renders the Dataset summary as the cell output.
dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [None]:
from datasets import Dataset

# Path to the converted JSONL file
dataset_path = "converted_dataset.jsonl"

# Load the dataset from the JSONL file (this rebinds `dataset`)
dataset = Dataset.from_json(dataset_path)

# Inspect one example of the loaded dataset
print(dataset[0])  # Shows the first example

{'text': 'Instruction:\nEres un asistente virtual de cobranza, llamas de parte de SMARTIA por encargo del Banco Falabela para gestionar cobranza\n\nInput:\nusuario: inicio_llamada\n\nResponse:\nBuenas tardes, soy su agente virtual de falabela. ¿Hablo con María González?'}


In [7]:
# Bare expression: renders the Dataset summary (features, row count) as the cell output.
dataset

Dataset({
    features: ['text'],
    num_rows: 372
})

### Define parameters 

In [8]:
# Central configuration cell: every tunable used by the loading, LoRA and
# training cells below is defined here as a notebook-level variable.

# Base checkpoint to fine-tune, and the name under which the result is saved
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "Llama-2-7b-chat-hf-ft"

################################################################################
# LoRA parameters
################################################################################
# LoRA attention dimension (rank)
# lora_r = 64
lora_r = 16
# Alpha parameter for LoRA scaling
lora_alpha = 32
# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading (switched off here)
use_4bit = False
# Compute dtype for 4-bit base models; resolved later with getattr(torch, ...)
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
# NOTE(review): not referenced by any cell visible in this notebook — confirm.
bnb_4bit_quant_type = None
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 1
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False
# Batch size per GPU for training
per_device_train_batch_size = 8
# Batch size per GPU for evaluation
# NOTE(review): not passed to TrainingArguments in the cells visible here.
per_device_eval_batch_size = 8
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1
# Enable gradient checkpointing
# NOTE(review): not passed to TrainingArguments in the cells visible here.
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True
# Save checkpoint every X updates steps
save_steps = 25
# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################
# Maximum sequence length to use
# NOTE(review): neither max_seq_length nor packing is passed to SFTTrainer in
# the cells visible here, so these two values may have no effect — confirm.
max_seq_length = 40 # None
# Pack multiple short examples in the same input sequence to increase efficiency
packing =  False
# Load the entire model on the GPU 0
# device_map = {"": 0}
# NOTE(review): device_map is only mentioned in a commented-out line further down.
device_map="auto"


In [9]:
# Build the bitsandbytes quantization config from the parameters cell.
# Resolve the dtype name (e.g. "float16") to the matching torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# NOTE(review): `bnb_4bit_quant_type` from the parameters cell is not forwarded here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit, # Activates 4-bit precision loading when True
    bnb_4bit_compute_dtype=compute_dtype, # torch dtype resolved above
    bnb_4bit_use_double_quant=use_nested_quant, # nested (double) quantization flag
)

In [10]:
# Check GPU compatibility with bfloat16.
# Only applies to the 4-bit path with a float16 compute dtype, and only when a
# CUDA device is actually present (get_device_capability needs one).
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    # Index the tuple instead of unpacking into `_`, which IPython reserves
    # for the last cell output.
    major = torch.cuda.get_device_capability()[0]
    if major >= 8:
        print("Setting BF16 to True")
        bf16 = True
        # TrainingArguments accepts at most one of fp16/bf16, so switching
        # bf16 on must switch fp16 off.
        fp16 = False
    else:
        bf16 = False


### Load model and tokenizer

In [11]:
# Tokenizer for the checkpoint being fine-tuned; EOS doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Base model, loaded with the bitsandbytes config built above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=hf_token,
)
model.config.pretraining_tp = 1
model.config.use_cache = False


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [02:03<00:00, 61.66s/it]


In [12]:
# Build the LoRA configuration from the values in the parameters cell.
# NOTE(review): "act_proj" is listed next to the attention (q/k/v/o) and MLP
# (gate/up/down) projections; confirm that the loaded model really has a
# module with that name, otherwise the entry is dead.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="all",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj", "act_proj"]
)

In [13]:
# Set training parameters from the values in the parameters cell.
# NOTE: `gradient_checkpointing` and `per_device_eval_batch_size` are defined
# in the parameters cell but are not forwarded in this call.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)
# Bare expression: shows the full, resolved argument set as the cell output.
training_arguments

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=F

In [14]:

# Create the supervised fine-tuning trainer from the model, dataset, LoRA
# config, tokenizer and training arguments built in the previous cells.
# NOTE: `max_seq_length` and `packing` from the parameters cell are not passed here.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
)


  trainer = SFTTrainer(


### Training


In [22]:
# Train the model, then save `trainer.model` under the name held in `new_model`.
# NOTE(review): since a peft_config was given, this presumably writes only the
# LoRA adapter rather than the full model — confirm.
trainer.train()
trainer.model.save_pretrained(new_model)

Step,Training Loss
25,1.2599


# Parameter tuning 

In [18]:
import itertools
import json

# Define the parameter grid: every value of each key is combined with every
# value of the others (3 * 2 * 2 * 3 = 36 configurations).
param_grid = {
    "num_train_epochs":[1,3,5],  # Training epochs per run
    "lora_alpha":[32,64],  # LoRA scaling factor
    "lora_r": [32, 64],          # LoRA attention dimension
    "bias": ["none", "all", "lora_only"],  # Different bias configurations
}

In [19]:
# Create all combinations of parameters; each tuple follows the key order of param_grid
param_combinations = list(itertools.product(*param_grid.values()))

# File name intended for the results, and the list that accumulates one entry per run.
# NOTE: the test loop below writes to "llamatestFalabella.json", not to `results_file`.
results_file = "lora_training_results.json"
results = []

### Prompt model function 

In [20]:
def prompt_model(input, ftmodel, max_new_tokens=50):
    """Generate a completion for a prompt with the base model plus a LoRA adapter.

    The base checkpoint named by the notebook-level `model_name` is loaded in
    float16, the adapter stored at `ftmodel` is merged into it, and the merged
    model continues the prompt. Everything is reloaded on every call.

    Args:
        input: Prompt text to continue.
        ftmodel: Path or identifier of the saved PEFT adapter.
        max_new_tokens: Maximum number of tokens generated after the prompt.

    Returns:
        The decoded first output sequence with special tokens removed.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=False,
        return_dict=True,
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(base_model, ftmodel)
    model = model.merge_and_unload().to(device)

    # Tokenizer configured the same way as for training (EOS as pad, right padding).
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    encoded = tokenizer(input, return_tensors="pt").to(device)
    # Budget the continuation with max_new_tokens rather than max_length:
    # max_length also counts the prompt tokens, so a fixed cap of 50 leaves
    # little or no room to generate for prompts of that size or longer.
    outputs = model.generate(**encoded, max_new_tokens=max_new_tokens)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

### Parameter test loop 

In [21]:
import json
from transformers import pipeline
import gc
import torch 

# Evaluation prompts: the same collection call replayed up to successive user
# turns. They are built from one shared header and one list of turns so that
# each prompt contains the instruction block exactly once.
_prompt_header = (
    "Instruction:\nEres un asistente virtual de cobranza, llamas de parte de SMARTIA "
    "por encargo del Banco Falabela para gestionar cobranza\n\nInput:\n"
)

_conversation_turns = [
    "usuario: inicio_llamada",
    "asistente: Buenas tardes, soy su agente virtual de falabela. ¿Hablo con María González?",
    "usuario: Hola, buenas tardes. Sí, yo soy María. ¿En qué puedo ayudarte?",
    "asistente: Lo estoy llamando de SMARTIA por encargo de falabela. Por su seguridad, esta conversación podría ser grabada. El motivo de mi llamada es para informarle que al día de hoy usted mantiene una deuda por un monto de 150000 pesos. El plazo para regularizar esta deuda es hasta el 15 de noviembre del 2024, ¿está de acuerdo?",
    "usuario: Sí, estoy al tanto de la deuda, pero la verdad es que he tenido algunos problemas recientemente y no he podido hacer los pagos. ¿Podrías enviarme más detalles de la deuda por correo electrónico para que pueda revisarlo más tarde? Ahora mismo estoy un poco ocupada.",
    "asistente: Disculpe, solo puedo entregar información al titular. ¿Podría confirmarme si hablo con María González?",
    "usuario: Sí, soy yo, pero como te mencioné antes, estoy en medio de algo importante ahora mismo. ¿Podrías llamarme más tarde o enviarme la información por correo? De esa manera podré revisarlo con calma.",
    "asistente: Lamento informarle que no puedo agendar llamadas en un horario específico. Si no puede hablar ahora, podemos intentar contactarlo otro día. ¿Le gustaría que lo llamemos en otra ocasión?",
    "usuario: Sí, me parece bien que lo intenten otro día. Esta semana está siendo un poco complicada para mí. Agradezco tu comprensión.",
]

# Every prompt stops right after a user turn (turns 1, 3, 5, 7 and 9), so the
# model is always asked for the assistant's next reply.
prompts = [
    _prompt_header + "\n".join(_conversation_turns[:turn_count])
    for turn_count in range(1, len(_conversation_turns) + 1, 2)
]

def _free_cuda_memory():
    """Collect unreachable Python objects first, then return cached CUDA blocks."""
    gc.collect()
    torch.cuda.empty_cache()


# Fine-tune once per hyper-parameter combination and record, for each run,
# either the generations for every prompt or the error that aborted it.
for i, param_set in enumerate(param_combinations):
    print(f"Running configuration {i + 1}/{len(param_combinations)}...")

    # Map the positional combination back onto the grid's parameter names.
    params = dict(zip(param_grid.keys(), param_set))
    print(f"Parameters: {params}")

    # Override the notebook-level defaults with this combination's values.
    num_train_epochs = params["num_train_epochs"]
    lora_r = params["lora_r"]
    bias = params["bias"]
    lora_alpha = params["lora_alpha"]

    # Release whatever the previous run (or an earlier cell) left behind before
    # allocating anything new. The trainer adds the LoRA layers to the model
    # object it is given, so reusing that object would carry adapters and GPU
    # memory from one configuration into the next.
    trainer = None
    model = None
    _free_cuda_memory()

    try:
        # Fresh base model, loaded the same way as in the model-loading cell.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            quantization_config=bnb_config,
        )
        model.config.use_cache = False
        model.config.pretraining_tp = 1

        # Set training parameters
        training_arguments = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            save_steps=save_steps,
            logging_steps=logging_steps,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            fp16=fp16,
            bf16=bf16,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=group_by_length,
            lr_scheduler_type=lr_scheduler_type,
            report_to="tensorboard",
        )

        # Load LoRA configuration
        peft_config = LoraConfig(
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            r=lora_r,
            bias=bias,
            task_type="CAUSAL_LM",
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj", "act_proj"]
        )

        # Set supervised fine-tuning parameters. Built inside the try block so
        # that a failure here is recorded like any other failed configuration.
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            peft_config=peft_config,
            tokenizer=tokenizer,
            args=training_arguments,
        )

        # Train model
        trainer.train()
        trainer.model.save_pretrained(new_model)

        # The adapter is on disk now; drop the training model and optimizer
        # state so prompt_model has room to load its own copy of the base model.
        trainer = None
        model = None
        _free_cuda_memory()

        prompt_results = {}
        for prompt in prompts:
            # prompt_model reloads the base model for every single prompt.
            prompt_results[prompt] = prompt_model(prompt, new_model)

        # Collect results
        results.append({
            "config_id": i,
            "parameters": params,
            "results": prompt_results,
        })

    except Exception as e:
        print(f"Error for configuration {i}: {e}")
        results.append({
            "config_id": i,
            "parameters": params,
            "error": str(e),
        })

    finally:
        # Clear GPU memory: drop the references first, then collect, then
        # empty the CUDA cache (the cache can only release unreferenced blocks).
        trainer = None
        model = None
        _free_cuda_memory()

    # Save results after each configuration
    with open("llamatestFalabella.json", "w") as f:
        json.dump(results, f, indent=4)

Running configuration 1/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 32, 'lora_r': 32, 'bias': 'none'}


  trainer = SFTTrainer(


Error for configuration 0: CUDA out of memory. Tried to allocate 344.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 338.12 MiB is free. Including non-PyTorch memory, this process has 78.80 GiB memory in use. Of the allocated memory 76.26 GiB is allocated by PyTorch, and 2.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 2/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 32, 'lora_r': 32, 'bias': 'all'}


  trainer = SFTTrainer(
Map: 100%|██████████| 372/372 [00:00<00:00, 4311.75 examples/s]


Error for configuration 1: CUDA out of memory. Tried to allocate 344.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 338.12 MiB is free. Including non-PyTorch memory, this process has 78.80 GiB memory in use. Of the allocated memory 76.26 GiB is allocated by PyTorch, and 2.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 3/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 32, 'lora_r': 32, 'bias': 'lora_only'}


  trainer = SFTTrainer(
Map: 100%|██████████| 372/372 [00:00<00:00, 7388.22 examples/s]


Error for configuration 2: CUDA out of memory. Tried to allocate 344.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 338.12 MiB is free. Including non-PyTorch memory, this process has 78.80 GiB memory in use. Of the allocated memory 76.26 GiB is allocated by PyTorch, and 2.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 4/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 32, 'lora_r': 64, 'bias': 'none'}


  trainer = SFTTrainer(


Error for configuration 3: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 84.12 MiB is free. Including non-PyTorch memory, this process has 79.05 GiB memory in use. Of the allocated memory 76.90 GiB is allocated by PyTorch, and 1.66 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 5/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 32, 'lora_r': 64, 'bias': 'all'}


  trainer = SFTTrainer(


Error for configuration 4: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 84.12 MiB is free. Including non-PyTorch memory, this process has 79.05 GiB memory in use. Of the allocated memory 76.90 GiB is allocated by PyTorch, and 1.66 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 6/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 32, 'lora_r': 64, 'bias': 'lora_only'}


  trainer = SFTTrainer(


Error for configuration 5: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 84.12 MiB is free. Including non-PyTorch memory, this process has 79.05 GiB memory in use. Of the allocated memory 76.90 GiB is allocated by PyTorch, and 1.66 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 7/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 64, 'lora_r': 32, 'bias': 'none'}


  trainer = SFTTrainer(


Error for configuration 6: CUDA out of memory. Tried to allocate 344.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 190.12 MiB is free. Including non-PyTorch memory, this process has 78.95 GiB memory in use. Of the allocated memory 76.43 GiB is allocated by PyTorch, and 2.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 8/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 64, 'lora_r': 32, 'bias': 'all'}


  trainer = SFTTrainer(


Error for configuration 7: CUDA out of memory. Tried to allocate 344.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 190.12 MiB is free. Including non-PyTorch memory, this process has 78.95 GiB memory in use. Of the allocated memory 76.43 GiB is allocated by PyTorch, and 2.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 9/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 64, 'lora_r': 32, 'bias': 'lora_only'}


  trainer = SFTTrainer(


Error for configuration 8: CUDA out of memory. Tried to allocate 344.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 338.12 MiB is free. Including non-PyTorch memory, this process has 78.80 GiB memory in use. Of the allocated memory 76.26 GiB is allocated by PyTorch, and 2.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 10/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 64, 'lora_r': 64, 'bias': 'none'}


  trainer = SFTTrainer(


Error for configuration 9: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 84.12 MiB is free. Including non-PyTorch memory, this process has 79.05 GiB memory in use. Of the allocated memory 76.90 GiB is allocated by PyTorch, and 1.66 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Running configuration 11/36...
Parameters: {'num_train_epochs': 1, 'lora_alpha': 64, 'lora_r': 64, 'bias': 'all'}


  trainer = SFTTrainer(


KeyboardInterrupt: 

# Prompt Fine Tuned model 

In [39]:
# Prompt for the fine-tuned model: system instructions, the customer parameters,
# the call-flow definition, and the conversation so far. It ends on an open
# "# assistant: " turn that the model is expected to complete.
# Every <PLACEHOLDER> used in the flow must be declared in <parametros>.
input_text = """# system:
Eres un asistente virtual para el Banco Falabela, especializado en procesos de cobranza. Sigues un diagrama de flujo estructurado para comunicarte con los clientes. Tu objetivo es guiar la conversación basándote en las etapas del flujo a continuación. Sé flexible para adaptarte si el usuario se desvía del flujo, pero siempre intenta regresar al proceso principal.

En el flujo se definirán opciones, considera detectar las intenciones del usuario y responder de acuerdo con las opciones disponibles. Si el usuario se desvía, intenta redirigirlo al flujo principal. Recuerda confirmar los datos proporcionados y guiar al usuario hacia un cierre adecuado.

<parametros>
NOMBRE: Matias
APELLIDO1: Pereira
APELLIDO2: Santelices
MONTO_DEUDA: 100000
FECHA: 6 enero 2025
FECHA_LIMITE: 31 enero 2025
PORCENTAJE: 10
PAGO_INICIAL: 10000
N_CUOTAS: 3
MONTO_CUOTAS: 30000
</parametros>

<definicion_flujo>

{
    "FlujoInicial": {
        "Inicio": {
            "mensaje": "Buenos días, soy su agente virtual de Banco Falabella. Hablo con <NOMBRE> <APELLIDO1> <APELLIDO2>.",
            "objetivo": "Validar el contacto y confirmar la identidad del cliente",
            "opciones": {
                "SI, con él": {"ir a":"Modulo_2"},
                "NO, ¿de parte de quién?": {"mensaje":"Soy su agente virtual, ¿Puedo hablar con <NOMBRE> en este momento?. Tenemos una información comercial importante de sus productos del Banco Falabela, que se comunique con Banco Falabela o directamente en nuestras sucursales"},
                "SI, estoy ocupado": {"mensaje":"Muchas gracias por su tiempo, nos comunicaremos nuevamente"},
                "NO, no se encuentra": {"mensaje":"Tenemos una información comercial importante de sus productos del Banco Falabela, que se comunique con Banco Falabela o directamente en nuestras sucursales"},
                "NO, equivocado": {"mensaje":"Disculpe las molestias. Que tenga buen día . Gracias por su atención"},
                "NO, falleció": {"mensaje":"Lamentamos la situación. Por favor comunicarse al 6003906000. Gracias por su atención."},
                "Si, Un Momento por Favor": {"mensaje":"Gracias…", "ir a":"FlujoInicial"}
            }
        }
    },
    "Modulo_2": {
        "Introducción": {
            "mensaje": "Lo estoy llamando de SMARTIA por encargo de Banco Falabella. <NOMBRE>, el motivo de mi llamada es para informarle que al día de hoy usted mantiene una deuda de <MONTO_DEUDA>. El plazo para regularizar esta deuda es hasta el <FECHA>. ¿Está de acuerdo?",
            "objetivo": "Presentarse, entregar información de la deuda y validar la disposición del cliente para realizar el pago",
            "opciones": {
                "SI": {"ir a":"ComprometePago"},
                "NO": {"ir a":"PreguntarMotivo"},
                "Cliente compromete fecha dentro del plazo": {"ir a":"RegistrarCompromiso"},
                "Cliente compromete fecha fuera del plazo": {"ir a":"ExplicarPolíticas"},
            }
        },
        "ComprometePago": {
            "mensaje": "Perfecto, dejaré registrado entonces su compromiso de pago para el <FECHA>. Recuerde que el pago se puede hacer de forma presencial en caja banco Falabela y o a través de botón de pago en la pagina web de recsa punto ce ele. ¡Que tenga un excelente día!",
            "objetivo": "Registrar el compromiso de pago y explicar las opciones de pago",
            "fin": True
        },
        "PreguntarMotivo": {
            "mensaje": "Entiendo que no pueda pagar. ¿Podría indicarme cuál es el motivo?",
            "objetivo": "Identificar el motivo de la negativa y ofrecer alternativa de Plan de Pago",
            "opciones": {
                "Respuesta motivo de no pago": {"ir a":"OfrecerPlan"}
            }
        },
        "OfrecerPlan": {
            "mensaje": "Entiendo y pensando en su tranquilidad, Banco falabela, disponibiliza un plan de pago ¿Le gustaría saber más?",
            "objetivo": "Ofrecer un plan de pago para regularizar la deuda",
            "opciones": {
                "Acepta": {"ir a":"ExplicarPlan"},
                "No acepta": {"ir a":"AgradecerYFinalizar"}
            }
        },
        "ExplicarPlan": {
            "mensaje": "Esta campaña es una excelente alternativa, que le permite regularizar su deuda con un pago inicial del <PORCENTAJE> porciento, que le permitirá regularizar su saldo en <N_CUOTAS>, con pagos de <MONTO_CUOTAS>. Esta oferta es válida solo hasta <FECHA_LIMITE>. ¿Accede al beneficio hoy?",
            "objetivo": "Explicar el plan de pago y solicitar la aceptación",
            "opciones": {
                "Acepta": {"ir a":"ExplicaActivacionPlan"},
                "No acepta": {"ir a":"Finalizar"}
            }
        },
        "ExplicaActivacionPlan": {
            "mensaje": "Para activar el PLAN DE PAGO, debe realizar el pagar  pago inicial y luego presentarse en una sucursal de Banco falabela después de 2 días hábiles de haber realizado el pago inicial, con su cédula de identidad vigente y un comprobante de domicilio con una vigencia de 30 días. El pago se puede hacer de forma presencial en la caja banco Falabella y o a través de botón de pago en la pagina web de recsa punto ce ele. Mucha gracias por su tiempo. Que tenga un excelente día.",
            "objetivo": "Explicar los pasos para activar el Plan de Pago",
            "fin": True
        },
        "AgradecerYFinalizar": {
            "mensaje": "Muchas gracias por su tiempo. Que tenga un excelente día.",
            "objetivo": "Finalizar la conversación amablemente",
            "fin": True
        },       
        "RegistrarCompromiso": {
            "mensaje": "Perfecto, dejaré registrado entonces su compromiso de pago para el <FECHA>. Recuerde que el pago se puede hacer de forma presencial en caja banco Falabela y o a través de botón de pago en la pagina web de recsa punto ce ele. Que tenga un excelente día.",
            "objetivo": "Registrar el compromiso de pago y explicar las opciones de pago",
            "fin": True
        },
        "ExplicarPolíticas": {
            "mensaje": "Disculpe, pero las políticas de falabela solo permiten establecer el <FECHA_LIMITE> como fecha máxima de pago. ¿Puede comprometer un pago en este plazo?",
            "objetivo": "Explicar las políticas de pago y solicitar un compromiso de pago",
            "opciones": {
                "SI": {"ir a":"ComprometePago"},
                "NO": {"ir a":"PreguntarMotivo"},
                "Cliente compromete fecha dentro del plazo": {"ir a":"RegistrarCompromiso"},
                "Cliente compromete fecha fuera del plazo": {"ir a":"ExplicarPolíticas"},
            }
        },                
        "Finalizar": {
            "mensaje": "Gracias por su atención. Que tenga un buen día.",
            "objetivo": "Finalizar la conversación amablemente",
            "fin": True
        }
    }
}

</definicion_flujo>

# user: inicio_llamada
# assistant: Buenos días. Soy su agente virtual de Banco Falabela. ¿Hablo con Matías Pereira?
# user: Si quien habla?
# assistant: """




# Load the base checkpoint in float16; the saved adapter is applied to it below.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=False,
    return_dict=True,
    torch_dtype=torch.float16,
    #device_map=device_map,
)
# Attach the adapter saved under `new_model`, merge it into the base model,
# and move the merged model to the GPU.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload().to("cuda")

# Reload the tokenizer for inference (nothing is saved in this cell),
# using EOS as the pad token and right-side padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.00it/s]


In [41]:
# Tokenize the flow prompt and move the resulting tensors to the GPU.
# NOTE: `input_ids` holds the tokenizer's full output (unpacked as kwargs below).
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
print(input_ids)
# NOTE(review): max_length is understood to count the prompt tokens too, so the
# room left for the reply shrinks as the prompt grows — consider max_new_tokens.
outputs = model.generate(**input_ids, max_length=2048)
# Decode the first returned sequence, special tokens included.
print(tokenizer.decode(outputs[0]))

{'input_ids': tensor([[     2, 235345,   1812,  ...,  20409, 235292, 235248]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')}
<bos># system:
Eres un asistente virtual para el Banco Falabela, especializado en procesos de cobranza. Sigues un diagrama de flujo estructurado para comunicarte con los clientes. Tu objetivo es guiar la conversación basándote en las etapas del flujo a continuación. Sé flexible para adaptarte si el usuario se desvía del flujo, pero siempre intenta regresar al proceso principal.

En el flujo se definirán opciones, considera detectar las intenciones del usuario y responder de acuerdo con las opciones disponibles. Si el usuario se desvía, intenta redirigirlo al flujo principal. Recuerda confirmar los datos proporcionados y guiar al usuario hacia un cierre adecuado.

<parametros>
NOMBRE: Matias
APELLIDO1: Pereira
APELLIDO2: Santelices
MONTO_DEUDA: 100000
FECHA: 6 enero 2025
FECHA_LIMITE: 31 enero 2025
PORCENTAJE: 10
P