# Lab: LLMs Generativos - Prompt Engineering, LoRA y Quantization

## Setup Inicial

In [None]:
# Instalación de dependencias
!pip install -q transformers datasets peft accelerate bitsandbytes trl

In [None]:
import gc
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import psutil
import torch
import torch.nn as nn
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig
)
from trl import SFTTrainer

# Report the PyTorch version and whether CUDA is usable; when a GPU is present,
# also print its name and total memory (in GB, using 1e9 bytes per GB).
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

## Parte 1: Prompt Engineering y Formato de Entrenamiento

### 1.1 Cargar Modelo Base

In [None]:
model_name = "tiiuae/Falcon3-1B-Base"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS as the padding token only when the tokenizer does not define
# one. Overwriting an existing pad token with EOS makes padding
# indistinguishable from genuine end-of-sequence tokens, which matters later
# when padded positions are excluded from the fine-tuning loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model in half precision (no quantization yet).
model_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

print(f"Modelo cargado: {model_name}")
print(f"Parámetros: {model_base.num_parameters():,}")

### 1.2 Función para Generar Texto

In [None]:
def generate_text(model, prompt, max_length=200, temperature=0.7):
    """Sample a continuation of ``prompt`` and return only the newly generated text.

    Uses the notebook-level ``tokenizer`` for encoding and decoding.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        prompt: Text the generation is conditioned on.
        max_length: Upper bound on the total sequence length in tokens
            (prompt tokens plus generated tokens).
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    with torch.no_grad():
        # Everything is passed by keyword so the call works the same for plain
        # and PEFT-wrapped models.
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt at the token level. Slicing the decoded string by
    # len(prompt) is unreliable because decoding the prompt tokens does not
    # always reproduce the prompt character-for-character.
    new_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

### 1.3 Comparación de Prompts: Raw vs Structured

In [None]:
# A simple technical question, asked through three different prompt formats.
question = "Explica qué es overfitting en machine learning"

# 1) The bare question, 2) a lightly structured Q/A prompt, 3) the
#    instruction template that is later used for fine-tuning.
raw_prompt = question
structured_prompt = f"""Pregunta: {question}
Respuesta detallada:"""
instruction_prompt = f"""### Instrucción:
{question}

### Respuesta:"""

# (banner, prompt, text appended after the output) for each variant, in order.
prompt_variants = [
    ("=== PROMPT RAW (Modelo Base) ===", raw_prompt, "\n"),
    ("=== PROMPT ESTRUCTURADO ===", structured_prompt, "\n"),
    ("=== FORMATO INSTRUCTION (Como en el entrenamiento) ===", instruction_prompt, ""),
]

responses = []
for banner, prompt_text, tail in prompt_variants:
    print(banner)
    answer = generate_text(model_base, prompt_text)
    responses.append(answer)
    print(f"Input: {prompt_text}")
    print(f"Output: {answer}{tail}")

response_raw, response_structured, response_instruction = responses

## Parte 2: LoRA Fine-tuning

### 2.1 Preparar Dataset

In [None]:
# Download the Alpaca instruction-following dataset.
dataset = load_dataset("tatsu-lab/alpaca")
print(f"Dataset original: {len(dataset['train'])} ejemplos")

# Keep only the first 1000 examples so training stays short for the lab.
train_dataset = dataset['train'].select(range(1000))
print(f"Dataset para entrenamiento: {len(train_dataset)} ejemplos")

# Preview the first three examples (long fields are truncated).
for number, example in enumerate(train_dataset.select(range(3)), start=1):
    print(f"Ejemplo {number}:")
    print(f"Instrucción: {example['instruction'][:100]}...")
    if example['input']:
        print(f"Input: {example['input'][:50]}...")
    else:
        print("Input: (vacío)")
    print(f"Output: {example['output'][:100]}...")
    print("-" * 50)

### 2.2 Formato del Dataset para SFT


In [None]:
def format_instruction(sample):
    """Render one Alpaca record as a single instruction-style training text.

    The ``### Input:`` section is included only when ``sample['input']`` is
    non-empty. Sections are separated by a blank line.

    Args:
        sample: Mapping with the keys ``instruction``, ``input`` and ``output``.

    Returns:
        A dict holding the rendered prompt under the key ``"text"``.
    """
    sections = [f"### Instrucción:\n{sample['instruction']}"]
    if sample['input']:
        sections.append(f"### Input:\n{sample['input']}")
    sections.append(f"### Respuesta:\n{sample['output']}")
    return {"text": "\n\n".join(sections)}

# Apply the instruction template to every example; this adds a "text" column.
formatted_dataset = train_dataset.map(format_instruction)
print("Ejemplo formateado:")
print(f"{formatted_dataset[0]['text'][:300]}...")

### 2.3 Configurar LoRA

In [None]:
# LoRA configuration.
# Falcon3 checkpoints use Llama-style layer names, not the
# "query_key_value" / "dense*" names of the original Falcon architecture.
# Targeting the legacy names leaves PEFT with no matching modules to adapt.
lora_config = LoraConfig(
    r=8,                          # rank: size of the low-rank update matrices
    lora_alpha=32,                # scaling factor applied to the LoRA update
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

print("Configuración LoRA:")
print(f"Rank (r): {lora_config.r}")
print(f"Alpha: {lora_config.lora_alpha}")
print(f"Módulos objetivo: {lora_config.target_modules}")

### 2.4 Preparar Modelo para LoRA

In [None]:
# Wrap the base model with LoRA adapters.
# Note: get_peft_model injects the adapter layers into `model_base` in place.
model_lora = get_peft_model(model_base, lora_config)

# Count all parameters and the trainable subset in a single pass.
trainable_params = 0
total_params = 0
for param in model_lora.parameters():
    count = param.numel()
    total_params += count
    if param.requires_grad:
        trainable_params += count

print(f"Parámetros totales: {total_params:,}")
print(f"Parámetros entrenables (LoRA): {trainable_params:,}")
print(f"Porcentaje entrenable: {100 * trainable_params / total_params:.2f}%")

### 2.5 Configurar Entrenamiento

In [None]:
# Training arguments: a deliberately short configuration for the lab.
# - `output_dir` is named after the model actually trained here (Falcon).
# - `remove_unused_columns` is left at its default (True) so the raw string
#   columns of the dataset are dropped before batches reach the data collator,
#   which can only pad tokenized fields.
training_args = TrainingArguments(
    output_dir="./falcon-lora-alpaca",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,   # effective batch size: 2 * 4 = 8
    num_train_epochs=1,              # a single epoch keeps the lab short
    learning_rate=2e-4,
    logging_steps=25,
    save_steps=500,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_steps=50,
    fp16=True,                       # mixed-precision training for speed
    dataloader_pin_memory=False,
)

print("Configuración de entrenamiento:")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Grad accumulation: {training_args.gradient_accumulation_steps}")
print(f"Épocas: {training_args.num_train_epochs}")
print(f"Learning rate: {training_args.learning_rate}")

### 2.6 Entrenar Modelo

In [None]:
# Build the supervised fine-tuning trainer around the LoRA-wrapped model,
# training on the "text" column of the formatted dataset.
# NOTE(review): passing `tokenizer`, `dataset_text_field` and `max_seq_length`
# directly to SFTTrainer depends on the installed trl version; newer releases
# may expect them elsewhere (e.g. in an SFTConfig) - confirm against the
# version installed by the setup cell.
trainer = SFTTrainer(
    model=model_lora,
    train_dataset=formatted_dataset,
    tokenizer=tokenizer,
    args=training_args,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

print("Iniciando entrenamiento LoRA...")
start_time = time.time()

# Run the training loop and measure its wall-clock duration.
trainer.train()

end_time = time.time()
print(f"Entrenamiento completado en {end_time - start_time:.2f} segundos")

# Save the LoRA adapters to disk.
model_lora.save_pretrained("./falcon-lora-adapters")
print("Adapters LoRA guardados")

### 2.7 Comparar Antes/Después del Fine-tuning

In [None]:
# Evaluation questions.
test_questions = [
    "¿Qué es machine learning?",
    "Explica la diferencia entre supervised y unsupervised learning",
    "¿Cómo funciona un algoritmo de regresión lineal?",
    "Dame consejos para evitar overfitting"
]

# Training leaves the model in train mode; switch to eval mode so that LoRA
# dropout is not applied while generating.
model_lora.eval()

print("=== COMPARACIÓN: MODELO BASE vs MODELO CON LoRA ===\n")

for i, question in enumerate(test_questions):
    prompt = f"""### Instrucción:
{question}

### Respuesta:"""

    print(f"PREGUNTA {i+1}: {question}")
    print("-" * 60)

    # Base model. get_peft_model injected the adapters into `model_base` in
    # place, so generating with `model_base` would also run the trained
    # adapters. Temporarily disabling them yields the true base behavior.
    print("MODELO BASE:")
    with model_lora.disable_adapter():
        response_base = generate_text(model_lora, prompt, max_length=150)
    print(response_base[:200] + ("..." if len(response_base) > 200 else ""))
    print()

    # Model with the LoRA adapters active.
    print("MODELO + LoRA:")
    response_lora = generate_text(model_lora, prompt, max_length=150)
    print(response_lora[:200] + ("..." if len(response_lora) > 200 else ""))
    print("=" * 80 + "\n")

## Parte 3: Quantization

### 3.1 Funciones para Medir Memoria y Velocidad

In [None]:
def get_memory_usage():
    """Return the current GPU and RAM usage as a ``(gpu_gb, ram_gb)`` tuple.

    The GPU figure is the memory currently allocated through PyTorch on CUDA
    (0 when CUDA is unavailable); the RAM figure is the resident set size of
    this process. Both are expressed in GB using 1e9 bytes per GB.
    """
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / 1e9
    else:
        gpu_memory = 0

    process = psutil.Process()
    ram_memory = process.memory_info().rss / 1e9
    return gpu_memory, ram_memory

def benchmark_model(model, prompt, num_runs=3, warmup_runs=1):
    """Measure the average wall-clock time of one ``generate_text`` call.

    Args:
        model: Model to benchmark.
        prompt: Prompt used for every generation.
        num_runs: Number of timed generations to average over (at least 1).
        warmup_runs: Untimed generations executed first, so one-off
            initialization costs do not distort the first measurement.

    Returns:
        Mean duration in seconds of the timed runs.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs debe ser al menos 1")

    def _wait_for_gpu():
        # Make sure no previously queued GPU work leaks into a timed run.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    for _ in range(warmup_runs):
        generate_text(model, prompt, max_length=100)

    times = []
    for _ in range(num_runs):
        _wait_for_gpu()
        # perf_counter is monotonic and higher resolution than time.time().
        start = time.perf_counter()
        generate_text(model, prompt, max_length=100)
        _wait_for_gpu()
        times.append(time.perf_counter() - start)

    return sum(times) / len(times)

### 3.2 Cargar Modelos con Diferentes Niveles de Cuantización

In [None]:
# Release the models from the previous sections. The trainer keeps its own
# reference to the LoRA model, so it must be dropped as well. Using pop() keeps
# the cell re-runnable once the names are already gone.
for stale_name in ("trainer", "model_base", "model_lora"):
    globals().pop(stale_name, None)
gc.collect()
torch.cuda.empty_cache()

models = {}
memory_usage = {}


def _load_and_measure(label, **load_kwargs):
    """Load ``model_name`` with ``load_kwargs`` and record its memory footprint.

    The GPU figure is the difference in allocated CUDA memory before and after
    the load, so each entry reflects only the model loaded here and not models
    that are still resident from earlier loads. The RAM figure is the resident
    set size of the whole process at the time of measurement.
    """
    gc.collect()
    torch.cuda.empty_cache()
    gpu_before, _ = get_memory_usage()

    models[label] = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        **load_kwargs,
    )

    gpu_after, ram_mem = get_memory_usage()
    gpu_mem = gpu_after - gpu_before
    memory_usage[label] = {'gpu': gpu_mem, 'ram': ram_mem}
    print(f"Memoria GPU: {gpu_mem:.2f} GB, RAM: {ram_mem:.2f} GB")


# 1. FP16 model (baseline)
print("Cargando modelo FP16...")
_load_and_measure('fp16', torch_dtype=torch.float16)

# 2. 8-bit model
# BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (the compute dtype
# setting exists only for 4-bit), so it is not passed here.
print("\nCargando modelo 8-bit...")
quantization_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
)
_load_and_measure('8bit', quantization_config=quantization_config_8bit)

# 3. 4-bit model (NF4 with double quantization, FP16 compute)
print("\nCargando modelo 4-bit...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
_load_and_measure('4bit', quantization_config=quantization_config_4bit)

### 3.3 Comparar Velocidad

In [None]:
print("\n=== BENCHMARK DE VELOCIDAD ===")
test_prompt = "Explica qué es deep learning en términos simples:"

# Average generation time per precision variant, keyed like `models`.
benchmark_results = {}
for model_type in models:
    model = models[model_type]
    print(f"Benchmarking {model_type}...")
    benchmark_results[model_type] = benchmark_model(model, test_prompt)
    print(f"Tiempo promedio: {benchmark_results[model_type]:.2f} segundos")

### 3.4 Comparar Calidad de Output

In [None]:
print("\n=== COMPARACIÓN DE CALIDAD ===")
test_prompt = """### Instrucción:
Explica la diferencia entre bias y variance en machine learning

### Respuesta:"""

# Generate one answer per precision variant and show at most 300 characters.
for model_type in models:
    model = models[model_type]
    print(f"\n{model_type.upper()}:")
    print("-" * 40)
    response = generate_text(model, test_prompt, max_length=150)
    ellipsis = "..." if len(response) > 300 else ""
    print(response[:300] + ellipsis)

### 3.5 Visualizar Comparaciones

In [None]:
# One figure with three side-by-side bar charts: GPU memory, inference time
# and memory reduction relative to the FP16 baseline.
model_types = list(memory_usage.keys())
bar_colors = ['blue', 'orange', 'green']
fig, (ax_memory, ax_speed, ax_reduction) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: GPU memory per variant.
gpu_memory = [memory_usage[mt]['gpu'] for mt in model_types]
ax_memory.bar(model_types, gpu_memory, color=bar_colors)
ax_memory.set_title('Uso de Memoria GPU')
ax_memory.set_ylabel('GB')
ax_memory.set_ylim(0, max(gpu_memory) * 1.2)

# Panel 2: average inference time per variant.
speeds = [benchmark_results[mt] for mt in model_types]
ax_speed.bar(model_types, speeds, color=bar_colors)
ax_speed.set_title('Tiempo de Inferencia')
ax_speed.set_ylabel('Segundos')

# Panel 3: percentage of GPU memory saved compared with FP16.
fp16_gpu = memory_usage['fp16']['gpu']
reduction_gpu = [
    (fp16_gpu - memory_usage[mt]['gpu']) / fp16_gpu * 100
    for mt in model_types
]
ax_reduction.bar(model_types, reduction_gpu, color=bar_colors)
ax_reduction.set_title('Reducción de Memoria vs FP16')
ax_reduction.set_ylabel('% Reducción')
ax_reduction.axhline(y=0, color='black', linestyle='-', alpha=0.3)

plt.tight_layout()
plt.show()

# Summary table with the same three metrics.
print("\n=== TABLA RESUMEN ===")
df = pd.DataFrame({
    'Modelo': model_types,
    'GPU Memory (GB)': gpu_memory,
    'Tiempo Inferencia (s)': speeds,
    'Reducción Memoria (%)': reduction_gpu
})
print(df.round(2))

## Recursos Adicionales
- **Transformers**: https://huggingface.co/docs/transformers/
- **PEFT (LoRA)**: https://huggingface.co/docs/peft/
- **BitsAndBytesConfig**: https://huggingface.co/docs/transformers/main_classes/quantization
- **Alpaca Dataset**: https://huggingface.co/datasets/tatsu-lab/alpaca

## Limpieza Final

In [None]:
# Release every loaded model.
# `del model` inside a loop only unbinds the loop variable; the `models` dict
# would still hold a reference to each model, so nothing would be freed.
models.clear()
model = None  # loop variable from earlier cells may still reference a model
gc.collect()
torch.cuda.empty_cache()
print("Memoria limpiada")