# Entrenamiento con GPU - Versión Optimizada
**Nota:** Este notebook está optimizado exclusivamente para GPU con cuantización 4-bit y AMP.

## Librerías

In [None]:
import os
import json
import re
import math
import random
from typing import Any, Dict, List, Optional
import csv
import time
import datetime as dt

import torch
from torch.utils.data import DataLoader, Dataset as TorchDataset
from torch.optim import AdamW
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import torch.utils.checkpoint
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import numpy as np

# Importar archivos .py personalizados
import evaluation_metric as custom_metrics
import shared_functions as custom_sharfun

from importlib.metadata import version

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    get_scheduler
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import logging as hf_logging
hf_logging.set_verbosity_warning()

In [None]:
# Capture the wall-clock start so the final cell can report the total runtime.
start_time = time.time()
inicio_legible = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("Inicio de ejecuci√≥n:", inicio_legible)

In [None]:
# Report the installed version of each key dependency, for reproducibility.
from importlib.metadata import PackageNotFoundError, version

librerias = [
    "numpy",
    "matplotlib",
    "torch",
    "tqdm",
    "datasets",
    "transformers",
    "peft",
    "importlib-metadata",
]
for library in librerias:
    try:
        installed = version(library)
    except PackageNotFoundError:
        # A missing distribution must not abort the notebook; this cell is
        # purely informational (e.g. "importlib-metadata" may be absent when
        # only the stdlib importlib.metadata module is in use).
        installed = "not installed"
    print(library, ": ", installed)

!python --version

## Configuración GPU

In [None]:
# Fail fast when CUDA is unavailable: the rest of the notebook assumes a GPU.
if not torch.cuda.is_available():
    raise RuntimeError("‚ö†Ô∏è GPU no disponible. Este notebook requiere GPU para ejecutarse.")

DEVICE = "cuda"

# Query device 0 once and reuse the values in the banner below.
gpu_name = torch.cuda.get_device_name(0)
gpu_total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9

print(f"\n{'='*60}")
print(f"üöÄ ENTRENAMIENTO CON GPU")
print(f"GPU: {gpu_name}")
print(f"Memoria GPU disponible: {gpu_total_gb:.2f} GB")
print(f"{'='*60}\n")

In [None]:
# --- Paths and base model ---------------------------------------------------
TOTAL_FILES_TO_TRAIN = 9  # upper bound on the JSON files read from DATA_PATH
DATA_PATH = "data/train"
OUTPUT_DIR = "output/results/v01"
EXPECTED_JSON_FILE = "data/template/expected_output.json"
EXPECTED_JSON = None  # replaced by the template-loading cell when the file exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "Qwen/Qwen3-0.6B-Base"

# --- Reproducibility: seed every RNG used by the notebook --------------------
GLB_SEED = 42
for seed_fn in (torch.manual_seed, random.seed, np.random.seed, torch.cuda.manual_seed_all):
    seed_fn(GLB_SEED)

# --- Shared parameters -------------------------------------------------------
TEST_SIZE = 0.2    # fraction handed to train_test_split for the validation split
MAX_LENGTH = 1252  # length limit passed to the tokenization and generation helpers

# --- Optimization hyperparameters --------------------------------------------
BATCH_SIZE = 8
GRAD_ACCUM_STEPS = 2  # batches accumulated per optimizer update
EPOCHS = 5
WARMUP_RATIO = 0.03   # share of the total update steps used for warmup
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.01
BETAS = (0.9, 0.999)
EPS = 1e-8
SCHEDULER_TYPE = "linear"
CLIP_NORM = 1.0       # max gradient norm used for clipping

# --- LoRA adapter settings ---------------------------------------------------
LORA_R = 64
LORA_ALPHA = 128
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# --- Generation / evaluation -------------------------------------------------
NUM_VAL_EXAMPLES = 45  # NOTE: the evaluation cell overwrites this with len(val_list)
GEN_MAX_NEW_TOKENS = 377
BEAM_CANDIDATES = 5    # not referenced anywhere else in this notebook's visible code

# --- Echo the effective configuration ----------------------------------------
resumen_config = (
    f"\nüìä CONFIGURACI√ìN GPU:",
    f"  - Batch Size: {BATCH_SIZE}",
    f"  - Gradient Accumulation Steps: {GRAD_ACCUM_STEPS}",
    f"  - Epochs: {EPOCHS}",
    f"  - Learning Rate: {LEARNING_RATE}",
    f"  - LoRA R: {LORA_R}",
    f"  - LoRA Alpha: {LORA_ALPHA}",
    f"  - Target Modules: {TARGET_MODULES}",
    f"  - Cuantizaci√≥n 4-bit: ‚úì Activada",
    f"  - Mixed Precision (AMP): ‚úì Activada",
    "",
)
for linea in resumen_config:
    print(linea)

## Cargar datos

In [None]:
# Load the expected-output JSON template when it exists; otherwise
# EXPECTED_JSON keeps whatever value it already had.
if not os.path.exists(EXPECTED_JSON_FILE):
    print(f"‚ö†Ô∏è Archivo template no encontrado: {EXPECTED_JSON_FILE}")
else:
    with open(EXPECTED_JSON_FILE, "r", encoding="utf-8") as template_file:
        EXPECTED_JSON = json.load(template_file)
    print(f"‚úÖ Archivo template cargado: {EXPECTED_JSON_FILE}")

In [None]:
# Read up to TOTAL_FILES_TO_TRAIN .json files from DATA_PATH, in sorted order.
files = sorted(f for f in os.listdir(DATA_PATH) if f.endswith(".json"))[:TOTAL_FILES_TO_TRAIN]
if not files:
    # Without data the later split/training cells would only fail obscurely.
    raise FileNotFoundError(f"No .json training files found in {DATA_PATH}")

raw_data = []
for file_name in files:
    file_path = os.path.join(DATA_PATH, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # A file holds either a list of records or one single record.
    if isinstance(data, list):
        raw_data.extend(data)
    elif isinstance(data, dict):
        raw_data.append(data)
    else:
        raise ValueError(f"Formato no esperado en {file_name}")

# Report files and records separately (the record count used to be printed
# under the "files loaded" label).
print(f"Total training files loaded: {len(files)} (records: {len(raw_data)})")

# Keep only records with a non-empty "natural_language" and a "json_data" value.
clean_data = []
dropped = 0
for item in raw_data:
    if not isinstance(item, dict):
        # Stray non-object entries inside a list file are counted as dropped
        # instead of crashing on .get().
        dropped += 1
        continue
    natural = item.get("natural_language")
    json_d = item.get("json_data")
    if natural is None or natural == "" or json_d is None:
        dropped += 1
        continue
    clean_data.append(item)

print(f"Registros v√°lidos despu√©s de limpiar: {len(clean_data)} (eliminados: {dropped})")

In [None]:
# Wrap the cleaned records in a Hugging Face Dataset and split them
# reproducibly: TEST_SIZE is held out, seeded with GLB_SEED.
# NOTE(review): if "json_data" is a nested dict, Dataset.from_list may unify
# the schema across records and surface missing keys as None - confirm this
# does not alter the training targets or the references used for F1.
hf_dataset = Dataset.from_list(clean_data)
split = hf_dataset.train_test_split(seed=GLB_SEED, test_size=TEST_SIZE)

# Despite the *_list names, these are the objects returned by train_test_split.
train_list, val_list = split["train"], split["test"]

print(f"Train examples: {len(train_list)}, Val examples: {len(val_list)}")

In [None]:
# Histogram of the character length of every "natural_language" text.
longitudes = [
    len(texto)
    for texto in (item.get("natural_language") for item in clean_data)
    if texto is not None
]

plt.figure(figsize=(8, 4))
plt.hist(longitudes, bins=20)
plt.xlabel("Longitud del texto")
plt.ylabel("Cantidad de ejemplos")
plt.title("Distribuci√≥n de longitudes de 'natural_language'")
plt.grid(True)
plt.show()

## Cargar modelo y tokenizer (GPU con cuantización 4-bit)

In [None]:
# Build the tokenizer, the 4-bit quantized base model and its LoRA wrapper.
# The steps below are order-dependent: quantized load -> k-bit preparation ->
# gradient checkpointing -> LoRA wrapping.
print("Cargando tokenizer y modelo con cuantizaci√≥n 4-bit...")

# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally - confirm it is actually required for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the padding token when the tokenizer defines none; collate_fn
# pads input_ids with tokenizer.pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit settings: NF4 quantization type, double quantization, bfloat16 compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the causal LM with the quantization config; device placement is
# delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare the quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# Disable the generation cache and enable gradient checkpointing
model.config.use_cache = False
try:
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
except TypeError:
    # Presumably an older API that does not accept gradient_checkpointing_kwargs.
    model.gradient_checkpointing_enable()

# LoRA configuration, driven by the LORA_* / TARGET_MODULES constants
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the adapters and print the trainable-parameter summary.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## Análisis de longitudes de tokens

In [None]:
# Token count of every full training text (no truncation, no padding), used to
# inspect the upper percentiles of the length distribution.
lengths_tokens = [
    len(
        tokenizer(
            custom_sharfun.build_training_example(ex),
            truncation=False,
            padding=False,
        )["input_ids"]
    )
    for ex in clean_data
]

print("Percentiles de longitudes (90, 95, 99):")
print(np.percentile(lengths_tokens, [90, 95, 99]))

In [None]:
def medir_longitudes_tokens(dataset, tokenizer, max_ejemplos=None):
    """Measure prompt, target-JSON and full-text token lengths over a dataset.

    For each example the prompt is built with ``custom_sharfun.build_prompt``
    from ``ex["natural_language"]`` and the target is
    ``json.dumps(ex["json_data"], ensure_ascii=False)``. The prompt and the
    concatenation prompt + target are tokenized without truncation or padding
    and with special tokens added.

    Args:
        dataset: Iterable of mappings with "natural_language" and "json_data".
        tokenizer: Callable returning a mapping with an "input_ids" sequence.
        max_ejemplos: Optional cap on the number of examples measured; ``None``
            measures the whole dataset.

    Returns:
        dict with float ``<name>_mean``, ``<name>_p95`` and ``<name>_p99`` for
        ``prompt``, ``json`` and ``full`` (in that order), plus int ``full_max``.

    Raises:
        ValueError: If no example was measured (empty dataset or
            ``max_ejemplos <= 0``).
    """
    prompt_lens = []
    full_lens = []

    for i, ex in enumerate(dataset):
        if max_ejemplos is not None and i >= max_ejemplos:
            break

        natural = ex["natural_language"]
        target_json_str = json.dumps(ex["json_data"], ensure_ascii=False)
        prompt = custom_sharfun.build_prompt(natural)

        enc_prompt = tokenizer(prompt, truncation=False, padding=False, add_special_tokens=True)
        enc_full = tokenizer(
            prompt + target_json_str, truncation=False, padding=False, add_special_tokens=True
        )

        prompt_lens.append(len(enc_prompt["input_ids"]))
        full_lens.append(len(enc_full["input_ids"]))

    if not full_lens:
        # Fail with a clear message instead of numpy warnings followed by an
        # opaque "zero-size array" error from np.max.
        raise ValueError("No examples to measure: dataset is empty or max_ejemplos <= 0")

    # The JSON length is derived as full - prompt. The two texts are tokenized
    # separately, so tokens merged across the boundary make this approximate.
    json_lens = [lf - lp for lp, lf in zip(prompt_lens, full_lens)]

    stats = {}
    for nombre, valores in (("prompt", prompt_lens), ("json", json_lens), ("full", full_lens)):
        stats[f"{nombre}_mean"] = float(np.mean(valores))
        stats[f"{nombre}_p95"] = float(np.percentile(valores, 95))
        stats[f"{nombre}_p99"] = float(np.percentile(valores, 99))
    stats["full_max"] = int(np.max(full_lens))
    return stats

# Token-length statistics over the whole validation split (no example cap).
stats_val = medir_longitudes_tokens(val_list, tokenizer)
print("Estad√≠sticas de validaci√≥n:")
print(stats_val)

## Preparar DataLoaders

In [None]:
def _tokenizar_split(ejemplos):
    """Tokenize the training text of every example in one split.

    Each example is rendered with ``custom_sharfun.build_training_example`` and
    handed to ``custom_sharfun.tokenize_example_textpair`` together with
    MAX_LENGTH and the global tokenizer, with padding disabled.
    """
    tokenizados = []
    for ejemplo in ejemplos:
        texto = custom_sharfun.build_training_example(ejemplo)
        tokenizados.append(
            custom_sharfun.tokenize_example_textpair(texto, MAX_LENGTH, tokenizer, padding=False)
        )
    return tokenizados


# Tokenize both splits with exactly the same settings.
train_tokens = _tokenizar_split(train_list)
val_tokens = _tokenizar_split(val_list)

# Simple Dataset
class SimpleTorchDataset(TorchDataset):
    """Map-style dataset over a list of already-tokenized example mappings."""

    def __init__(self, tokens_list):
        # Keep a reference to the caller's list; nothing is copied here.
        self.data = tokens_list

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Hand out a fresh shallow dict so callers cannot add or rebind keys on
        # the stored example (the values themselves are shared).
        ejemplo = self.data[idx]
        return dict(ejemplo.items())

# Wrap both tokenized splits so they can be fed to a DataLoader.
train_dataset, val_dataset = (
    SimpleTorchDataset(tokens) for tokens in (train_tokens, val_tokens)
)

def collate_fn(batch):
    """Pad a list of tokenized examples into batch-first tensors.

    Args:
        batch: List of mappings with 'input_ids', 'attention_mask' and 'labels'
            sequences accepted by ``pad_sequence``.

    Returns:
        dict with the same three keys, each padded to the longest sequence in
        the batch.
    """
    # Per-field padding value: the tokenizer's pad id for inputs, 0 for the
    # attention mask, and -100 for labels (presumably so padded positions are
    # ignored by the loss - the usual ignore index).
    relleno_por_campo = (
        ('input_ids', tokenizer.pad_token_id),
        ('attention_mask', 0),
        ('labels', -100),
    )
    return {
        campo: pad_sequence(
            [ejemplo[campo] for ejemplo in batch],
            batch_first=True,
            padding_value=relleno,
        )
        for campo, relleno in relleno_por_campo
    }

# Training batches are shuffled; validation is read one example at a time in
# its original order. Both loaders pad through collate_fn.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
)

## Configurar optimizador y scheduler

In [None]:
# Number of optimizer updates: one per GRAD_ACCUM_STEPS batches, rounding up so
# a trailing partial group still counts as one update.
total_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
total_training_steps = total_steps_per_epoch * EPOCHS
num_warmup_steps = int(WARMUP_RATIO * total_training_steps)

print(f"Total training steps: {total_training_steps}, Warmup steps: {num_warmup_steps}")

# Optimize only the parameters that require gradients.
trainable_params = list(filter(lambda param: param.requires_grad, model.parameters()))
optimizer = AdamW(
    trainable_params,
    lr=LEARNING_RATE,
    betas=BETAS,
    eps=EPS,
    weight_decay=WEIGHT_DECAY,
)

# Learning-rate schedule of type SCHEDULER_TYPE with warmup, sized to the
# total number of updates computed above.
scheduler = get_scheduler(
    name=SCHEDULER_TYPE,
    optimizer=optimizer,
    num_training_steps=total_training_steps,
    num_warmup_steps=num_warmup_steps,
)

## Entrenamiento con AMP (GPU)

In [None]:
# Mixed-precision (AMP) training loop with gradient accumulation.
print("üöÄ Usando Mixed Precision Training (AMP) con GPU")
scaler = torch.amp.GradScaler(device='cuda')

model.to(DEVICE)
model.train()

global_step = 0  # optimizer updates actually applied (skipped AMP steps excluded)
num_batches = len(train_loader)
# Size of the trailing accumulation group when num_batches is not a multiple
# of GRAD_ACCUM_STEPS (0 means every group is full).
tail_batches = num_batches % GRAD_ACCUM_STEPS

for epoch in range(EPOCHS):
    running_loss = 0.0
    pbar = tqdm(enumerate(train_loader), total=num_batches, desc=f"Epoch {epoch+1}/{EPOCHS}")
    optimizer.zero_grad()

    for step, batch in pbar:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # Average over the real size of the current accumulation group, so the
        # last (partial) group is not under-weighted.
        in_tail = step >= num_batches - tail_batches
        accum_steps = tail_batches if in_tail else GRAD_ACCUM_STEPS

        # Forward pass under autocast
        with torch.amp.autocast(device_type='cuda'):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / accum_steps

        # Backward pass on the scaled loss; track the unscaled per-batch loss
        scaler.scale(loss).backward()
        running_loss += outputs.loss.item()

        # Update once per full group, and once more for a trailing partial group
        if (step + 1) % GRAD_ACCUM_STEPS == 0 or (step + 1) == num_batches:
            # Gradients must be unscaled before clipping by norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)

            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            # GradScaler skips optimizer.step() on inf/NaN gradients and then
            # lowers its scale. Only advance the LR schedule when the update
            # was applied, so scheduler.step() never precedes optimizer.step().
            if scaler.get_scale() >= scale_before:
                scheduler.step()
                global_step += 1

            avg_loss = running_loss / (step + 1)
            pbar.set_postfix({'loss': f"{avg_loss:.4f}", 'lr': scheduler.get_last_lr()[0]})

    print(f"Epoch {epoch+1} finished ‚Äî avg loss: {running_loss / len(train_loader):.4f}")

## Guardar modelo

In [None]:
# Save the model and the tokenizer with save_pretrained into <OUTPUT_DIR>/modfinal.
OUTPUT_DIR_MODEL = os.path.join(OUTPUT_DIR, "modfinal")
os.makedirs(OUTPUT_DIR_MODEL, exist_ok=True)

for artefacto in (model, tokenizer):
    artefacto.save_pretrained(OUTPUT_DIR_MODEL)

print('Modelo guardado en', OUTPUT_DIR_MODEL)

In [None]:
# Also write a single-file torch checkpoint next to the save_pretrained output.
CKPT_PATH = os.path.join(OUTPUT_DIR_MODEL, "weights.pt")
checkpoint_payload = {
    "model_state_dict": model.state_dict(),
    # NOTE(review): this pickles the tokenizer's entire attribute dict, so the
    # file can only be read back with full unpickling (not weights-only
    # loading). The tokenizer is already stored by save_pretrained - confirm
    # whether any consumer really needs this key.
    "tokenizer": tokenizer.__dict__,
}
torch.save(checkpoint_payload, CKPT_PATH)
print("Checkpoint .pt guardado en:", CKPT_PATH)

## Evaluación

In [None]:
# Generate a JSON prediction for each validation example and score it.
print("Tama√±o conjunto de validaci√≥n:", len(val_list))
# Evaluate on the whole validation split (this overrides the configured
# NUM_VAL_EXAMPLES value).
NUM_VAL_EXAMPLES = len(val_list)

# The training cell left the model in train() mode, which keeps LoRA dropout
# active. Switch to eval() so generation is not perturbed (harmless if
# generate_json_raw already does this itself).
model.eval()

results = []
eval_subset = val_list.select(range(min(NUM_VAL_EXAMPLES, len(val_list))))
for idx, ex in enumerate(tqdm(eval_subset, desc="Eval")):
    text = ex["natural_language"]
    raw = custom_sharfun.generate_json_raw(
        text=text,
        max_new_tokens=GEN_MAX_NEW_TOKENS,
        max_length=MAX_LENGTH,
        tokenizer=tokenizer,
        model=model,
        device=DEVICE
    )
    pred_obj = custom_sharfun.extract_json_from_text(raw)
    true_json = ex["json_data"]

    if pred_obj is None:
        # No JSON could be extracted from the generation: score it as a miss.
        f1 = 0.0
    else:
        try:
            f1 = custom_metrics.evaluate_json(true_json, json.dumps(pred_obj, ensure_ascii=False))
        except Exception as exc:
            # Best-effort fallback to exact match, but surface the failure so
            # a broken metric does not silently distort the scores.
            print(f"Warning: evaluate_json failed on example {idx} ({exc!r}); falling back to exact match")
            f1 = float(1.0 if pred_obj == true_json else 0.0)

    print(f"Ejemplo {idx}: F1 = {f1:.4f}")
    results.append({"idx": idx, "f1": f1, "raw": raw, "pred": pred_obj, "true": true_json})

In [None]:
# Line plot of the F1 score of each validation example, numbered from 1.
f1_scores = [resultado['f1'] for resultado in results]
posiciones = range(1, len(f1_scores) + 1)

plt.figure(figsize=(8, 4))
plt.plot(posiciones, f1_scores, marker='o')
plt.xlabel("Ejemplo de Validaci√≥n")
plt.ylabel("F1 Score")
plt.title("F1 Scores por Ejemplo de Validaci√≥n")
plt.grid(True)
plt.show()

In [None]:
# Persist the per-example validation results as CSV; 'pred' and 'true' are
# stored as JSON strings.
OUTPUT_DIR_VAL = os.path.join(OUTPUT_DIR, "result_validation")
os.makedirs(OUTPUT_DIR_VAL, exist_ok=True)
csv_path = os.path.join(OUTPUT_DIR_VAL, 'validation_results.csv')

columnas = ['idx', 'f1', 'raw', 'pred', 'true']
with open(csv_path, 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=columnas)
    writer.writeheader()
    writer.writerows(
        {
            'idx': fila['idx'],
            'f1': fila['f1'],
            'raw': fila['raw'],
            'pred': json.dumps(fila['pred'], ensure_ascii=False),
            'true': json.dumps(fila['true'], ensure_ascii=False),
        }
        for fila in results
    )
print('CSV guardado en', csv_path)

In [None]:
# Histogram of the F1 scores, saved as a PNG under OUTPUT_DIR and displayed.
f1_scores = [resultado['f1'] for resultado in results]
hist_path = os.path.join(OUTPUT_DIR, 'f1_distribution.png')

plt.figure()
plt.hist(f1_scores, bins=10)
plt.xlabel('F1')
plt.ylabel('Frecuencia')
plt.title('Distribuci√≥n de F1')
plt.savefig(hist_path)
plt.show()
plt.close()
print('Histograma guardado en', hist_path)

In [None]:
# Print the three lowest-scoring validation examples for manual inspection.
sorted_by_f1 = sorted(results, key=lambda fila: fila['f1'])
peores = sorted_by_f1[:3]

print('\nPeores 3 ejemplos:')
for peor in peores:
    print(f"Ejemplo #{peor['idx']} - F1 Score: {peor['f1']}")
    # NOTE: 'Texto' shows the raw generation stored under 'raw', not the input text.
    print('Texto:', peor['raw'])
    print('*' * 90)
    print('Pred_normalizado:', peor['pred'])
    print('True:', peor['true'])
    print('-' * 150)

In [None]:
# Report the total wall-clock time since the start cell and print the closing banner.
end_time = time.time()
elapsed_sec = end_time - start_time
elapsed_min = elapsed_sec / 60
fin_legible = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print(f"Fin de ejecuci√≥n: {fin_legible}")
print(f"Tiempo total: {elapsed_sec:.1f} segundos (~{elapsed_min:.2f} minutos)")
print(f"\n{'='*60}")
print(f"‚úÖ Entrenamiento completado con GPU")
print(f"{'='*60}")