# Version - Versi√≥n Light
Sin documentaci√≥n solo referencia ajustar y realizar pruebas.
Archivo para entrenar y validar contiene en total 222 ejemplos.

# Librer√≠as

In [1]:
import os
import json
import re
import math
from typing import Any, Dict, List
import csv

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import numpy as np
import evaluation_metric as custom_metrics
import torch.utils.checkpoint

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    get_scheduler
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import logging as hf_logging
hf_logging.set_verbosity_warning()

# Variables globales y funciones auxiliares

In [None]:
# Directorios
DATA_PATH = "data/smalltrain/natural_purchase_order_6.json" # por ahora tomo este ejemplo light de 222 registros
OUTPUT_DIR = "output/qwen_04"
EXPECTED_JSON_FILE = "data/template/expected_output.json"
EXPECTED_JSON = None
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Modelo
MODEL_NAME = "Qwen/Qwen3-0.6B-Base"
# Data
TEST_SIZE = 0.2
# Tokenizaci√≥n
MAX_LENGTH = 1024
# Dataloader
BATCH_SIZE = 1
# Entrenamiento
GRAD_ACCUM_STEPS = 4
EPOCHS = 3
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 10
# Evaluaci√≥n
NUM_VAL_EXAMPLES = 45
# Configuraci√≥n del dispositivo
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
GLB_SEED = 42
torch.manual_seed(GLB_SEED)
np.random.seed(GLB_SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(GLB_SEED)

Using device: cuda


# Carga de archivos y datos

Cargar archivo ejemplo JSON esperado para predicci√≥n

In [3]:
with open(EXPECTED_JSON_FILE, "r", encoding="utf-8") as f:
    EXPECTED_JSON = json.load(f)

In [5]:
#EXPECTED_JSON

Cargar dataset   
Esto es temporal mientras se ajusta el F1-Score para aquellos que estan dando cero. Temporal en cargar solo un archivo, dado que se deben cargar todos los JSON para entrenar.

In [6]:
with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# convert to Hugging Face Dataset
hf_dataset = Dataset.from_list(raw_data)

# split train / validation
split = hf_dataset.train_test_split(test_size=TEST_SIZE, seed=GLB_SEED)
train_list = split['train']
val_list = split['test']

print(f"Train examples: {len(train_list)}, Val examples: {len(val_list)}")

Train examples: 177, Val examples: 45


In [7]:
print("Cargando tokenizer y modelo (esto puede tardar)...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# preparar para k-bit training
model = prepare_model_for_kbit_training(model)

# Desactivar cache y habilitar checkpointing seguro
model.config.use_cache = False
try:
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
except TypeError:
    # fallback if the signature is different
    model.gradient_checkpointing_enable()

torch.utils.checkpoint.use_reentrant = False

Cargando tokenizer y modelo (esto puede tardar)...


In [8]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,293,760 || all params: 598,343,680 || trainable%: 0.3834


## Construcci√≥n del prompt para que genere el JSON

In [11]:
def build_prompt(natural_text: str) -> str:
    instructions = (
        "Eres un extractor de √≥rdenes. Genera SOLO un JSON v√°lido EXACTAMENTE con los campos requeridos.\n"
        "Reglas:\n"
        "- Usa null cuando un campo no exista.\n"
        "- \"buyer\" debe existir; si name/email/contact/addresses faltan, d√©jalos en null.\n"
        "- Si addresses est√° vac√≠o o no existe -> \"addresses\": null.\n"
        "- Si purchases est√° vac√≠o o no existe -> \"purchases\": null.\n"
        "- shipping es opcional; si falta -> \"shipping\": null.\n"
        "- Asegura que los tipos principales sean correctos (quantity entero, country uno de US/CA/GB/ES/CO/DE/FR).\n\n"
    )
    prompt = instructions + "Texto:\n" + natural_text + "\n\nJSON:\n"
    return prompt

In [12]:
def build_training_example(example: Dict[str, Any]) -> str:
    natural = example['natural_language']
    target_json = json.dumps(example['json_data'], ensure_ascii=False)
    prompt = build_prompt(natural)
    return prompt + target_json

In [13]:
# Tokenizaci√≥n (precompuesta) y construcci√≥n de datasets de tensores
def tokenize_example_textpair(textpair: str):
    enc = tokenizer(textpair, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors='pt')
    labels = enc['input_ids'].clone()
    labels[labels == tokenizer.pad_token_id] = -100
    return {'input_ids': enc['input_ids'].squeeze(0), 'attention_mask': enc['attention_mask'].squeeze(0), 'labels': labels.squeeze(0)}

train_tokens = [tokenize_example_textpair(build_training_example(x)) for x in train_list]
val_tokens = [tokenize_example_textpair(build_training_example(x)) for x in val_list]

In [14]:
class SimpleTorchDataset(torch.utils.data.Dataset):
    def __init__(self, tokens_list):
        self.data = tokens_list
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return {k: v for k, v in self.data[idx].items()}

train_dataset = SimpleTorchDataset(train_tokens)
val_dataset = SimpleTorchDataset(val_tokens)

In [15]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([b['input_ids'] for b in batch]),
        'attention_mask': torch.stack([b['attention_mask'] for b in batch]),
        'labels': torch.stack([b['labels'] for b in batch])
    }

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [16]:
# Optimizer AdamW custom + scheduler
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY, betas=(0.9,0.999), eps=1e-8)
num_update_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
max_train_steps = EPOCHS * num_update_steps_per_epoch
scheduler = get_scheduler(name='linear', optimizer=optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=max_train_steps)

# WandB opcional
try:
    import wandb
    wandb.init(project="qwen-json-extraction", reinit=True)
    use_wandb = True
except Exception:
    use_wandb = False

In [17]:
# Entrenamiento manual con AMP (torch.amp) y tqdm
scaler = torch.amp.GradScaler(device='cuda') if DEVICE.startswith('cuda') else torch.amp.GradScaler()
model.to(DEVICE)
model.train()

global_step = 0
for epoch in range(EPOCHS):
    running_loss = 0.0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}")
    optimizer.zero_grad()
    for step, batch in pbar:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        with torch.amp.autocast(device_type='cuda' if DEVICE.startswith('cuda') else 'cpu'):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / GRAD_ACCUM_STEPS

        scaler.scale(loss).backward()
        running_loss += loss.item() * GRAD_ACCUM_STEPS

        if (step + 1) % GRAD_ACCUM_STEPS == 0 or (step + 1) == len(train_loader):
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
            global_step += 1

            avg_loss = running_loss / (step + 1)
            pbar.set_postfix({'loss': f"{avg_loss:.4f}", 'lr': scheduler.get_last_lr()[0]})
            if use_wandb:
                wandb.log({'train/loss': avg_loss, 'train/lr': scheduler.get_last_lr()[0], 'step': global_step})

    print(f"Epoch {epoch+1} finished ‚Äî avg loss: {running_loss / len(train_loader):.4f}")

Epoch 1:   0%|          | 0/177 [00:00<?, ?it/s]

Epoch 1 finished ‚Äî avg loss: 1.9302


Epoch 2:   0%|          | 0/177 [00:00<?, ?it/s]

Epoch 2 finished ‚Äî avg loss: 1.4729


Epoch 3:   0%|          | 0/177 [00:00<?, ?it/s]

Epoch 3 finished ‚Äî avg loss: 1.3985


In [18]:
# guardar adaptador LoRA
model.save_pretrained(os.path.join(OUTPUT_DIR, 'lora_adapter'))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'tokenizer'))


('./qwen_04\\tokenizer\\tokenizer_config.json',
 './qwen_04\\tokenizer\\special_tokens_map.json',
 './qwen_04\\tokenizer\\chat_template.jinja',
 './qwen_04\\tokenizer\\vocab.json',
 './qwen_04\\tokenizer\\merges.txt',
 './qwen_04\\tokenizer\\added_tokens.json',
 './qwen_04\\tokenizer\\tokenizer.json')

In [19]:
# Generaci√≥n robusta y normalizaci√≥n POST-PREDICCI√ìN
def generate_json_raw(text: str, max_new_tokens: int = 256) -> str:
    prompt = build_prompt(text)
    enc = tokenizer(prompt, return_tensors='pt', truncation=True, padding=True, max_length=MAX_LENGTH).to(DEVICE)
    input_ids = enc['input_ids']
    attention_mask = enc['attention_mask']
    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id

    with torch.no_grad():
        out = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=False
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

def extract_json_from_text(text: str) -> Any:
    matches = re.findall(r"\{[\s\S]*\}", text)
    if matches:
        for m in matches:
            candidate = m
            candidate_fixed = candidate.replace("'", '"')
            try:
                return json.loads(candidate_fixed)
            except Exception:
                continue
    if 'JSON:' in text:
        part = text.split('JSON:')[-1].strip()
        part = part.strip('`\n ')
        try:
            return json.loads(part)
        except Exception:
            try:
                return json.loads(part.replace("'", '"'))
            except Exception:
                return None
    return None

In [20]:
def normalize_example_json_pred(js: Dict[str, Any]) -> Dict[str, Any]:
    # Ensure buyer exists
    if js is None:
        return {
            'buyer': {'name': None, 'email': None, 'contact': None, 'addresses': None},
            'purchases': None,
            'shipping': None
        }

    out = {}
    buyer = js.get('buyer') if isinstance(js, dict) else None
    if not buyer:
        out['buyer'] = {'name': None, 'email': None, 'contact': None, 'addresses': None}
    else:
        # name & email
        out['buyer'] = {
            'name': buyer.get('name') if buyer.get('name') is not None else None,
            'email': buyer.get('email') if buyer.get('email') is not None else None,
            'contact': None,
            'addresses': None
        }
        contact = buyer.get('contact')
        if contact:
            out['buyer']['contact'] = {
                'phone': contact.get('phone') if contact.get('phone') is not None else None,
                'alt_email': contact.get('alt_email') if contact.get('alt_email') is not None else None,
                'preferred_contact': contact.get('preferred_contact') if contact.get('preferred_contact') is not None else None
            }
        else:
            out['buyer']['contact'] = None

        addresses = buyer.get('addresses')
        if addresses and isinstance(addresses, list) and len(addresses) > 0:
            # keep as-is but ensure required fields exist
            out_addrs = []
            for a in addresses:
                out_addrs.append({
                    'street': a.get('street') if a.get('street') is not None else None,
                    'city': a.get('city') if a.get('city') is not None else None,
                    'state': a.get('state') if a.get('state') is not None else None,
                    'postal_code': a.get('postal_code') if a.get('postal_code') is not None else None,
                    'country': a.get('country') if a.get('country') is not None else None
                })
            out['buyer']['addresses'] = out_addrs
        else:
            out['buyer']['addresses'] = None

    # purchases
    purchases = js.get('purchases') if isinstance(js, dict) else None
    if purchases and isinstance(purchases, list) and len(purchases) > 0:
        out_p = []
        for p in purchases:
            out_p.append({
                'product_name': p.get('product_name') if p.get('product_name') is not None else None,
                'quantity': int(p.get('quantity')) if (p.get('quantity') is not None and str(p.get('quantity')).isdigit()) else None,
                'currency': p.get('currency') if p.get('currency') is not None else None,
                'discount_code': p.get('discount_code') if p.get('discount_code') is not None else None
            })
        out['purchases'] = out_p
    else:
        out['purchases'] = None

    # shipping
    shipping = js.get('shipping') if isinstance(js, dict) else None
    if shipping:
        out['shipping'] = {
            'method': shipping.get('method') if shipping.get('method') is not None else None,
            'preferred_by': shipping.get('preferred_by') if shipping.get('preferred_by') is not None else None
        }
    else:
        out['shipping'] = None

    return out

In [21]:
NUM_VAL_EXAMPLES = 45

In [22]:
# Evaluaci√≥n sobre N ejemplos con barra de progreso, CSV y gr√°fico
results = []
for idx, ex in enumerate(tqdm(val_list.select(range(NUM_VAL_EXAMPLES)), desc='Evaluaci√≥n')):
    text = ex['natural_language']
    raw = generate_json_raw(text)
    pred_obj = extract_json_from_text(raw)
    pred_norm = normalize_example_json_pred(pred_obj)
    true_json = ex['json_data']
    # evaluate_json expects a JSON string for predicted (older API) -> pass json.dumps
    f1 = custom_metrics.evaluate_json(true_json, json.dumps(pred_norm, ensure_ascii=False))
    results.append({'idx': idx, 'f1': f1, 'raw': raw, 'pred_norm': pred_norm, 'true': true_json})
    print(f"idx={idx} f1={f1:.4f}")

Evaluaci√≥n:   0%|          | 0/45 [00:00<?, ?it/s]

idx=0 f1=0.7560
idx=1 f1=0.1036
idx=2 f1=0.1515
idx=3 f1=0.0970
idx=4 f1=0.6542
idx=5 f1=0.7296
idx=6 f1=0.1194
idx=7 f1=0.7507
idx=8 f1=0.2282
idx=9 f1=0.7840
idx=10 f1=0.6747
idx=11 f1=0.6241
idx=12 f1=0.6892
idx=13 f1=0.0340
idx=14 f1=0.6106
idx=15 f1=0.7108
idx=16 f1=0.5680
idx=17 f1=0.8383
idx=18 f1=0.0388
idx=19 f1=0.0818
idx=20 f1=0.6293
idx=21 f1=0.5693
idx=22 f1=0.7289
idx=23 f1=0.8193
idx=24 f1=0.7632
idx=25 f1=0.0424
idx=26 f1=0.9587
idx=27 f1=0.0340
idx=28 f1=0.6164
idx=29 f1=0.0424
idx=30 f1=0.6178
idx=31 f1=0.8373
idx=32 f1=0.6854
idx=33 f1=0.8138
idx=34 f1=0.6371
idx=35 f1=0.6417
idx=36 f1=0.7110
idx=37 f1=0.6697
idx=38 f1=0.1036
idx=39 f1=0.8349
idx=40 f1=0.8209
idx=41 f1=0.7289
idx=42 f1=0.0941
idx=43 f1=0.7000
idx=44 f1=0.8193


In [23]:
# Guardar CSV
csv_path = os.path.join(OUTPUT_DIR, 'validation_results.csv')
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['idx','f1','raw','pred_norm','true'])
    writer.writeheader()
    for r in results:
        writer.writerow({
            'idx': r['idx'],
            'f1': r['f1'],
            'raw': r['raw'],
            'pred_norm': json.dumps(r['pred_norm'], ensure_ascii=False),
            'true': json.dumps(r['true'], ensure_ascii=False)
        })
print('CSV guardado en', csv_path)

CSV guardado en ./qwen_04\validation_results.csv


In [24]:
# Histograma F1
f1_scores = [r['f1'] for r in results]
plt.figure()
plt.hist(f1_scores, bins=10)
plt.title('Distribuci√≥n de F1')
plt.xlabel('F1')
plt.ylabel('Frecuencia')
plt.savefig(os.path.join(OUTPUT_DIR, 'f1_distribution.png'))
plt.close()
print('Histograma guardado en', os.path.join(OUTPUT_DIR, 'f1_distribution.png'))

Histograma guardado en ./qwen_04\f1_distribution.png


In [25]:
# Mostrar peores 3 ejemplos
sorted_by_f1 = sorted(results, key=lambda x: x['f1'])
print('\nPeores 3 ejemplos:')
for r in sorted_by_f1[:3]:
    print(r['idx'], r['f1'])
    print('Texto:', r['raw'])
    print('Pred_normalizado:', r['pred_norm'])
    print('True:', r['true'])
    print('-'*150)


Peores 3 ejemplos:
13 0.03398058252427184
Texto: Eres un extractor de √≥rdenes. Genera SOLO un JSON v√°lido EXACTAMENTE con los campos requeridos.
Reglas:
- Usa null cuando un campo no exista.
- "buyer" debe existir; si name/email/contact/addresses faltan, d√©jalos en null.
- Si addresses est√° vac√≠o o no existe -> "addresses": null.
- Si purchases est√° vac√≠o o no existe -> "purchases": null.
- shipping es opcional; si falta -> "shipping": null.
- Asegura que los tipos principales sean correctos (quantity entero, country uno de US/CA/GB/ES/CO/DE/FR).

Texto:
# Orden de Compra de Melissa Higgins

Hola! Espero que todo est√© bien por all√≠. Estoy escribiendo para confirmar un pedido que he realizado recientemente. Melissa, que es mi amiga, se ha encargado de hacer algunas compritas interesantes. 

## Informaci√≥n del Comprador

Melissa Higgins es el nombre de la compradora. La √∫ltima vez que hablamos, me coment√≥ que su email es **melissa.higgins@gmail.com**. Si quisieran contactarl

In [None]:
# 13. Guardar modelo adaptado final (LoRA adapter + tokenizer)
model.save_pretrained(os.path.join(OUTPUT_DIR, 'final_lora_adapter'))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'final_tokenizer'))
print('Artefactos guardados en', OUTPUT_DIR)