In [1]:
import sys

# Make the parent directory importable so that project-local packages
# (presumably the `data` package imported by the next cell) can be found.
# The entry is a relative path, so it is resolved against the kernel's
# current working directory.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate "../" entries in sys.path.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
import json
from itertools import islice

from transformers import AutoTokenizer, set_seed

from data.collator import CustomDataCollator

In [3]:
# Seed the Python, NumPy and PyTorch RNGs so that the random masking and any
# later shuffling give the same results on every "Restart & Run All".
SEED = 42
set_seed(SEED)

# Tunable settings for this notebook.
MODEL_NAME = "gpt2"
MLM_PROBABILITY = 0.15

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token so batches can be padded
# (the GPT-2 tokenizer does not define a padding token of its own).
tokenizer.pad_token = tokenizer.eos_token

# Build the collator.
# NOTE(review): static_masking=False presumably means masks are re-sampled on
# every call rather than fixed once -- confirm against data/collator.py.
collator = CustomDataCollator(
    tokenizer,
    mlm_probability=MLM_PROBABILITY,
    static_masking=False,
)

In [4]:
# Read the first few examples from the generated JSONL file.
ruta = "../data/train.jsonl"
N_EJEMPLOS = 8  # number of records to preview

with open(ruta, "r", encoding="utf-8") as f:
    # Blank lines are not valid JSON and would make json.loads raise, so they
    # are skipped. islice stops after N_EJEMPLOS records, so the rest of the
    # file is never parsed.
    lineas = (linea for linea in f if linea.strip())
    ejemplos = [json.loads(linea) for linea in islice(lineas, N_EJEMPLOS)]

# Apply the collator to the examples and show the resulting tensors.
batch = collator(ejemplos)
print("Input IDs:\n", batch["input_ids"])
print("Labels:\n", batch["labels"])


Input IDs:
 tensor([[ 3198,  1110,    11,  ...,  1123,   584, 50256],
        [50256,   484,  5201,  ...,  7342, 50256, 50256],
        [ 5667,  2121, 50256,  ...,   290,   531, 50256],
        ...,
        [  531, 50256,   366,  ...,  1266,  2460, 50256],
        [   11,   257, 50256,  ...,   640,    11, 50256],
        [  287,   257,  1263,  ..., 50256,   467,    11]])
Labels:
 tensor([[-100, -100, -100,  ..., -100, -100,   13],
        [2293, -100, -100,  ..., -100,  262, -100],
        [-100, -100,  319,  ..., -100, -100, -100],
        ...,
        [-100,   11, -100,  ..., -100, -100, -100],
        [-100, -100, 3797,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., 5045, -100, -100]])


In [5]:
from datasets import load_dataset
from torch.utils.data import DataLoader

In [6]:
# Build a Hugging Face dataset from the JSONL file.
dataset = load_dataset("json", data_files="../data/train.jsonl", split="train")

# Custom collator.
collator = CustomDataCollator(tokenizer, mlm_probability=0.15, static_masking=False)

# DataLoader that yields collated batches.
BATCH_SIZE = 4
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)

# Check one batch.
batch = next(iter(loader))
print("input_ids:", batch["input_ids"].shape)
print("labels:", batch["labels"].shape)

# Fail loudly instead of relying on eyeballing the printout: the labels are
# expected to line up with input_ids position by position, and a batch can
# never hold more rows than BATCH_SIZE (the last batch may hold fewer).
assert batch["input_ids"].shape == batch["labels"].shape, "labels must align with input_ids"
assert batch["input_ids"].shape[0] <= BATCH_SIZE, "batch is larger than BATCH_SIZE"

input_ids: torch.Size([4, 128])
labels: torch.Size([4, 128])
