In [None]:
# Data Augmentation: Sinónimos con spaCy (sin paraphrasing)

import random

import spacy

# Load the small Spanish and English spaCy pipelines once at module level;
# synonym_augmentation() selects one of them based on its `lang` argument.
nlp_es = spacy.load('es_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

def synonym_augmentation(text, lang='en'):
    """Tokenize ``text`` with spaCy and re-join the tokens with single spaces.

    Args:
        text: Sentence to process.
        lang: ``'en'`` selects the English pipeline (``nlp_en``); any other
            value selects the Spanish one (``nlp_es``).

    Returns:
        str: The token texts joined by a single space. The original spacing
        between tokens is therefore not preserved.

    NOTE(review): no synonym substitution is performed. The previous
    implementation branched on NOUN/VERB tokens but appended the unchanged
    token text in both branches; this version keeps exactly that behaviour.
    A synonym source would be needed to make the augmentation effective.
    """
    pipeline = nlp_es if lang != 'en' else nlp_en
    return ' '.join(tok.text for tok in pipeline(text))

# Usage example: show each sample sentence next to its augmented version.
sample_text_en = "This is a sample sentence for data augmentation."
sample_text_es = "Esta es una frase de ejemplo para aumentar datos."
for _tag, _sample in (('EN', sample_text_en), ('ES', sample_text_es)):
    print(f'Original {_tag}:', _sample)
    print(f'Sinónimos {_tag}:', synonym_augmentation(_sample, lang=_tag.lower()))

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm import tqdm
import math
import os
import time
from torch.optim.lr_scheduler import OneCycleLR, ReduceLROnPlateau



In [None]:
# 1. MODELO TRANSFORMER

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    The table ``pe`` has shape ``(max_len, 1, d_model)``: even feature indices
    hold ``sin(position * div_term)`` and odd feature indices hold the matching
    cosine. It is registered as a buffer so that it follows the module across
    devices without being a trainable parameter.

    Args:
        d_model: Feature dimension of the embeddings (even or odd).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd (cosine) column than even (sine)
        # column, so trim div_term to fit. For even d_model the slice is a no-op
        # and the table is identical to the previous implementation.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the encoding for the first ``x.size(0)`` positions and apply dropout.

        Args:
            x: Sequence-first tensor ``(seq_len, batch, d_model)``; ``seq_len``
                must not exceed ``max_len``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer with one embedding table shared by source and target.

    All attention layers are built with ``batch_first=True`` and
    ``norm_first=True``. Source and target token ids go through the same
    pipeline: embedding scaled by ``sqrt(d_model)``, dropout, LayerNorm and
    positional encoding. The decoder output is normalized once more and
    projected to vocabulary logits.
    """

    def __init__(self, vocab_size: int, d_model: int, nhead: int, num_encoder_layers: int, num_decoder_layers: int, dim_feedforward: int, dropout: float = 0.1):
        super().__init__()
        self.d_model = d_model

        # Input pipeline shared by the encoder and decoder sides.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.embedding_dropout = nn.Dropout(p=dropout)
        self.input_norm = nn.LayerNorm(d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Encoder and decoder stacks use the same layer hyperparameters.
        layer_kwargs = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=num_encoder_layers,
            norm=nn.LayerNorm(d_model),
        )
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(**layer_kwargs),
            num_layers=num_decoder_layers,
            norm=nn.LayerNorm(d_model),
        )

        # Output head.
        self.output_norm = nn.LayerNorm(d_model)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self._init_weights()

    def _init_weights(self):
        """Xavier-initialize every parameter with more than one dimension, then
        re-draw the embedding table from N(0, d_model ** -0.5)."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)
        nn.init.normal_(self.embedding.weight, mean=0, std=self.d_model ** -0.5)

    def generate_square_subsequent_mask(self, sz: int, device: torch.device) -> torch.Tensor:
        """Return an ``(sz, sz)`` mask that is ``-inf`` strictly above the diagonal
        and ``0`` elsewhere, so a target position cannot attend to later ones."""
        all_blocked = torch.ones(sz, sz, device=device) * float('-inf')
        return torch.triu(all_blocked, diagonal=1)

    def _embed(self, tokens: torch.Tensor) -> torch.Tensor:
        """Turn batch-first token ids into normalized, position-encoded embeddings."""
        hidden = self.embedding(tokens) * math.sqrt(self.d_model)
        hidden = self.input_norm(self.embedding_dropout(hidden))
        # The positional encoder indexes positions on dim 0, so switch to
        # sequence-first for that call and back to batch-first afterwards.
        return self.pos_encoder(hidden.transpose(0, 1)).transpose(0, 1)

    def forward(self, src: torch.Tensor, tgt: torch.Tensor, src_padding_mask: torch.Tensor, tgt_padding_mask: torch.Tensor, tgt_mask: torch.Tensor) -> torch.Tensor:
        """Encode ``src``, decode ``tgt`` against it and return vocabulary logits.

        Args:
            src: Source token ids, batch-first.
            tgt: Target (decoder input) token ids, batch-first.
            src_padding_mask: Key padding mask for the source; also used as the
                memory key padding mask in the decoder.
            tgt_padding_mask: Key padding mask for the target.
            tgt_mask: Attention mask over target positions (e.g. the causal
                mask from ``generate_square_subsequent_mask``).

        Returns:
            Output of the final linear layer, with ``vocab_size`` features in
            the last dimension.
        """
        memory = self.transformer_encoder(self._embed(src), src_key_padding_mask=src_padding_mask)
        decoded = self.transformer_decoder(
            self._embed(tgt),
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
        )
        return self.fc_out(self.output_norm(decoded))

In [None]:
# 2. CONFIGURACIÓN Y PREPROCESAMIENTO DE DATOS (CHUNKED TRAINING)

def setup_device():
    """Return the CUDA device when available, otherwise the CPU device.

    When CUDA is selected, cuDNN benchmark mode is switched on as a side effect.
    """
    if not torch.cuda.is_available():
        return torch.device("cpu")
    torch.backends.cudnn.benchmark = True
    return torch.device("cuda")

DEVICE = setup_device()

# Special-token ids sit directly above the 256 possible byte values (0-255).
PAD_IDX = 256
SOS_IDX = 257
EOS_IDX = 258
# Evaluates to 262. The 256 byte values plus the 3 special tokens only account
# for 259 ids. NOTE(review): the 3 extra ids look unused in this notebook;
# confirm before changing the value, because it sets the embedding and output
# layer sizes and therefore the shape of any saved checkpoint.
VOCAB_SIZE_BYTES = 259 + 3

# Model architecture.
D_MODEL = 512
NHEAD = 8
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
DIM_FEEDFORWARD = 2048
DROPOUT = 0.1

# Optimization schedule.
EPOCHS_PER_CHUNK = 10
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = 2
LEARNING_RATE = 0.0005
WARMUP_STEPS = 500
MAX_GRAD_NORM = 1.0

# Mixed precision is enabled exactly when CUDA is available.
USE_MIXED_PRECISION = torch.cuda.is_available()

def encode_bytes(text, max_bytes=128):
    """Encode ``text`` as UTF-8 byte ids wrapped in SOS/EOS markers.

    The byte sequence is limited to ``max_bytes``. Truncation happens on a
    character boundary: if the limit falls inside a multi-byte UTF-8 character
    (common with accented Spanish letters), that whole character is dropped
    rather than leaving a partial, undecodable byte sequence in the data.

    Args:
        text: String to encode.
        max_bytes: Maximum number of payload bytes kept (default 128, the
            previously hard-coded limit).

    Returns:
        list[int]: ``[SOS_IDX, b0, b1, ..., EOS_IDX]`` with every ``b`` in 0-255.
    """
    raw = text.encode('utf-8')
    limit = max(max_bytes, 0)
    if len(raw) > limit:
        cut = limit
        # UTF-8 continuation bytes look like 0b10xxxxxx. If the first dropped
        # byte is one of them, the cut splits a character: back up to its start.
        while cut > 0 and (raw[cut] & 0xC0) == 0x80:
            cut -= 1
        raw = raw[:cut]
    return [SOS_IDX] + list(raw) + [EOS_IDX]

#Chunked data 
from datasets import load_dataset

# Five consecutive 20% slices of the training split, written in the split-slicing
# syntax passed to `load_dataset(..., split=...)`.
chunk_slices = ["train[:20%]"] + [
    f"train[{start}%:{start + 20}%]" for start in range(20, 100, 20)
]

def augment_translation_pair_full(pair, lang='en'):
    """Expand one ``{'en': ..., 'es': ...}`` pair into four training pairs.

    Args:
        pair: Mapping with the English text under ``'en'`` and the Spanish
            text under ``'es'``.
        lang: Unused; kept so existing callers keep working.

    Returns:
        list[dict]: In this order: (original, original), (augmented en,
        original es), (original en, augmented es), (augmented en, augmented es).
    """
    en_text = pair['en']
    es_text = pair['es']
    en_variants = (en_text, synonym_augmentation(en_text, lang='en'))
    es_variants = (es_text, synonym_augmentation(es_text, lang='es'))
    # The Spanish variant changes in the outer loop so the result order matches
    # the documented one.
    return [{'en': en, 'es': es} for es in es_variants for en in en_variants]

def prepare_chunk_data(raw_datasets):
    """Build byte-encoded, bidirectional train/validation examples for one chunk.

    The 90/10 split is made on the original sentence pairs *before*
    augmentation and direction duplication. Previously the split came last, so
    augmented variants and the reverse direction of a single sentence pair
    could land on both sides and leak training data into validation.

    Args:
        raw_datasets: Iterable of items that carry the sentence pair under
            ``item['translation']`` as ``{'en': ..., 'es': ...}``.

    Returns:
        tuple[list[dict], list[dict]]: ``(train_data, val_data)``. Each example
        is ``{'src_bytes': [...], 'tgt_bytes': [...]}``; every augmented pair
        appears once as en->es and once as es->en. With very few input pairs
        the training side can be empty, since the split rounds down.
    """
    import random

    def _encode_both_directions(pairs):
        """Augment each pair and emit it in both translation directions."""
        encoded = []
        for pair in pairs:
            for ex in augment_translation_pair_full(pair):
                # Encode each side once; both direction dicts share the
                # (read-only) id lists.
                en_ids = encode_bytes(ex['en'])
                es_ids = encode_bytes(ex['es'])
                encoded.append({'src_bytes': en_ids, 'tgt_bytes': es_ids})
                encoded.append({'src_bytes': es_ids, 'tgt_bytes': en_ids})
        return encoded

    pairs = [item['translation'] for item in raw_datasets]
    random.shuffle(pairs)
    split_idx = int(len(pairs) * 0.9)

    train_data = _encode_both_directions(pairs[:split_idx])
    val_data = _encode_both_directions(pairs[split_idx:])
    # Mix augmented variants and directions so neighbouring training examples
    # do not all derive from the same sentence.
    random.shuffle(train_data)
    return train_data, val_data

def collate_fn(batch):
    """Pad the source and target id sequences of a batch to a common length.

    Args:
        batch: Sequence of examples with ``'src_bytes'`` and ``'tgt_bytes'``
            lists of token ids.

    Returns:
        tuple[Tensor, Tensor]: ``(src_padded, tgt_padded)``, both batch-first
        and right-padded with ``PAD_IDX``.
    """
    tensor_pairs = [
        (torch.tensor(example['src_bytes']), torch.tensor(example['tgt_bytes']))
        for example in batch
    ]
    src_seqs = [src for src, _ in tensor_pairs]
    tgt_seqs = [tgt for _, tgt in tensor_pairs]
    pad = nn.utils.rnn.pad_sequence
    return (
        pad(src_seqs, batch_first=True, padding_value=PAD_IDX),
        pad(tgt_seqs, batch_first=True, padding_value=PAD_IDX),
    )

In [None]:
# Build the model and optimizer before any checkpoint is loaded. Each
# hyperparameter keeps its current global value when one exists and otherwise
# falls back to the default given here.
VOCAB_SIZE = globals().get('VOCAB_SIZE_BYTES', 15000 + 50)
D_MODEL = globals().get('D_MODEL', 512)
NHEAD = globals().get('NHEAD', 8)
NUM_ENCODER_LAYERS = globals().get('NUM_ENCODER_LAYERS', 6)
NUM_DECODER_LAYERS = globals().get('NUM_DECODER_LAYERS', 6)
DIM_FEEDFORWARD = globals().get('DIM_FEEDFORWARD', 2048)
DROPOUT = globals().get('DROPOUT', 0.1)
LEARNING_RATE = globals().get('LEARNING_RATE', 0.0005)
PAD_IDX = globals().get('PAD_IDX', 256)
import torch.nn as nn
model = TransformerModel(
    VOCAB_SIZE,
    D_MODEL,
    NHEAD,
    NUM_ENCODER_LAYERS,
    NUM_DECODER_LAYERS,
    DIM_FEEDFORWARD,
    DROPOUT,
).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

In [None]:
# Preparación profesional de los chunks para entrenamiento con opus100 (en-es)
from datasets import load_dataset

# Five consecutive 20% slices of the opus100 training split.
opus100_chunk_slices = ["train[:20%]"] + [
    f"train[{start}%:{start + 20}%]" for start in range(20, 100, 20)
]

def augment_translation_pair_full_opus100(pair):
    """Expand one opus100 record into four English/Spanish training pairs.

    Args:
        pair: Record shaped like ``{'translation': {'en': ..., 'es': ...}}``.

    Returns:
        list[dict]: In this order: (original, original), (augmented en,
        original es), (original en, augmented es), (augmented en, augmented es).

    Raises:
        KeyError: If ``'translation'`` is missing, or if either the ``'en'``
            or ``'es'`` entry is missing.
    """
    translation = pair.get('translation')
    if translation is None:
        raise KeyError(f"No se encontró la clave 'translation' en el par: {pair}")

    en_text, es_text = translation.get('en'), translation.get('es')
    if en_text is None or es_text is None:
        raise KeyError(f"No se encontraron las claves 'en' y 'es' en el par: {pair}")

    en_variants = (en_text, synonym_augmentation(en_text, lang='en'))
    es_variants = (es_text, synonym_augmentation(es_text, lang='es'))
    # The Spanish variant changes in the outer loop so the result order matches
    # the documented one.
    return [{'en': en, 'es': es} for es in es_variants for en in en_variants]

def prepare_chunk_data_opus100(raw_datasets):
    """Build byte-encoded, bidirectional train/validation examples for one opus100 chunk.

    The 90/10 split is made on the original records *before* augmentation and
    direction duplication. Previously the split came last, so augmented
    variants and the reverse direction of a single sentence pair could land on
    both sides and leak training data into validation.

    Args:
        raw_datasets: Iterable of opus100 records
            (``{'translation': {'en': ..., 'es': ...}}``).

    Returns:
        tuple[list[dict], list[dict]]: ``(train_data, val_data)``. Each example
        is ``{'src_bytes': [...], 'tgt_bytes': [...]}``; every augmented pair
        appears once as en->es and once as es->en. With very few input records
        the training side can be empty, since the split rounds down.
    """
    import random

    def _encode_both_directions(records):
        """Augment each record and emit it in both translation directions."""
        encoded = []
        for record in records:
            for ex in augment_translation_pair_full_opus100(record):
                # Encode each side once; both direction dicts share the
                # (read-only) id lists.
                en_ids = encode_bytes(ex['en'])
                es_ids = encode_bytes(ex['es'])
                encoded.append({'src_bytes': en_ids, 'tgt_bytes': es_ids})
                encoded.append({'src_bytes': es_ids, 'tgt_bytes': en_ids})
        return encoded

    records = list(raw_datasets)
    random.shuffle(records)
    split_idx = int(len(records) * 0.9)

    train_data = _encode_both_directions(records[:split_idx])
    val_data = _encode_both_directions(records[split_idx:])
    # Mix augmented variants and directions so neighbouring training examples
    # do not all derive from the same sentence.
    random.shuffle(train_data)
    return train_data, val_data

# Build one (train, validation) DataLoader pair per opus100 slice.
opus100_dataloaders = []
for chunk_idx, chunk_slice in enumerate(opus100_chunk_slices):
    print(f"\n=== Preparando chunk {chunk_idx+1}/{len(opus100_chunk_slices)}: {chunk_slice} ===")
    raw_datasets = load_dataset("opus100", "en-es", split=chunk_slice)
    train_data, val_data = prepare_chunk_data_opus100(raw_datasets)
    # shuffle=True reshuffles the training examples at the start of every epoch;
    # without it each of the EPOCHS_PER_CHUNK epochs would replay exactly the
    # same batches in the same order. Validation order is left untouched.
    train_dl = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_dl = DataLoader(val_data, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    opus100_dataloaders.append((train_dl, val_dl))
    print(f"  - Ejemplos de entrenamiento: {len(train_data)}")
    print(f"  - Ejemplos de validación: {len(val_data)}")
    print(f"  - Batches de entrenamiento: {len(train_dl)}")
    print(f"  - Batches de validación: {len(val_dl)}")

In [None]:
# Fine-tuning incremental con los chunks de opus100, comenzando desde best_model_doctrans_chunk5.pt
import os
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
import time

# Load the initial model and optimizer state from best_model_doctrans_chunk5.pt.
# The path is a hard-coded absolute Windows path; when the file is missing the
# run simply continues with whatever weights `model` currently holds.
init_ckpt_path = r'C:\Users\Luis\Downloads\entrenamiento\best_model_doctrans_chunk5.pt'
if os.path.exists(init_ckpt_path):
    checkpoint = torch.load(init_ckpt_path, map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Pesos iniciales cargados desde {init_ckpt_path}.")
else:
    print(f"No se encontró {init_ckpt_path}, se inicia con pesos actuales.")

# Per-epoch loss history across all chunks, plus one summary entry per chunk.
EPOCHS_PER_CHUNK = 10
train_losses = []
val_losses = []
best_val_losses_por_chunk = []
last_train_losses_por_chunk = []

for chunk_idx, (train_dl, val_dl) in enumerate(opus100_dataloaders):
    # For every chunk after the first, restore the checkpoint written for the
    # previous chunk. Files are named with a 1-based chunk number, so
    # `chunk{chunk_idx}` is the previous chunk's best checkpoint.
    if chunk_idx > 0:
        prev_ckpt_path = f'best_model_opus100_chunk{chunk_idx}.pt'
        if os.path.exists(prev_ckpt_path):
            checkpoint = torch.load(prev_ckpt_path, map_location=DEVICE)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            print(f"Pesos cargados desde {prev_ckpt_path} para chunk {chunk_idx+1}.")
        else:
            print(f"No se encontró {prev_ckpt_path}, se continúa con los pesos actuales.")

    print(f"\n=== Fine-tuning en chunk {chunk_idx+1}/{len(opus100_dataloaders)} ===")
    # A fresh one-cycle schedule per chunk, sized as one scheduler step per
    # training batch over all epochs of the chunk.
    total_steps = len(train_dl) * EPOCHS_PER_CHUNK
    scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, total_steps=total_steps, pct_start=0.1, anneal_strategy='cos')
    best_val_loss = float('inf')
    last_train_loss = None
    # Early stopping: give up on the chunk after `patience` consecutive epochs
    # without a new best validation loss.
    patience = 5
    patience_counter = 0
    for epoch in range(1, EPOCHS_PER_CHUNK + 1):
        print(f"\n--- Epoch {epoch}/{EPOCHS_PER_CHUNK} ---")
        start_time = time.time()
        # NOTE(review): `train_epoch`, `evaluate` and `scaler` are not defined in
        # the visible part of this notebook; presumably they come from another
        # cell. Confirm they exist before running this cell on a fresh kernel.
        train_loss = train_epoch(model, optimizer, nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1), train_dl, scheduler, scaler)
        val_loss = evaluate(model, nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1), val_dl)
        elapsed = time.time() - start_time
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        last_train_loss = train_loss
        print(f"Train loss: {train_loss:.4f}")
        print(f"Validation loss: {val_loss:.4f}")
        print(f"Current learning rate: {scheduler.get_last_lr()[0]:.6f}")
        print(f"Epoch time: {elapsed:.2f} seconds")
        if val_loss < best_val_loss:
            # New best for this chunk: reset patience and overwrite the chunk's
            # checkpoint file (1-based chunk number in the name).
            best_val_loss = val_loss
            patience_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_val_loss': best_val_loss,
            }, f'best_model_opus100_chunk{chunk_idx+1}.pt')
            print(f"Model saved (val_loss: {val_loss:.4f})")
        else:
            patience_counter += 1
            print(f"Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print(f"Early stopping activated at epoch {epoch}")
                break
    best_val_losses_por_chunk.append(best_val_loss)
    last_train_losses_por_chunk.append(last_train_loss)
print("Fine-tuning incremental completado con opus100")

In [None]:
# Fine-tuning secuencial: opus100 -> NickyNicky/Colossal
import os
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm
import time

def train_with_dataloaders(dataloaders, checkpoint_prefix, initial_ckpt_path=None):
    """Fine-tune the global ``model`` chunk by chunk with early stopping.

    For each ``(train_dl, val_dl)`` pair a fresh ``OneCycleLR`` schedule is
    created and up to ``EPOCHS_PER_CHUNK`` epochs are run. The best epoch of
    each chunk (lowest validation loss) is saved to
    ``{checkpoint_prefix}_chunk{k}.pt`` with ``k`` starting at 1. Every chunk
    after the first starts from the previous chunk's saved checkpoint when that
    file exists. After the last chunk, the last chunk's checkpoint is reloaded
    into ``model`` and ``optimizer`` so a following training stage starts from
    the best weights.

    NOTE(review): relies on ``train_epoch``, ``evaluate`` and ``scaler`` being
    defined at module level; they are not visible in this part of the notebook.

    Args:
        dataloaders: Sequence of ``(train_dl, val_dl)`` pairs, one per chunk.
            May be empty, in which case no training happens.
        checkpoint_prefix: Filename prefix for the per-chunk checkpoints.
        initial_ckpt_path: Optional checkpoint loaded into ``model`` and
            ``optimizer`` before the first chunk. A missing file is reported
            and otherwise ignored.

    Returns:
        dict: Loss history with the keys ``'train_losses'`` and
        ``'val_losses'`` (one entry per epoch run) and
        ``'best_val_losses_por_chunk'`` and ``'last_train_losses_por_chunk'``
        (one entry per chunk). Earlier versions returned ``None`` and discarded
        this data.
    """
    global model, optimizer

    def _restore(path):
        """Load model and optimizer state from the checkpoint at ``path``."""
        state = torch.load(path, map_location=DEVICE)
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])

    train_losses = []
    val_losses = []
    best_val_losses_por_chunk = []
    last_train_losses_por_chunk = []
    patience = 5  # epochs without improvement tolerated before early stopping

    # Optional warm start.
    if initial_ckpt_path is not None:
        if os.path.exists(initial_ckpt_path):
            _restore(initial_ckpt_path)
            print(f"Pesos iniciales cargados desde {initial_ckpt_path}.")
        else:
            print(f"No se encontró {initial_ckpt_path}, se inicia con pesos actuales.")

    for chunk_idx, (train_dl, val_dl) in enumerate(dataloaders):
        # Checkpoint names are 1-based, so `chunk{chunk_idx}` is the file saved
        # for the previous chunk.
        if chunk_idx > 0:
            prev_ckpt_path = f'{checkpoint_prefix}_chunk{chunk_idx}.pt'
            if os.path.exists(prev_ckpt_path):
                _restore(prev_ckpt_path)
                print(f"Pesos cargados desde {prev_ckpt_path} para chunk {chunk_idx+1}.")
            else:
                print(f"No se encontró {prev_ckpt_path}, se continúa con los pesos actuales.")

        print(f"\n=== Fine-tuning en chunk {chunk_idx+1}/{len(dataloaders)} ===")
        total_steps = len(train_dl) * EPOCHS_PER_CHUNK
        scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, total_steps=total_steps, pct_start=0.1, anneal_strategy='cos')
        # The loss module is stateless, so one instance serves both training
        # and evaluation for the whole chunk.
        criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
        best_val_loss = float('inf')
        last_train_loss = None
        patience_counter = 0
        for epoch in range(1, EPOCHS_PER_CHUNK + 1):
            print(f"\n--- Epoch {epoch}/{EPOCHS_PER_CHUNK} ---")
            start_time = time.time()
            train_loss = train_epoch(model, optimizer, criterion, train_dl, scheduler, scaler)
            val_loss = evaluate(model, criterion, val_dl)
            elapsed = time.time() - start_time
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            last_train_loss = train_loss
            print(f"Train loss: {train_loss:.4f}")
            print(f"Validation loss: {val_loss:.4f}")
            print(f"Current learning rate: {scheduler.get_last_lr()[0]:.6f}")
            print(f"Epoch time: {elapsed:.2f} seconds")
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'best_val_loss': best_val_loss,
                }, f'{checkpoint_prefix}_chunk{chunk_idx+1}.pt')
                print(f"Model saved (val_loss: {val_loss:.4f})")
            else:
                patience_counter += 1
                print(f"Patience: {patience_counter}/{patience}")
                if patience_counter >= patience:
                    print(f"Early stopping activated at epoch {epoch}")
                    break
        best_val_losses_por_chunk.append(best_val_loss)
        last_train_losses_por_chunk.append(last_train_loss)

    print(f"Fine-tuning incremental completado con {checkpoint_prefix}")
    # Only report a final loss when at least one chunk was trained; with an
    # empty `dataloaders` the old code hit an unbound `best_val_loss` here.
    if best_val_losses_por_chunk:
        print(f"Mejor validation loss final: {best_val_losses_por_chunk[-1]:.4f}")

    # Leave the best weights of the last chunk loaded for the next stage.
    last_ckpt_path = f'{checkpoint_prefix}_chunk{len(dataloaders)}.pt'
    if os.path.exists(last_ckpt_path):
        _restore(last_ckpt_path)
        print(f"Pesos cargados desde {last_ckpt_path} para continuar el siguiente entrenamiento.")
    else:
        print(f"No se encontró {last_ckpt_path}, se continúa con los pesos actuales.")

    return {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'best_val_losses_por_chunk': best_val_losses_por_chunk,
        'last_train_losses_por_chunk': last_train_losses_por_chunk,
    }

# --- Entrenamiento secuencial ---
# Stage 1: fine-tune on the opus100 chunks, warm-starting from the doctrans
# checkpoint at a hard-coded absolute Windows path.
# NOTE(review): an earlier cell already runs an opus100 fine-tuning loop that
# writes the same best_model_opus100_chunk*.pt files; confirm that running both
# is intended.
print("Entrenando con opus100...")
train_with_dataloaders(opus100_dataloaders, 'best_model_opus100', initial_ckpt_path=r'C:\Users\Luis\Downloads\entrenamiento\best_model_doctrans_chunk5.pt')

# Preparar dataloaders para NickyNicky/Colossal
# Five consecutive 20% slices of the Colossal dataset's training split.
colossal_chunk_slices = ["train[:20%]"] + [
    f"train[{start}%:{start + 20}%]" for start in range(20, 100, 20)
]
def augment_translation_pair_full_colossal(pair):
    """Expand one Colossal dataset record into four English/Spanish training pairs.

    The English text is taken from the first truthy value among the keys
    ``'english'``, ``'en'``, ``'source'`` and the Spanish text from
    ``'spanish'``, ``'es'``, ``'target'``. If either lookup ends in ``None``,
    the record is read as a preference-style row instead: ``'task'`` gives the
    direction (``'en_es'`` or ``'es_en'``), ``'prompt'`` holds the text in the
    source language of that direction and ``'chosen'`` the text in the target
    language.

    Args:
        pair: Mapping for one dataset row.

    Returns:
        list[dict]: In this order: (original, original), (augmented en,
        original es), (original en, augmented es), (augmented en, augmented es).

    Raises:
        KeyError: If neither key layout yields both texts.
    """
    en_text = pair.get('english') or pair.get('en') or pair.get('source')
    es_text = pair.get('spanish') or pair.get('es') or pair.get('target')

    if en_text is None or es_text is None:
        task = pair.get('task')
        prompt = pair.get('prompt')
        chosen = pair.get('chosen')
        if task not in ('en_es', 'es_en') or not (prompt and chosen):
            raise KeyError(f"No se encontraron las claves correctas en el par: {pair}")
        # The prompt is in the task's source language, the chosen answer in
        # its target language.
        en_text, es_text = (prompt, chosen) if task == 'en_es' else (chosen, prompt)

    en_variants = (en_text, synonym_augmentation(en_text, lang='en'))
    es_variants = (es_text, synonym_augmentation(es_text, lang='es'))
    # The Spanish variant changes in the outer loop so the result order matches
    # the documented one.
    return [{'en': en, 'es': es} for es in es_variants for en in en_variants]
def prepare_chunk_data_colossal(raw_datasets):
    """Build byte-encoded, bidirectional train/validation examples for one Colossal chunk.

    The 90/10 split is made on the original records *before* augmentation and
    direction duplication. Previously the split came last, so augmented
    variants and the reverse direction of a single sentence pair could land on
    both sides and leak training data into validation.

    Args:
        raw_datasets: Iterable of dataset rows accepted by
            ``augment_translation_pair_full_colossal``.

    Returns:
        tuple[list[dict], list[dict]]: ``(train_data, val_data)``. Each example
        is ``{'src_bytes': [...], 'tgt_bytes': [...]}``; every augmented pair
        appears once as en->es and once as es->en. With very few input records
        the training side can be empty, since the split rounds down.
    """
    import random

    def _encode_both_directions(records):
        """Augment each record and emit it in both translation directions."""
        encoded = []
        for record in records:
            for ex in augment_translation_pair_full_colossal(record):
                # Encode each side once; both direction dicts share the
                # (read-only) id lists.
                en_ids = encode_bytes(ex['en'])
                es_ids = encode_bytes(ex['es'])
                encoded.append({'src_bytes': en_ids, 'tgt_bytes': es_ids})
                encoded.append({'src_bytes': es_ids, 'tgt_bytes': en_ids})
        return encoded

    records = list(raw_datasets)
    random.shuffle(records)
    split_idx = int(len(records) * 0.9)

    train_data = _encode_both_directions(records[:split_idx])
    val_data = _encode_both_directions(records[split_idx:])
    # Mix augmented variants and directions so neighbouring training examples
    # do not all derive from the same sentence.
    random.shuffle(train_data)
    return train_data, val_data
# Build one (train, validation) DataLoader pair per Colossal slice.
colossal_dataloaders = []
for chunk_idx, chunk_slice in enumerate(colossal_chunk_slices):
    print(f"\n=== Preparando chunk {chunk_idx+1}/{len(colossal_chunk_slices)}: {chunk_slice} ===")
    raw_datasets = load_dataset("NickyNicky/Colossal_Translation_Spanish_to_English_AND_English_to_Spanish_ORPO_DPO_Gemma", split=chunk_slice)
    train_data, val_data = prepare_chunk_data_colossal(raw_datasets)
    # shuffle=True reshuffles the training examples at the start of every epoch;
    # without it each of the EPOCHS_PER_CHUNK epochs would replay exactly the
    # same batches in the same order. Validation order is left untouched.
    train_dl = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_dl = DataLoader(val_data, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    colossal_dataloaders.append((train_dl, val_dl))
    print(f"  - Ejemplos de entrenamiento: {len(train_data)}")
    print(f"  - Ejemplos de validación: {len(val_data)}")
    print(f"  - Batches de entrenamiento: {len(train_dl)}")
    print(f"  - Batches de validación: {len(val_dl)}")

# Stage 2: continue from the checkpoint of the last opus100 chunk.
print("Entrenando con NickyNicky/Colossal...")
train_with_dataloaders(colossal_dataloaders, 'best_model_colossal', initial_ckpt_path=f'best_model_opus100_chunk{len(opus100_dataloaders)}.pt')