Antes de correr este c√≥digo correr el script ¬¥Creacion_tokenizer_nuevo_modelo.py¬¥

In [1]:
import torch
from transformers import MarianTokenizer, MarianMTModel, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset, Dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device(
	"mps" if torch.backends.mps.is_available()
	else "cuda" if torch.cuda.is_available()
	else "cpu"
)
print(f"üîß Usando dispositivo: {device}")

üîß Usando dispositivo: mps


In [3]:
tokenizer = MarianTokenizer.from_pretrained("tokenizer_gallego_expandido")
model = MarianMTModel.from_pretrained("modelo_gallego_expandido")



In [4]:
sample_gl_text = "Ola mundo, como est√°s?"
tokenized_sample = tokenizer(sample_gl_text, return_tensors="pt")
print(f"Tokens gallego: {tokenizer.convert_ids_to_tokens(tokenized_sample['input_ids'][0])}")

Tokens gallego: ['‚ñÅOl', 'a', '‚ñÅ', 'mun', 'do', '‚ñÅ', ',', 'como', 'est√°', '‚ñÅ', 's', '?', '</s>']


In [5]:
lora_config = LoraConfig(
    r=32,  # Aumentado para mejor capacidad
    lora_alpha=64,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "out_proj",  # Attention layers
        "fc1", "fc2"  # Feed-forward layers
               # Embedding layers - CR√çTICOS para nuevos tokens
        "embed_tokens",  # Input embeddings - donde est√°n los nuevos tokens
        
        # Output projection - NECESARIO para generar tokens gallegos
        "lm_head"  # Final projection to vocabulary
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
print(f"üìä Par√°metros entrenables: {model.num_parameters()}")
print(f"üìä Par√°metros LoRA: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

üìä Par√°metros entrenables: 243537088
üìä Par√°metros LoRA: 8486080




In [6]:
ds_en = load_dataset("openlanguagedata/flores_plus", "eng_Latn", split="dev")
ds_gl = load_dataset("openlanguagedata/flores_plus", "glg_Latn", split="dev")

# Crear dataset paralelo
parallel_data = []
for e, g in zip(ds_en, ds_gl):
    if len(e["text"].strip()) > 0 and len(g["text"].strip()) > 0:
        parallel_data.append({"en": e["text"], "gl": g["text"]})

print(f"‚úÖ Dataset cargado: {len(parallel_data)} pares de frases")
    

‚úÖ Dataset cargado: 997 pares de frases


In [7]:
dataset = Dataset.from_list(parallel_data)

# Dividir en train/validation
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [8]:
def preprocess_function(examples):
    # Procesar en lotes
    sources = examples["en"]
    targets = examples["gl"]
    
    # Tokenizar fuentes y objetivos en una sola llamada
    model_inputs = tokenizer(
        sources,
        text_target=targets,  # Nuevo m√©todo recomendado
        max_length=128, 
        padding="max_length", 
        truncation=True,
        return_tensors="pt"
    )
    
    # Reemplazar padding tokens en labels con -100
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in model_inputs["labels"]
    ]
    
    return model_inputs

# Aplicar preprocesamiento
print("üîÑ Preprocesando datos...")
train_dataset = train_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=train_dataset.column_names
)
eval_dataset = eval_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=eval_dataset.column_names
)


üîÑ Preprocesando datos...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 897/897 [00:01<00:00, 888.52 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 1013.60 examples/s]


In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=15,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=None,  # Desactivar wandb/tensorboard
    dataloader_pin_memory=False,  # Para MPS
)

# 8. Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
trainer.train()
print("‚úÖ Entrenamiento completado!")

# Guardar modelo
print("üíæ Guardando modelo...")
trainer.save_model("opus-en-gl-lora")
tokenizer.save_pretrained("opus-en-gl-lora")

# Guardar modelo fusionado
print("üîó Fusionando y guardando modelo final...")
model = model.merge_and_unload()
model.save_pretrained("opus-en-gl-lora-fused")
tokenizer.save_pretrained("opus-en-gl-lora-fused")

print("‚úÖ Modelo guardado exitosamente!")

Step,Training Loss,Validation Loss
200,4.4738,4.071459
400,3.6019,3.548279
600,3.327,3.375678
800,3.1204,3.315549
1000,2.8853,3.262609
1200,2.8596,3.217035
1400,2.7492,3.187695
1600,2.6755,3.160486
1800,2.6825,3.154196
2000,2.6213,3.146463




‚úÖ Entrenamiento completado!
üíæ Guardando modelo...
üîó Fusionando y guardando modelo final...


```python
from transformers import AutoModelForCausalLM

# Load original tied model
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", tie_word_embeddings=False)

# Set the randomly initialized lm_head to the previously tied embeddings
model.lm_head.weight.data = model.model.embed_tokens.weight.data.clone()

# Save the untied model
untied_model_dir = "dir/for/untied/model"
model.save_pretrained(untied_model_dir)
model.config.save_pretrained(untied_model_dir)

# Now use the original model but in untied format
model = AutoModelForCausalLM.from_pretrained(untied_model_dir)
```



‚úÖ Modelo guardado exitosamente!


Prueba

In [11]:
from evaluate import load

In [12]:
def test_translation_batch(sentences_en, references_gl, model_path="opus-en-gl-lora-fused"):
    try:
        test_model = MarianMTModel.from_pretrained(model_path)
        test_model.eval()
        test_model.to(device)

        test_tokenizer = MarianTokenizer.from_pretrained(model_path)

        inputs = test_tokenizer(sentences_en, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            outputs = test_model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                early_stopping=True,
                do_sample=False
            )

        translations = [test_tokenizer.decode(out, skip_special_tokens=True) for out in outputs]

        for src, pred, ref in zip(sentences_en, translations, references_gl):
            print(f"EN: {src}")
            print(f"GL (pred): {pred}")
            print(f"GL (ref):  {ref}")
            print("-" * 60)

        # Calcular m√©tricas BLEU y chrF en lote
        print("\nüìä M√©tricas globales:")
        bleu = load("bleu")
        chrf = load("chrf")

        bleu_score = bleu.compute(predictions=translations, references=[[r] for r in references_gl])
        chrf_score = chrf.compute(predictions=translations, references=references_gl)

        print(f"BLEU: {bleu_score['bleu']:.4f}")
        print(f"chrF: {chrf_score['score']:.2f}")

    except Exception as e:
        print(f"‚ùå Error en traducci√≥n: {e}")

In [13]:
ds_en = load_dataset("openlanguagedata/flores_plus", "eng_Latn", split="devtest[:50]")
ds_gl = load_dataset("openlanguagedata/flores_plus", "glg_Latn", split="devtest[:50]")

# Tomar una muestra de 50 ejemplos para visualizaci√≥n
sample_en = ds_en["text"][:50]
sample_gl = ds_gl["text"][:50]

test_translation_batch(sample_en, sample_gl)



EN: "We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.
GL (pred): "Non mos camundongos de 4 me se s de camundongos que no n diab√©ticas que utilizan o diab√©tico", frisou o diab√©tico.
GL (ref):  "Agora temos ratos de 4 meses que xa non son diab√©ticos, pero que no seu momento si que o foron", engadiu.
------------------------------------------------------------
EN: Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.
GL (pred): O Dr. Ehud Ur, professor de me dicine da Universi da de de Dalhousie en Halifax, Nova Sc√≥cia, e presidente da divisi√≥n cl√≠nica e cientifica de que a cl√≠nica de diabete de Ca na dian acauterou que o arco de re se nto se est√° no m√°is de diabete.
GL (ref):  O Dr. Ehud Ur, profesor de medicina na Universidade Dalhousie en Halifax (Nova Escocia) e presi

In [15]:
test_sentences = [
    "Hello, how are you?",
    "I love learning languages.",
    "The weather is beautiful today.",
]
test_translation_batch(test_sentences, ["Ola, como est√°s?", "G√∫stame aprender idiomas.", "O tempo √© fermoso hoxe."], model_path="opus-en-gl-lora-fused")



EN: Hello, how are you?
GL (pred): H el lo, e tu?
GL (ref):  Ola, como est√°s?
------------------------------------------------------------
EN: I love learning languages.
GL (pred): Amo a aprendizagem de idiomas.
GL (ref):  G√∫stame aprender idiomas.
------------------------------------------------------------
EN: The weather is beautiful today.
GL (pred): O clima √© bel√≠ssimo para o dia.
GL (ref):  O tempo √© fermoso hoxe.
------------------------------------------------------------

üìä M√©tricas globales:
BLEU: 0.0000
chrF: 24.41
