VERIFICAR RECURSOS DEL SISTEMA

In [None]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see and report how many there are.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("GPUs disponibles:", len(visible_gpus))


In [None]:
import torch

# Report whether PyTorch can reach a CUDA device and, if so, how many
# devices there are and the name of device 0.
if not torch.cuda.is_available():
    print("CUDA no está disponible.")
else:
    cuda_device_total = torch.cuda.device_count()
    print("CUDA está disponible. Número de GPUs:", cuda_device_total)
    print("Nombre de la GPU:", torch.cuda.get_device_name(0))


In [None]:
import os
import multiprocessing
import torch
import tensorflow as tf

def check_cpus():
    """Print the CPU count as reported by ``os`` and by ``multiprocessing``."""
    reports = (
        ("Número de CPUs (os.cpu_count()):", os.cpu_count),
        ("Número de CPUs (multiprocessing.cpu_count()):", multiprocessing.cpu_count),
    )
    # Each counter is queried right before its line is printed.
    for label, count_cpus in reports:
        print(label, count_cpus())

def check_gpu_pytorch():
    """Print PyTorch's view of CUDA: device count and the name of device 0, or a notice if absent."""
    if not torch.cuda.is_available():
        print("CUDA no está disponible en PyTorch.")
        return

    gpu_total = torch.cuda.device_count()
    print("CUDA está disponible. Número de GPUs:", gpu_total)
    # Only the first device (index 0) is named.
    print("Nombre de la GPU:", torch.cuda.get_device_name(0))

def check_gpu_tensorflow():
    """Print the GPUs TensorFlow can see, one name per line, or a notice when there are none."""
    detected = tf.config.experimental.list_physical_devices('GPU')
    if not detected:
        print("CUDA no está disponible en TensorFlow.")
        return

    print("GPUs disponibles en TensorFlow:", len(detected))
    for device in detected:
        print("Nombre de la GPU:", device.name)

if __name__ == "__main__":
    # Run the three resource reports in order: CPUs, PyTorch GPUs, TensorFlow GPUs.
    for run_report in (check_cpus, check_gpu_pytorch, check_gpu_tensorflow):
        run_report()


CARGAR Y PREPROCESAR DATOS

In [None]:
import pandas as pd

# Path to the Excel workbook holding the transcription pairs.
file_path = '/home/jovyan/work/transcripciones_frases_v5.xlsx'

# Load the workbook into a DataFrame.
df = pd.read_excel(file_path)

# Fail fast when a required column is missing. An explicit exception is used
# instead of `assert` because assertions are stripped when Python runs with -O,
# which would let a malformed workbook through silently.
for required_column in ('Texto Original', 'Texto Corregido'):
    if required_column not in df.columns:
        raise ValueError(f"La columna '{required_column}' no existe en el archivo Excel")

df.head()


DIVIDIR DATOS DE ENTRENAMIENTO Y VALIDACIÓN

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
train_df.head()

In [None]:
from datasets import Dataset

# Wrap both pandas splits as `datasets.Dataset` objects, training split first.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

CARGAR EL MODELO Y EL TOKENIZADOR


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq


# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and
# the model. The generic 't5-base' checkpoint is the alternative that was left
# commented out here.
model_name = "vgaraujov/t5-base-spanish"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


In [None]:

# Task prefix prepended to every source sentence before tokenization.
prefix = "corregir: "
# Token budget used for both the tokenized inputs and the tokenized targets.
max_input_length = max_target_length = 300

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of (original, corrected) sentence pairs.

    Args:
        examples: Batch mapping with the columns 'Texto Original' and
            'Texto Corregido', each an iterable of strings.

    Returns:
        The tokenizer output for the prefixed inputs, with an added
        "labels" entry holding the token ids of the corrected sentences.
    """
    inputs = [prefix + text for text in examples['Texto Original']]
    targets = list(examples['Texto Corregido'])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length')

    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding='max_length')

    # NOTE(review): labels are padded to max_target_length with the tokenizer's
    # pad id and are not masked with -100 here, so padding positions presumably
    # count toward the training loss — confirm whether masking is wanted.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Preprocessing: tokenize both splits in batches and drop every original
# column so that only the tokenizer outputs and "labels" remain.
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset)
)


In [None]:
# Sanity checks: show each split's structure followed by its first example.
for split_title, split_data in (
    ("Estructura del dataset de entrenamiento después del preprocesamiento:", train_dataset),
    ("Estructura del dataset de validación después del preprocesamiento:", val_dataset),
):
    print(split_title)
    print(split_data)
    print("Ejemplo de datos preprocesados:")
    print(split_data[0])

In [None]:
import evaluate
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from datasets import Dataset
import numpy as np
import torch

# Collator used to batch the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# sacreBLEU scorer used when computing metrics.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts.

    Returns a pair: the stripped predictions as a flat list, and the
    stripped labels with each one wrapped in its own single-element list.
    """
    return (
        [text.strip() for text in preds],
        [[text.strip()] for text in labels],
    )


def compute_metrics(eval_preds):
    """Score generated sequences against the reference labels with sacreBLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds``
            may itself be a tuple, in which case its first element is used.

    Returns:
        Dict with "bleu" (the sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index; it is not a valid token id and cannot be
    # decoded. It can show up in the predictions as well as in the labels, so
    # both are mapped back to the pad id before decoding. This also keeps
    # gen_len from counting those filler positions as generated tokens.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Clear PyTorch's CUDA cache before and after scoring.
    torch.cuda.empty_cache()
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    torch.cuda.empty_cache()

    return result


class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose ``evaluate`` sets the model's generation length first."""

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval", max_new_tokens=300):
        """Set ``model.config.max_length`` and delegate to ``Seq2SeqTrainer.evaluate``.

        Args:
            eval_dataset: Forwarded unchanged to the parent ``evaluate``.
            ignore_keys: Forwarded unchanged to the parent ``evaluate``.
            metric_key_prefix: Forwarded unchanged to the parent ``evaluate``.
            max_new_tokens: Value written to ``self.model.config.max_length``.

        Returns:
            Whatever the parent ``evaluate`` returns.
        """
        # Despite the parameter name, the value is stored as `max_length` on the
        # model config. The assignment is not undone after evaluation, so it
        # stays on the model afterwards.
        # NOTE(review): only these three arguments are forwarded; any other
        # keyword the installed Seq2SeqTrainer.evaluate accepts cannot be passed
        # through this override — confirm against the transformers version in use.
        self.model.config.max_length = max_new_tokens
        return super().evaluate(eval_dataset=eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)


def check_data(dataset):
    """Print a diagnostic for every example with missing fields or out-of-range token ids.

    An input id is out of range when it is negative or not below
    ``tokenizer.vocab_size``. Labels follow the same rule, except that
    -100 is accepted.
    """
    for record in dataset:
        token_ids = record.get("input_ids", None)
        target_ids = record.get("labels", None)
        if token_ids is None or target_ids is None:
            print("Error: 'input_ids' o 'labels' no encontrados en el ejemplo", record)
            continue

        if any(not 0 <= tok < tokenizer.vocab_size for tok in token_ids):
            print("Error: input_ids fuera de rango", token_ids)
        if any(tok != -100 and not 0 <= tok < tokenizer.vocab_size for tok in target_ids):
            print("Error: labels fuera de rango", target_ids)

# Sanity checks: validate the token ids of both splits, training split first.
for tokenized_split in (train_dataset, val_dataset):
    check_data(tokenized_split)


CONFIGURAR Y ENTRENAR EL MODELO

In [None]:

# NOTE(review): no evaluation strategy is set here, so whether
# `compute_metrics` runs during `train()` depends on the library default —
# confirm that this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir='/home/jovyan/results',
    num_train_epochs=8,
    warmup_steps=100,
    weight_decay=0.00001,                 # weight decay
    learning_rate=2e-5,
    # 16-bit floating point training needs a CUDA device. The rest of this
    # notebook falls back to the CPU when none is present, so do the same here
    # instead of failing while the arguments are built.
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    predict_with_generate=True,
    # Beam width used for generation (author's note: a higher value leads to
    # fewer invented words).
    generation_num_beams=15,
)

trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Train
trainer.train()

In [None]:
# Destination directories: the model and the tokenizer are saved separately.
model_save_path = '/home/jovyan/work/correct_transcription_model_base_v5'
tokenizer_save_path = '/home/jovyan/work/correct_transcription_tokenizer_base_v5'

# Write the model first, then the tokenizer.
model.save_pretrained(save_directory=model_save_path)
tokenizer.save_pretrained(save_directory=tokenizer_save_path)

In [None]:
import numpy as np
import evaluate
from torch.utils.data import DataLoader


metric = evaluate.load("sacrebleu")

def compute_metrics(preds, labels):
    """Decode token-id sequences and score them with sacreBLEU.

    Args:
        preds: Iterable of generated token-id sequences (lengths may differ).
        labels: Iterable of reference token-id sequences.

    Returns:
        The result dict produced by the sacreBLEU metric.
    """
    pad_id = tokenizer.pad_token_id
    # -100 is the label ignore index; it is not a valid token id, so swap it
    # for the pad id before decoding. Sequences without -100 are unaffected.
    preds = [np.where(np.asarray(seq) != -100, seq, pad_id) for seq in preds]
    labels = [np.where(np.asarray(seq) != -100, seq, pad_id) for seq in labels]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    # One single-element reference list per prediction, the same shape the
    # rest of this notebook passes to sacreBLEU.
    decoded_labels = [[label.strip()] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return result

def evaluate_model(model, dataloader):
    """Generate an output for every batch and score all of them with ``compute_metrics``.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.

    Returns:
        Whatever ``compute_metrics`` returns for the collected sequences.
    """
    model.eval()
    generated_ids = []
    reference_ids = []

    for batch in dataloader:
        # Move the three tensors of the batch onto the model's device.
        on_device = {
            key: batch[key].to(model.device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        with torch.no_grad():
            generated = model.generate(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
                max_length=max_target_length,
            )

        # Collect per-example arrays on the CPU.
        generated_ids.extend(generated.cpu().numpy())
        reference_ids.extend(on_device['labels'].cpu().numpy())

    return compute_metrics(generated_ids, reference_ids)

# Batches of two validation examples, collated with the seq2seq collator.
val_dataloader = DataLoader(dataset=val_dataset, batch_size=2, collate_fn=data_collator)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Evaluate on the validation split and show the resulting metrics.
metrics = evaluate_model(model, val_dataloader)
print(metrics)

In [None]:
def generate_predictions(model, dataloader, tokenizer, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Generate beam-search outputs for every batch and decode them with their references.

    Args:
        model: Model exposing ``to()``, ``eval()`` and ``generate()``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        tokenizer: Tokenizer used to decode generated ids and label ids.
        device: Device the model and inputs are moved to. The default is
            resolved once, when the function is defined.

    Returns:
        Tuple ``(predictions, references)`` of decoded strings.
    """
    model.to(device)
    model.eval()
    predictions = []
    references = []

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only decoded, never fed to the model, so they do not need
        # to be moved to `device`. -100 (the label ignore index) is not a valid
        # token id; replace it with the pad id so decoding cannot fail on it.
        labels = batch['labels']
        labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_target_length, num_beams=4, early_stopping=True)

        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(preds)
        references.extend(refs)

    return predictions, references


# Batches of two validation examples, collated with the seq2seq collator.
val_dataloader = DataLoader(dataset=val_dataset, batch_size=2, collate_fn=data_collator)

val_predictions, val_references = generate_predictions(model, val_dataloader, tokenizer)

# Give every prediction its own single-element list of references.
references = [[reference_text] for reference_text in val_references]

metric = evaluate.load("sacrebleu")
bleu_score = metric.compute(
    predictions=val_predictions,
    references=references,
)

print("BLEU score:", bleu_score)

GENERAR PREDICCIONES Y EVALUAR MODELO

In [None]:

def generate_predictions(model, dataloader, tokenizer, max_length=96, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Generate beam-search outputs for every batch and decode them with their references.

    Args:
        model: Model exposing ``to()``, ``eval()`` and ``generate()``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        tokenizer: Tokenizer used to decode generated ids and label ids.
        max_length: ``max_length`` passed to ``model.generate``.
        device: Device the model and inputs are moved to. The default is
            resolved once, when the function is defined.

    Returns:
        Tuple ``(predictions, references)`` of decoded strings.
    """
    model.to(device)
    model.eval()
    predictions = []
    references = []

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only decoded, never fed to the model, so they do not need
        # to be moved to `device`. -100 (the label ignore index) is not a valid
        # token id; replace it with the pad id so decoding cannot fail on it.
        labels = batch['labels']
        labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length, num_beams=4, early_stopping=True)

        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(preds)
        references.extend(refs)

    return predictions, references

# Rebuild the validation loader (batches of two) and decode predictions and references.
val_dataloader = DataLoader(dataset=val_dataset, batch_size=2, collate_fn=data_collator)
val_predictions, val_references = generate_predictions(model, val_dataloader, tokenizer)
# Give every prediction its own single-element list of references.
references = [[reference_text] for reference_text in val_references]

metric = evaluate.load("sacrebleu")
bleu_score = metric.compute(
    predictions=val_predictions,
    references=references,
)
print("BLEU score:", bleu_score)

In [None]:
def generate_predictions(model, tokenizer, dataset, max_length=96, device='cuda' if torch.cuda.is_available() else 'cpu', prefix="corregir: "):
    """Generate a corrected sentence for every raw example in ``dataset``.

    Args:
        model: Model exposing ``to()``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        dataset: Iterable of examples, each with a 'Texto Original' string.
        max_length: Used both as the truncation length of the tokenized
            input and as ``max_length`` for generation.
        device: Device the model and inputs are moved to. The default is
            resolved once, when the function is defined.
        prefix: Task prefix prepended to each input. The default matches the
            prefix used when the training data was preprocessed.

    Returns:
        List with one decoded prediction per example.
    """
    model.to(device)
    model.eval()
    predictions = []
    for example in dataset:
        # The training inputs carried the task prefix, so inference inputs
        # need the same one.
        inputs = tokenizer(prefix + example['Texto Original'], return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        # No gradients are needed for generation; pass the attention mask
        # explicitly alongside the input ids.
        with torch.no_grad():
            outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length, num_beams=4, early_stopping=True)
        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(pred_text)
    return predictions

# The tokenized `val_dataset` no longer has the raw text columns (they were
# dropped with `remove_columns` during preprocessing), so the raw rows of
# `val_df` are used as generation inputs instead.
raw_val_examples = val_df.to_dict("records")
val_predictions = generate_predictions(model, tokenizer, raw_val_examples)
reference_texts = val_df['Texto Corregido'].tolist()

# sacreBLEU takes a list of reference strings per prediction.
references = [[ref] for ref in reference_texts]

# ROUGE and BLEU are different metrics, so each one is loaded separately and
# both receive plain strings rather than pre-split tokens.
# NOTE(review): the "rouge" metric may need the `rouge_score` package
# installed — confirm it is available in this environment.
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")

rouge_score = rouge_metric.compute(predictions=val_predictions, references=reference_texts)
bleu_score = bleu_metric.compute(predictions=val_predictions, references=references)

print("ROUGE score:", rouge_score)
print("BLEU score:", bleu_score)