# Código base

In [None]:
# Instala las bibliotecas necesarias
!pip install -q transformers
!pip install -q torch
!pip install -q tqdm

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import json
import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
# Environment setup: run on the GPU when CUDA is available, otherwise on the CPU
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset file is reachable from the Colab runtime
drive.mount('/content/drive')

# Path to the data file in Google Drive
file_path = '/content/drive/MyDrive/FULL-Seminario/Proyectos/rac_gpt/prompts/pruebas/PREGUNTAS COMPLETAS/RAC1_copia/Copia de PR2_35_G.txt'

# Read and parse the dataset (the file holds JSON despite its .txt extension).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

Mounted at /content/drive


In [None]:
# Split the dataset: the first 80% of the items are used for training and the
# remaining items for validation (order is preserved, no shuffling).
_split_at = int(0.8 * len(data))
train_data, val_data = data[:_split_at], data[_split_at:]

class QADataset(Dataset):
    """Dataset that encodes (question, answer) pairs for span prediction.

    Every item of ``data`` is read through the keys ``"question"`` and
    ``"answer"``. The pair is tokenized as a single two-segment sequence and
    the target span is the answer segment, i.e. the tokens located between the
    first ``[SEP]`` token and the last one.

    Args:
        data: Indexable collection of items exposing "question" and "answer".
        tokenizer: Tokenizer used to encode the pair. It is called directly and
            through ``convert_ids_to_tokens`` and ``tokenize``.
        max_length: Maximum length of the encoded pair; longer pairs are
            truncated by the tokenizer. Defaults to 256.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (question, answer) items."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode one item.

        Returns:
            A tuple ``(inputs, start_positions, end_positions)`` where
            ``inputs`` is a dict with 1-D ``input_ids`` and ``attention_mask``
            tensors, and the two positions are integer token indices (inclusive)
            delimiting the answer inside ``input_ids``.
        """
        item = self.data[index]
        question = item["question"]
        answer = item["answer"]

        # Tokenize the question and the answer as one sequence pair
        encoding = self.tokenizer(
            question,
            answer,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=self.max_length,
        )
        inputs = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        # Locate the answer span in the tokenized sequence. The answer is
        # assumed to start right after the first [SEP] token.
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'])
        start_positions = tokens.index('[SEP]') + 1
        end_positions = start_positions + len(self.tokenizer.tokenize(answer)) - 1

        # When truncation shortened the answer, the length of the full answer
        # would place the end index past the encoded sequence. Clamp it to the
        # last token that precedes the closing [SEP].
        closing_sep = len(tokens) - 1 - tokens[::-1].index('[SEP]')
        end_positions = min(end_positions, closing_sep - 1)

        return inputs, start_positions, end_positions

In [None]:
# Initialize the pre-trained BERT tokenizer and the question-answering model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased', return_dict=True).to(device)

# Optimizer setup. The PyTorch implementation is used because the AdamW class
# exported by transformers is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are passed explicitly to keep the defaults of the
# deprecated class (PyTorch's own defaults differ).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training DataLoader. batch_size=1 because QADataset returns tensors whose
# length varies from item to item.
train_dataloader = DataLoader(QADataset(train_data, tokenizer), batch_size=1, shuffle=True)

# Training configuration
num_epochs = 5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop: fine-tunes `model` for `num_epochs` passes over `train_dataloader`
for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch
    model.train()
    for batch in train_dataloader:
        # Each batch is (inputs dict, start positions, end positions)
        inputs, start_positions, end_positions = batch
        # Move every input tensor and both label tensors to the selected device
        inputs = {key: value.to(device) for key, value in inputs.items()}
        start_positions, end_positions = start_positions.to(device), end_positions.to(device)

        # Forward pass; passing the positions makes the model return a loss
        outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)

        # Loss computation
        loss = outputs.loss

        # Backward pass and optimization: clear old gradients, backpropagate,
        # then update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
# Validation DataLoader: one item per batch, iterated in dataset order
val_dataset = QADataset(val_data, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [None]:
# Función de evaluación
def evaluate_model_rmse(model, dataloader, tokenizer, device):
    model.eval()
    all_predicted_start_positions = []
    all_predicted_end_positions = []
    all_true_start_positions = []
    all_true_end_positions = []

    with torch.no_grad():
        for batch in dataloader:
            inputs, start_positions, end_positions = batch
            inputs = {key: value.to(device) for key, value in inputs.items()}
            start_positions, end_positions = start_positions.to(device), end_positions.to(device)

            # Forward pass
            outputs = model(**inputs)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Apply threshold to get positions with probabilities above the threshold
            predicted_start_positions = torch.argmax(start_logits, dim=1).cpu().numpy()
            predicted_end_positions = torch.argmax(end_logits, dim=1).cpu().numpy()

            all_predicted_start_positions.append(predicted_start_positions)
            all_predicted_end_positions.append(predicted_end_positions)
            all_true_start_positions.append(start_positions.cpu().numpy())
            all_true_end_positions.append(end_positions.cpu().numpy())

    # Concatenate all batches
    all_predicted_start_positions = np.concatenate(all_predicted_start_positions)
    all_predicted_end_positions = np.concatenate(all_predicted_end_positions)
    all_true_start_positions = np.concatenate(all_true_start_positions)
    all_true_end_positions = np.concatenate(all_true_end_positions)

    # Compute RMSE
    rmse_start = mean_squared_error(all_predicted_start_positions, all_true_start_positions, squared=False)
    rmse_end = mean_squared_error(all_predicted_end_positions, all_true_end_positions, squared=False)

    return rmse_start, rmse_end

In [None]:
# Result: evaluate the model on the validation set and report both RMSE values
rmse_start, rmse_end = evaluate_model_rmse(model, val_dataloader, tokenizer, device)

print(f"Avg RMSE Start: {rmse_start}", f"Avg RMSE End: {rmse_end}", sep="\n")

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Avg RMSE Start: 0.0
Avg RMSE End: 33.07831616028845


Interpretación del RMSE (medido en posiciones de token entre el índice predicho y el índice real):

- RMSE = 0: Perfecto, las predicciones coinciden exactamente con las etiquetas reales.
- RMSE bajo: Buen rendimiento, las predicciones son muy cercanas a las etiquetas reales.
- RMSE moderado: Aceptable, las predicciones están en una banda razonable alrededor de las etiquetas reales.
- RMSE alto: Rendimiento pobre, las predicciones están considerablemente lejos de las etiquetas reales.

# Nuevas métricas

In [None]:
# Instala las bibliotecas necesarias
!pip install -q transformers
!pip install -q torch
!pip install -q tqdm
!pip install -q sacrebleu

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m102.4/106.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from transformers import BertForQuestionAnswering, BertTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import json
import numpy as np
import sacrebleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

In [None]:
# Environment setup: select the CUDA device if one is available, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Path to the data file in Google Drive
file_path = '/content/drive/MyDrive/FULL-Seminario/Proyectos/rac_gpt/prompts/pruebas/PREGUNTAS COMPLETAS/RAC1_copia/Copia de PR2_35_G.txt'

# Read and parse the dataset (the file holds JSON despite its .txt extension).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Split the dataset into training (leading 80% of the items) and validation
# (the rest); items keep their original order.
n_train = int(0.8 * len(data))
train_data = data[:n_train]
val_data = data[n_train:]

class QADataset(Dataset):
    """Dataset that encodes (question, answer) pairs for span prediction.

    Every item of ``data`` is read through the keys ``"question"`` and
    ``"answer"``. The pair is tokenized as a single two-segment sequence and
    the target span is the answer segment, i.e. the tokens located between the
    first ``[SEP]`` token and the last one.

    Args:
        data: Indexable collection of items exposing "question" and "answer".
        tokenizer: Tokenizer used to encode the pair. It is called directly and
            through ``convert_ids_to_tokens`` and ``tokenize``.
        max_length: Maximum length of the encoded pair; longer pairs are
            truncated by the tokenizer. Defaults to 256.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (question, answer) items."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode one item.

        Returns:
            A tuple ``(inputs, start_positions, end_positions)`` where
            ``inputs`` is a dict with 1-D ``input_ids`` and ``attention_mask``
            tensors, and the two positions are integer token indices (inclusive)
            delimiting the answer inside ``input_ids``.
        """
        item = self.data[index]
        question = item["question"]
        answer = item["answer"]

        # Tokenize the question and the answer as one sequence pair
        encoding = self.tokenizer(
            question,
            answer,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=self.max_length,
        )
        inputs = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        # Locate the answer span in the tokenized sequence. The answer is
        # assumed to start right after the first [SEP] token.
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'])
        start_positions = tokens.index('[SEP]') + 1
        end_positions = start_positions + len(self.tokenizer.tokenize(answer)) - 1

        # When truncation shortened the answer, the length of the full answer
        # would place the end index past the encoded sequence. Clamp it to the
        # last token that precedes the closing [SEP].
        closing_sep = len(tokens) - 1 - tokens[::-1].index('[SEP]')
        end_positions = min(end_positions, closing_sep - 1)

        return inputs, start_positions, end_positions


In [None]:
# Initialize the pre-trained BERT tokenizer and the question-answering model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased', return_dict=True).to(device)

# Optimizer setup. The PyTorch implementation is used because the AdamW class
# exported by transformers is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are passed explicitly to keep the defaults of the
# deprecated class (PyTorch's own defaults differ).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training and validation DataLoaders. batch_size=1 because QADataset returns
# tensors whose length varies from item to item.
train_dataloader = DataLoader(QADataset(train_data, tokenizer), batch_size=1, shuffle=True)
val_dataloader = DataLoader(QADataset(val_data, tokenizer), batch_size=1, shuffle=False)

# Training configuration
num_epochs = 5

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop: fine-tunes `model` for `num_epochs` passes over `train_dataloader`
for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch
    model.train()
    for batch in train_dataloader:
        # Each batch is (inputs dict, start positions, end positions)
        inputs, start_positions, end_positions = batch
        # Move every input tensor and both label tensors to the selected device
        inputs = {key: value.to(device) for key, value in inputs.items()}
        start_positions, end_positions = start_positions.to(device), end_positions.to(device)

        # Forward pass; passing the positions makes the model return a loss
        outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)

        # Loss computation
        loss = outputs.loss

        # Backward pass and optimization: clear old gradients, backpropagate,
        # then update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
# Función de evaluación
def _decode_span(tokenizer, token_ids, start, end):
    """Decode the tokens of ``token_ids`` from ``start`` to ``end`` inclusive.

    An inverted span (``end < start``) selects no tokens, so the decoded text
    is whatever the tokenizer returns for an empty id list.
    """
    span_ids = token_ids[start:end + 1].tolist()
    return tokenizer.decode(span_ids, skip_special_tokens=True)


def evaluate_model_metrics(model, dataloader, tokenizer, device):
    """Compute corpus BLEU and average METEOR between predicted and true answers.

    The predicted answer of an example is the text decoded from the input
    tokens between the arg-max start position and the arg-max end position;
    the true answer is decoded the same way from the labelled positions.
    (Position indices must not be passed to ``tokenizer.decode`` directly:
    that would decode vocabulary entries, not the answer text.)

    Args:
        model: Callable model exposing ``eval()``; called with the batch inputs
            as keyword arguments and expected to return an object with
            ``start_logits`` and ``end_logits``.
        dataloader: Iterable of ``(inputs, start_positions, end_positions)``
            batches, where ``inputs`` is a dict of tensors containing
            ``input_ids``.
        tokenizer: Tokenizer used to decode the spans and to tokenize the
            decoded answers for METEOR.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        Tuple ``(bleu_score, avg_meteor_score)``.
    """
    model.eval()
    all_predicted_answers = []
    all_true_answers = []

    with torch.no_grad():
        for batch in dataloader:
            inputs, start_positions, end_positions = batch
            inputs = {key: value.to(device) for key, value in inputs.items()}

            # Forward pass (no labels: only the logits are needed)
            outputs = model(**inputs)

            # Predicted span boundaries: arg-max over the sequence dimension
            predicted_starts = torch.argmax(outputs.start_logits, dim=1).tolist()
            predicted_ends = torch.argmax(outputs.end_logits, dim=1).tolist()
            true_starts = start_positions.tolist()
            true_ends = end_positions.tolist()

            # Turn every (start, end) pair into text by slicing the input ids
            batch_input_ids = inputs['input_ids'].cpu()
            for token_ids, p_start, p_end, t_start, t_end in zip(
                batch_input_ids, predicted_starts, predicted_ends, true_starts, true_ends
            ):
                all_predicted_answers.append(_decode_span(tokenizer, token_ids, p_start, p_end))
                all_true_answers.append(_decode_span(tokenizer, token_ids, t_start, t_end))

    # Corpus-level BLEU: hypotheses against a single reference stream
    bleu_score = sacrebleu.corpus_bleu(all_predicted_answers, [all_true_answers]).score

    # METEOR is called on pre-tokenized inputs, so tokenize both sides
    tokenized_predicted_answers = [tokenizer.tokenize(answer) for answer in all_predicted_answers]
    tokenized_true_answers = [tokenizer.tokenize(answer) for answer in all_true_answers]

    # Per-example METEOR (reference first, hypothesis second), then the mean
    meteor_scores = [
        single_meteor_score(true, pred)
        for true, pred in zip(tokenized_true_answers, tokenized_predicted_answers)
    ]
    avg_meteor_score = np.mean(meteor_scores)

    return bleu_score, avg_meteor_score


In [None]:
# Result: evaluate the model on the validation set and report BLEU and METEOR
bleu_score, avg_meteor_score = evaluate_model_metrics(model, val_dataloader, tokenizer, device)

print(f"BLEU Score: {bleu_score}", f"Avg METEOR Score: {avg_meteor_score}", sep="\n")


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


BLEU Score: 100.00000000000004
Avg METEOR Score: 0.9995
