<a href="https://colab.research.google.com/github/Aylin-Rodriguez/Aylin-Rodriguez/blob/main/mBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets scikit-learn openpyxl pandas
!pip install tensorflow-cpu
!pip install "numpy<2.0"


In [None]:
import pandas as pd
import torch
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Location of the Excel workbook; change this to the correct file name.
file_path = '/workspace/dataset_7labels.xlsx'

# Open the workbook once and read both sheets through the same handle.
# The context manager closes the underlying file as soon as the sheets are
# loaded instead of leaving it open for the rest of the kernel session.
with pd.ExcelFile(file_path) as xls:
    df_training = pd.read_excel(xls, sheet_name="Training")
    df_references = pd.read_excel(xls, sheet_name="References")

# Stack both sheets into a single dataframe with a fresh 0..n-1 index.
df_combined = pd.concat([df_training, df_references], ignore_index=True)

# Split into training (70%) and validation (30%); the fixed random_state
# makes the partition reproducible across runs.
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df_combined, test_size=0.3, random_state=42)





In [None]:
from transformers import BertTokenizer

# Load the tokenizer of the multilingual (uncased) BERT checkpoint.
tokenizer_checkpoint = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenize the question/answer pairs of a dataframe.
def preprocess_data(df, tokenizer_fn=None):
    """Tokenize the 'Pregunta'/'Respuesta' pairs of ``df``.

    Each row's 'Pregunta' value is passed as the first text and its
    'Respuesta' value as the second text of a pair. Missing values are
    replaced by the empty string and every other value is converted to
    ``str`` before tokenization.

    Args:
        df: DataFrame with 'Pregunta' and 'Respuesta' columns. It is read
            only; the caller's frame is not modified.
        tokenizer_fn: Optional callable used in place of the notebook-level
            ``tokenizer``. It is called with the two text lists followed by
            the keyword arguments ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``. When omitted, the
            notebook-level ``tokenizer`` is used.

    Returns:
        Whatever the tokenizer returns for the batch (requested with
        ``return_tensors="pt"``, ``max_length=128``, padding and truncation).
    """
    if tokenizer_fn is None:
        tokenizer_fn = tokenizer

    # Fill missing cells *before* casting to str: astype(str) turns NaN into
    # the literal text 'nan', after which fillna('') has nothing left to
    # fill. Working on derived lists also avoids writing into ``df``, which
    # is typically a slice produced by train_test_split.
    questions = df['Pregunta'].fillna('').astype(str).tolist()
    answers = df['Respuesta'].fillna('').astype(str).tolist()

    inputs = tokenizer_fn(
        questions,
        answers,
        max_length=128,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    return inputs


# Tokenize both partitions with the same routine: training first, then validation.
train_inputs, val_inputs = [preprocess_data(split) for split in (train_df, val_df)]




In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

# Custom Dataset that serves tokenized examples together with their labels.
class BERTDataset(Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    ``encodings`` is a mapping from field name to an indexable batch of
    tensors; ``labels`` is an indexable sequence with one target per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy the requested row of every encoding field so the returned
        # sample does not share storage with the stored batch.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()

        # The target is always emitted as a float tensor under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Pull the target column ('Puntos') out of each partition.
train_labels = train_df['Puntos'].values
val_labels = val_df['Puntos'].values

# Min/max statistics are taken from the training targets only.
label_min, label_max = train_labels.min(), train_labels.max()

# Min-max scale both partitions with the training statistics.
train_labels_normalized, val_labels_normalized = (
    (targets - label_min) / (label_max - label_min)
    for targets in (train_labels, val_labels)
)

# Wrap encodings and scaled targets into datasets.
train_dataset = BERTDataset(train_inputs, train_labels_normalized)
val_dataset = BERTDataset(val_inputs, val_labels_normalized)

# Batches of 8; only the training data is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)


In [None]:
from transformers import BertForSequenceClassification

# Load the pretrained multilingual BERT with a sequence-classification head
# that has a single output unit, then move it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels=1,
)
model.to(device)


In [None]:
# Evaluate the model on a loader and collect its predictions.
def evaluate_model_and_store(model, val_loader):
    """Run ``model`` over ``val_loader`` and collect predictions and labels.

    The model is switched to evaluation mode and gradients are disabled.
    Relies on the notebook-level ``device``, ``label_min`` and ``label_max``.

    Args:
        model: Model whose output exposes ``.logits``; called with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        val_loader: Iterable of batches (dicts) holding 'input_ids',
            'attention_mask', 'token_type_ids' and 'labels' tensors.

    Returns:
        A tuple ``(predictions, true_labels, predictions_desnormalized)`` of
        flat lists with one entry per example: the raw model outputs, the
        labels as stored in the loader, and the outputs mapped back through
        ``x * (label_max - label_min) + label_min``.
    """
    model.eval()
    true_labels = []
    predictions = []
    predictions_desnormalized = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            # Labels are only collected, never fed to the model, so they do
            # not need a round-trip through the compute device.
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            # Drop only the trailing output dimension: (batch, 1) -> (batch,).
            # A bare squeeze() would also collapse the batch dimension when a
            # batch holds a single example, producing a 0-d array that
            # list.extend() cannot iterate over.
            logits = outputs.logits.squeeze(-1).cpu().numpy()
            # Undo the min-max scaling applied to the labels.
            desnormalized_logits = logits * (label_max - label_min) + label_min

            predictions.extend(logits)
            true_labels.extend(labels)
            predictions_desnormalized.extend(desnormalized_logits)

    return predictions, true_labels, predictions_desnormalized

In [None]:
from torch.optim import AdamW  # Usar la implementación de PyTorch
from transformers import get_scheduler

epochs = 10

# Optimizer (AdamW with weight decay) and a linear learning-rate schedule
# without warm-up that spans every optimizer step of the run.
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Accumulators: one metrics record per epoch, and one
# (label, prediction, denormalized prediction) tuple per validation example
# per epoch.
results = []
predictions_list = []

# Training loop; the model is evaluated on the validation set after each epoch.
for epoch in range(epochs):
    # Put the model in training mode at the start of every epoch.
    # from_pretrained() hands back the model in eval mode, and
    # evaluate_model_and_store() switches it to eval mode again after each
    # epoch, so without this call dropout would stay disabled throughout.
    model.train()

    # Sum (not mean) of the per-batch losses for this epoch.
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        # Forward pass; passing labels makes the model return the loss.
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        total_loss += loss.item()

    # Evaluate on the validation set after each epoch and store the outcome.
    epoch_predictions, epoch_labels, epoch_predictions_desnormalized = evaluate_model_and_store(model, val_loader)
    results.append({'Epoch': epoch + 1, 'Loss': total_loss})
    predictions_list.extend(zip(epoch_labels, epoch_predictions, epoch_predictions_desnormalized))

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")


Epoch 1/10, Loss: 2.5376
Epoch 2/10, Loss: 6.2035
Epoch 3/10, Loss: 2.4982
Epoch 4/10, Loss: 1.0203
Epoch 5/10, Loss: 0.4518
Epoch 6/10, Loss: 0.1713
Epoch 7/10, Loss: 0.0671
Epoch 8/10, Loss: 0.0249
Epoch 9/10, Loss: 0.0081
Epoch 10/10, Loss: 0.0042


In [None]:
# Save the per-epoch training metrics to Excel.
results_df = pd.DataFrame(results)
results_df.to_excel('/workspace/training_metrics.xlsx', index=False)

# Collect the stored validation predictions (all epochs, in order) in a dataframe.
predictions_df = pd.DataFrame(predictions_list, columns=['True Label Normalized', 'Predicted Normalized', 'Predicted Desnormalized'])
print(predictions_df)

# Recover the true labels on their original scale by inverting the min-max
# normalization with the same label_min / label_max that were computed from
# the training labels. They are deliberately not overwritten with hard-coded
# placeholder values: that would give wrong results whenever the real range
# differs, and would also change the globals evaluate_model_and_store() reads.
predictions_df['True Label Original'] = predictions_df['True Label Normalized'] * (label_max - label_min) + label_min

# Save the predictions, including the original-scale labels, to Excel.
predictions_df.to_excel('/workspace/predictions_with_true_labels.xlsx', index=False)

# Report the files that were actually written.
print("Resultados guardados en 'training_metrics.xlsx' y 'predictions_with_true_labels.xlsx'")

      True Label Normalized  Predicted Normalized  Predicted Desnormalized
0                  0.428571              0.858262                 6.007833
1                  0.714286              0.644851                 4.513955
2                  0.571429              0.500728                 3.505096
3                  0.000000              0.556332                 3.894327
4                  1.000000              0.650930                 4.556510
...                     ...                   ...                      ...
5235               1.000000              0.558807                 3.911651
5236               0.000000              0.425803                 2.980623
5237               0.000000              0.719115                 5.033805
5238               0.285714              0.430026                 3.010182
5239               1.000000              0.880983                 6.166881

[5240 rows x 3 columns]
Resultados guardados en 'training_metrics.xlsx' y 'predictions.xlsx'


In [None]:
# Persist the trained model and its tokenizer into the same directory.
from transformers import BertForSequenceClassification

save_dir = "bert_trained_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
