In [None]:
!pip install datasets transformers
!pip install pytorch-lightning
!pip install spacy
!python -m spacy download es_core_news_md

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64

# Librerías

In [76]:
# Librerías estándar de Python
import random

# Librerías para manejo de datos
import pandas as pd

# Librerías de PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

# Librerías de PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# Librerías para métricas
from torchmetrics import Accuracy, Precision, Recall, F1Score

# Librerías de datasets
from datasets import load_dataset

#SpaCy
import spacy

# Set de datos

## Importación

In [78]:
# Cargar el conjunto de datos CoNLL-2002 para español
dataset = load_dataset("conll2002", "es")
#print(type(dataset))

# Divisiones disponibles en el dataset
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

# Mostrar algunos ejemplos
print("Ejemplo del conjunto de datos de entrenamiento:")

print(train_dataset[2])

# Desplegar todos los atributos y métodos disponibles
#display(dir(train_dataset))

# Mostrar las columnas
if train_dataset.column_names == validation_dataset.column_names == test_dataset.column_names:
    print("\nColumnas disponibles en todos los conjuntos de datos:", train_dataset.column_names)
else:
    print("\nError: Columnas disponibles en los conjuntos de datos no coinciden.")

# Mostrar las etiquetas
if train_dataset.features['ner_tags'].feature.names == validation_dataset.features['ner_tags'].feature.names == test_dataset.features['ner_tags'].feature.names:
    print("\nEtiquetas disponibles en todos los conjuntos de datos:")
    label_mapping = pd.DataFrame({
        "label": train_dataset.features['ner_tags'].feature.names})
    print(label_mapping)

else:
    print("\nError: Etiquetas disponibles en los conjuntos de datos no coinciden.")

# Mostrar el valor de shape
print("\nTamaño del conjunto de datos de entrenamiento:", train_dataset.shape)
print("Tamaño del conjunto de datos de validación:", validation_dataset.shape)
print("Tamaño del conjunto de datos de prueba:", test_dataset.shape)


Ejemplo del conjunto de datos de entrenamiento:
{'id': '2', 'tokens': ['El', 'Abogado', 'General', 'del', 'Estado', ',', 'Daryl', 'Williams', ',', 'subrayó', 'hoy', 'la', 'necesidad', 'de', 'tomar', 'medidas', 'para', 'proteger', 'al', 'sistema', 'judicial', 'australiano', 'frente', 'a', 'una', 'página', 'de', 'internet', 'que', 'imposibilita', 'el', 'cumplimiento', 'de', 'los', 'principios', 'básicos', 'de', 'la', 'Ley', '.'], 'pos_tags': [4, 28, 1, 40, 28, 13, 47, 28, 13, 47, 38, 4, 28, 40, 49, 28, 40, 49, 40, 28, 1, 1, 38, 40, 7, 28, 40, 28, 35, 47, 4, 28, 40, 4, 28, 1, 40, 4, 28, 20], 'ner_tags': [0, 1, 2, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0]}

Columnas disponibles en todos los conjuntos de datos: ['id', 'tokens', 'pos_tags', 'ner_tags']

Etiquetas disponibles en todos los conjuntos de datos:
    label
0       O
1   B-PER
2   I-PER
3   B-ORG
4   I-ORG
5   B-LOC
6   I-LOC
7  B-MISC
8  I-MISC

Tamaño del conjun

## Exploración

In [79]:
random_id = random.randint(0, len(train_dataset) - 1)
print(f'Sentence id = {random_id}')
random_sentence = pd.DataFrame (
    {
        "tokens": train_dataset[random_id]["tokens"],
        "ner_tags": train_dataset[random_id]["ner_tags"]
    }
)
print(random_sentence)

Sentence id = 347
          tokens  ner_tags
0           Ante         0
1           esta         0
2          nueva         0
3      situación         0
4              ,         0
5             el         0
6   comentarista         0
7        militar         0
8            del         0
9    prestigioso         0
10        diario         0
11       israelí         0
12             "         0
13       Haaretz         3
14             "         0
15             ,         0
16          Zeev         1
17        Schiff         2
18             ,         0
19         llega         0
20             a         0
21            la         0
22    conclusión         0
23            de         0
24           que         0
25             "         0
26            la         0
27      retirada         0
28           del         0
29        Líbano         5
30            no         0
31        supone         0
32            el         0
33           fin         0
34           del         0
35     con

# Modelos ya entrenados para tareas NER

## SpaCy

In [105]:
# Cargar el modelo de español preentrenado
spacy_model = spacy.load("es_core_news_md")



In [107]:
# Texto de entrada
texto = "Rambo nunca había visto un paisaje tan hermoso en las montañas."

# Procesar el texto con el modelo NER
doc = spacy_model(texto)

#display(dir(doc))
print("Entidades detectadas:")
print(doc.ents)

# Mostrar entidades detectadas
print("\nEntidades PER detectadas:")
for ent in doc.ents:
    # Tipo: MISC, esto es utilizado cuando el modelo no logra definir de qué tipo es.
    if ent.label_ == "PER":  # "PER" etiqueta para personas
        print(f"Nombre detectado: {ent.text}")


Entidades detectadas:
(Rambo,)

Entidades PER detectadas:


# BiLSTMModel

Librerías

In [74]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from datasets import load_dataset
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics import Accuracy, Precision, Recall, F1Score


## Opción 1: BiLSTMModel con label por token (Bidirectional Long Short-Term Memory)

In [102]:
def transform_labels(examples):
    """
    Esta función transforma las etiquetas en binario: 1 (nombre de persona) y 0 (otro).
    Utiliza los valores de la columna ner_tags para identificar los nombres de personas.
    """
    examples['labels'] = [1 if tag in [1, 2] else 0 for tag in examples['ner_tags']]
    return examples

# Aplicar la transformación a cada conjunto de datos
train_dataset = dataset['train'].map(transform_labels)
validation_dataset = dataset['validation'].map(transform_labels)
test_dataset = dataset['test'].map(transform_labels)

# Mostrar un ejemplo para comprobar la transformación
print("Ejemplo del conjunto de datos transformado:", train_dataset[2])

class NERDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens = self.dataset[idx]['tokens']
        labels = self.dataset[idx]['labels']
        tokens_ids = [hash(token) % 10000 for token in tokens]
        return torch.tensor(tokens_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

# Función de collate para realizar padding
def collate_fn(batch):
    tokens, labels = zip(*batch)
    tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)
    return tokens_padded, labels_padded

# Crear DataLoaders con la nueva función de collate
train_loader = DataLoader(NERDataset(train_dataset), batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(NERDataset(validation_dataset), batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(NERDataset(test_dataset), batch_size=32, collate_fn=collate_fn)

class BiLSTMModel(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.loss = nn.BCEWithLogitsLoss()

        # Métricas
        self.train_accuracy = Accuracy(task="binary")
        self.val_accuracy = Accuracy(task="binary")
        self.val_precision = Precision(task="binary")
        self.val_recall = Recall(task="binary")
        self.val_f1 = F1Score(task="binary")

        # Para acumular predicciones y etiquetas de validación
        self.val_preds = []
        self.val_labels = []

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        logits = self.fc(lstm_out)
        return logits.squeeze(-1)

    def training_step(self, batch, batch_idx):
        tokens, labels = batch
        outputs = self(tokens)
        loss = self.loss(outputs, labels.float())
        preds = torch.sigmoid(outputs) > 0.5
        accuracy = self.train_accuracy(preds, labels.int())
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_accuracy', accuracy, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        tokens, labels = batch
        outputs = self(tokens)
        loss = self.loss(outputs, labels.float())
        preds = torch.sigmoid(outputs) > 0.5
        self.val_preds.append(preds)
        self.val_labels.append(labels.int())
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        all_preds = torch.cat([torch.flatten(p) for p in self.val_preds])
        all_labels = torch.cat([torch.flatten(l) for l in self.val_labels])

        val_accuracy = self.val_accuracy(all_preds, all_labels)
        val_precision = self.val_precision(all_preds, all_labels)
        val_recall = self.val_recall(all_preds, all_labels)
        val_f1 = self.val_f1(all_preds, all_labels)

        self.log('val_accuracy', val_accuracy, prog_bar=True)
        self.log('val_precision', val_precision, prog_bar=True)
        self.log('val_recall', val_recall, prog_bar=True)
        self.log('val_f1', val_f1, prog_bar=True)

        # Limpiar las listas para la próxima época
        self.val_preds.clear()
        self.val_labels.clear()

    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)

    def on_test_epoch_end(self):
        return self.on_validation_epoch_end()

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-3)

# Definir parámetros del modelo
vocab_size = 10000
embedding_dim = 128
hidden_dim = 256
output_dim = 1  # Salida binaria (0 o 1)

# Crear el modelo
model = BiLSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)

# Configurar callbacks y logger
checkpoint_callback = ModelCheckpoint(
    monitor='val_f1',
    dirpath='checkpoints',
    filename='bilstm-ner-{epoch:02d}-{val_f1:.2f}',
    save_top_k=3,
    mode='max'
)

early_stopping_callback = EarlyStopping(
    monitor='val_f1',
    min_delta=0.01,
    patience=3,
    mode='max',
    verbose=True
)

logger = TensorBoardLogger("logs", name="bilstm_ner")

# Crear el Trainer
trainer = pl.Trainer(
    max_epochs=10,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    callbacks=[checkpoint_callback, early_stopping_callback],
    logger=logger,
    log_every_n_steps=10
)

# Entrenar el modelo
trainer.fit(model, train_loader, val_loader)

# Imprimir evolución del entrenamiento
print("\nEvolución del entrenamiento:")
print("Mejor modelo guardado en:", checkpoint_callback.best_model_path)
print("Mejor valor de F1-score:", checkpoint_callback.best_model_score.item())

# Evaluar el modelo en el conjunto de prueba
test_result = trainer.test(model, test_loader)
print("\nResultados en el conjunto de prueba:")
print(test_result)


Map:   0%|          | 0/8324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1518 [00:00<?, ? examples/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type              | Params | Mode 
-------------------------------------------------------------
0 | embedding      | Embedding         | 1.3 M  | train
1 | bilstm         | LSTM              | 790 K  | train
2 | fc             | Linear            | 513    | train
3 | loss           | BCEWithLogitsLoss | 0      | train
4 | train_accuracy | BinaryAccuracy    | 0      | train
5 | val_accuracy   | BinaryAccuracy    | 0      | train
6 | val_precision  | BinaryPrecision   | 0      | train
7 | val_recall     | BinaryRecall      | 0      | train
8 | val_f1         | BinaryF1Score     | 0      | train


Ejemplo del conjunto de datos transformado: {'id': '2', 'tokens': ['El', 'Abogado', 'General', 'del', 'Estado', ',', 'Daryl', 'Williams', ',', 'subrayó', 'hoy', 'la', 'necesidad', 'de', 'tomar', 'medidas', 'para', 'proteger', 'al', 'sistema', 'judicial', 'australiano', 'frente', 'a', 'una', 'página', 'de', 'internet', 'que', 'imposibilita', 'el', 'cumplimiento', 'de', 'los', 'principios', 'básicos', 'de', 'la', 'Ley', '.'], 'pos_tags': [4, 28, 1, 40, 28, 13, 47, 28, 13, 47, 38, 4, 28, 40, 49, 28, 40, 49, 40, 28, 1, 1, 38, 40, 7, 28, 40, 28, 35, 47, 4, 28, 40, 4, 28, 1, 40, 4, 28, 20], 'ner_tags': [0, 1, 2, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0], 'labels': [0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.462


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.185 >= min_delta = 0.01. New best score: 0.647


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.019 >= min_delta = 0.01. New best score: 0.666


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.011 >= min_delta = 0.01. New best score: 0.677


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.013 >= min_delta = 0.01. New best score: 0.691


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.017 >= min_delta = 0.01. New best score: 0.708


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]



Evolución del entrenamiento:
Mejor modelo guardado en: /content/checkpoints/bilstm-ner-epoch=07-val_f1=0.71.ckpt
Mejor valor de F1-score: 0.7076235413551331


Testing: |          | 0/? [00:00<?, ?it/s]


Resultados en el conjunto de prueba:
[{'val_loss': 0.02958691492676735, 'val_accuracy': 0.994502067565918, 'val_precision': 0.8383392095565796, 'val_recall': 0.6932067275047302, 'val_f1': 0.7588964700698853}]


In [103]:
# Función para predecir si cada token de una frase contiene datos personales
def predict_personal_data_1(model, sentence):
    # Dividir la frase en tokens
    tokens = sentence.split()
    # Convertir los tokens a índices
    tokens_ids = [hash(token) % 10000 for token in tokens]
    # Convertir los índices a un tensor
    tokens_tensor = torch.tensor(tokens_ids, dtype=torch.long).unsqueeze(0)  # Agregar dimensión para batch

    model.eval()
    with torch.no_grad():
        # Pasar los tokens por el modelo
        outputs = model(tokens_tensor)
        # Aplicar la sigmoide para obtener probabilidades por token
        probabilities = torch.sigmoid(outputs).squeeze().tolist()

    # Evaluar cada token y determinar si es dato personal (probabilidad > 0.5)
    token_predictions = [(token, prob > 0.5, prob) for token, prob in zip(tokens, probabilities)]

    return token_predictions


In [104]:
# Espacio para pasar frases al azar
print("\nIngrese frases para evaluar (escriba 'salir' para terminar):")
while True:
    sentence = input("Frase: ")
    if sentence.lower() == 'salir':
        break
    predictions = predict_personal_data_1(model, sentence)
    print("Resultados por token:")
    for token, has_personal_data, probability in predictions:
        print(f"Token: {token} - ¿Contiene datos personales? {'Sí' if has_personal_data else 'No'} - Probabilidad: {probability:.2f}")
    print()



Ingrese frases para evaluar (escriba 'salir' para terminar):
Frase: Mientras Juan Pérez preparaba el desayuno, su teléfono comenzó a sonar.
Resultados por token:
Token: Mientras - ¿Contiene datos personales? No - Probabilidad: 0.29
Token: Juan - ¿Contiene datos personales? Sí - Probabilidad: 1.00
Token: Pérez - ¿Contiene datos personales? Sí - Probabilidad: 0.96
Token: preparaba - ¿Contiene datos personales? No - Probabilidad: 0.01
Token: el - ¿Contiene datos personales? No - Probabilidad: 0.00
Token: desayuno, - ¿Contiene datos personales? No - Probabilidad: 0.00
Token: su - ¿Contiene datos personales? No - Probabilidad: 0.00
Token: teléfono - ¿Contiene datos personales? No - Probabilidad: 0.00
Token: comenzó - ¿Contiene datos personales? No - Probabilidad: 0.00
Token: a - ¿Contiene datos personales? No - Probabilidad: 0.00
Token: sonar. - ¿Contiene datos personales? No - Probabilidad: 0.10

Frase: Olaf Monteverde nunca había visto un paisaje tan hermoso en las montañas.
Resultados p

## Opción 2: BiLSTMModel con label por frase
Aquí no voy a pasar la info de las personas, solo el contexto

In [85]:
def transform_labels(examples):
    examples['has_personal_data'] = 1 if any(tag in [1,2] for tag in examples['ner_tags']) else 0
    return examples

# Aplicar la transformación a cada conjunto de datos
train_dataset = dataset['train'].map(transform_labels)
validation_dataset = dataset['validation'].map(transform_labels)
test_dataset = dataset['test'].map(transform_labels)

class NERDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens = self.dataset[idx]['tokens']
        label = self.dataset[idx]['has_personal_data']
        tokens_ids = [hash(token) % 10000 for token in tokens]
        return torch.tensor(tokens_ids, dtype=torch.long), torch.tensor(label, dtype=torch.float)

def collate_fn(batch):
    tokens, labels = zip(*batch)
    tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)
    return tokens_padded, labels

# Crear DataLoaders
train_loader = DataLoader(NERDataset(train_dataset), batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(NERDataset(validation_dataset), batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(NERDataset(test_dataset), batch_size=32, collate_fn=collate_fn)

class BiLSTMModel(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.loss = nn.BCEWithLogitsLoss()

        # Métricas
        self.train_accuracy = Accuracy(task="binary")
        self.val_accuracy = Accuracy(task="binary")
        self.val_precision = Precision(task="binary")
        self.val_recall = Recall(task="binary")
        self.val_f1 = F1Score(task="binary")

        # Para acumular predicciones y etiquetas de validación
        self.val_preds = []
        self.val_labels = []

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.bilstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden).squeeze(1)

    def training_step(self, batch, batch_idx):
        tokens, labels = batch
        outputs = self(tokens)
        loss = self.loss(outputs, labels)
        preds = torch.sigmoid(outputs) > 0.5
        accuracy = self.train_accuracy(preds, labels.int())
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_accuracy', accuracy, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        tokens, labels = batch
        outputs = self(tokens)
        loss = self.loss(outputs, labels)
        preds = torch.sigmoid(outputs) > 0.5
        self.val_preds.append(preds)
        self.val_labels.append(labels.int())
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        all_preds = torch.cat(self.val_preds)
        all_labels = torch.cat(self.val_labels)

        val_accuracy = self.val_accuracy(all_preds, all_labels)
        val_precision = self.val_precision(all_preds, all_labels)
        val_recall = self.val_recall(all_preds, all_labels)
        val_f1 = self.val_f1(all_preds, all_labels)

        self.log('val_accuracy', val_accuracy, prog_bar=True)
        self.log('val_precision', val_precision, prog_bar=True)
        self.log('val_recall', val_recall, prog_bar=True)
        self.log('val_f1', val_f1, prog_bar=True)

        # Limpiar las listas para la próxima época
        self.val_preds.clear()
        self.val_labels.clear()

    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)

    def on_test_epoch_end(self):
        return self.on_validation_epoch_end()

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-3)

# Definir parámetros del modelo
vocab_size = 10000
embedding_dim = 128
hidden_dim = 256

# Crear el modelo
BiLSTMModel_2 = BiLSTMModel(vocab_size, embedding_dim, hidden_dim)

# Configurar callbacks y logger
checkpoint_callback = ModelCheckpoint(
    monitor='val_f1',
    dirpath='checkpoints',
    filename='bilstm-ner-{epoch:02d}-{val_f1:.2f}',
    save_top_k=3,
    mode='max'
)


# Configuración de EarlyStopping
early_stopping_callback = EarlyStopping(
    monitor='val_f1',
    min_delta=0.01,  # Mínima mejora requerida para considerar una mejora
    patience=3,      # Número de épocas sin mejora para detener
    mode='max',
    verbose=True
)

logger = TensorBoardLogger("logs", name="bilstm_ner")

# Crear el Trainer
trainer = pl.Trainer(
    max_epochs=10,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    callbacks=[checkpoint_callback, early_stopping_callback],
    logger=logger,
    log_every_n_steps=10
)

# Entrenar el modelo
trainer.fit(BiLSTMModel_2, train_loader, val_loader)

# Imprimir evolución del entrenamiento
print("\nEvolución del entrenamiento:")
print("Mejor modelo guardado en:", checkpoint_callback.best_model_path)
print("Mejor valor de F1-score:", checkpoint_callback.best_model_score.item())

# Evaluar el modelo en el conjunto de prueba
test_result = trainer.test(BiLSTMModel_2, test_loader)
print("\nResultados en el conjunto de prueba:")
print(test_result)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:652: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type              | Params | Mode 
-------------------------------------------------------------
0 | embedding      | Embedding         | 1.3 M  | train
1 | bilstm         | LSTM              | 790 K  | train
2 | fc             | Linear            | 513    | train
3 | loss           | BCEWithLogitsLoss | 0      | train
4 | train_accuracy | BinaryAccuracy    | 0      | train
5 | val_accuracy   | BinaryAccuracy    | 0      | train
6 | val_

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.547


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.169 >= min_delta = 0.01. New best score: 0.716


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.022 >= min_delta = 0.01. New best score: 0.737


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.737. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]



Evolución del entrenamiento:
Mejor modelo guardado en: /content/checkpoints/bilstm-ner-epoch=03-val_f1=0.74.ckpt
Mejor valor de F1-score: 0.7372262477874756


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]


Resultados en el conjunto de prueba:
[{'val_loss': 0.5465196967124939, 'val_accuracy': 0.8399209380149841, 'val_precision': 0.7364341020584106, 'val_recall': 0.7802874445915222, 'val_f1': 0.7577268481254578}]


In [86]:
# Función para predecir si una frase contiene datos personales
def predict_personal_data(model, sentence):
    tokens = sentence.split()
    tokens_ids = [hash(token) % 10000 for token in tokens]
    tokens_tensor = torch.tensor(tokens_ids, dtype=torch.long).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(tokens_tensor)
        probability = torch.sigmoid(output).item()

    return probability > 0.5, probability



In [101]:
# Espacio para pasar frases al azar
print("\nIngrese frases para evaluar (escriba 'salir' para terminar):")
while True:
    sentence = input("Frase: ")
    if sentence.lower() == 'salir':
        break
    has_personal_data, probability = predict_personal_data(BiLSTMModel_2, sentence)
    print(f"¿Contiene datos personales? {'Sí' if has_personal_data else 'No'}")
    print(f"Probabilidad: {probability:.2f}")
    print()


Ingrese frases para evaluar (escriba 'salir' para terminar):
Frase: Maristela caminaba por el parque durante la tarde soleada.
¿Contiene datos personales? No
Probabilidad: 0.41

Frase: salir
