In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Datos de ejemplo (reemplaza esto con tus propios datos)
texts = ["Este es un texto de ejemplo.", "Otro texto para probar.", "Un tercer texto para la muestra."]
labels = [1, 0, 1]  # Etiquetas correspondientes a los textos (0 o 1)

# Dividir datos en conjuntos de entrenamiento, validación y prueba
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Cargar el modelo BERT preentrenado y el tokenizador
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Num_labels es el número de clases

# Función para tokenizar los textos y convertirlos en tensores
def tokenize_text(text):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Tokenizar los textos de entrenamiento, validación y prueba
train_encodings = [tokenize_text(text) for text in train_texts]
val_encodings = [tokenize_text(text) for text in val_texts]
test_encodings = [tokenize_text(text) for text in test_texts]

# Crear conjuntos de datos PyTorch
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for encoding in self.encodings for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)
test_dataset = TextDataset(test_encodings, test_labels)

# Entrenamiento del modelo
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validación del modelo
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_predictions.extend(preds.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f'Epoch {epoch + 1}: Validation Accuracy: {val_accuracy:.4f}')

# Evaluación del modelo en el conjunto de prueba
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

model.eval()
test_predictions = []
test_true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        test_predictions.extend(preds.cpu().numpy())
        test_true_labels.extend(labels.cpu().numpy())

# Calcular métricas de evaluación en el conjunto de prueba
test_accuracy = accuracy_score(test_true_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for encoding in self.encodings for key, val in encoding.items()}


Epoch 1: Validation Accuracy: 0.0000


  item = {key: torch.tensor(val[idx]) for encoding in self.encodings for key, val in encoding.items()}


Epoch 2: Validation Accuracy: 0.0000
Epoch 3: Validation Accuracy: 0.0000
Test Accuracy: 0.0


  item = {key: torch.tensor(val[idx]) for encoding in self.encodings for key, val in encoding.items()}
  item = {key: torch.tensor(val[idx]) for encoding in self.encodings for key, val in encoding.items()}
