In [1]:
'''
Huggingface BERT Model: dccuchile/bert-base-spanish-wwm-cased
https://huggingface.co/dccuchile/bert-base-spanish-wwm-cased

Necessary installations: Python, Pandas, Numpy, Sklearn, PyTorch, transformers, TrainingArguments, tensorflow, spacy, 
python -m spacy download es_core_news_sm
'''

'\nHuggingface BERT Model: dccuchile/bert-base-spanish-wwm-uncased\nhttps://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased\n\nNecessary installations: Python, Pandas, Numpy, Sklearn, PyTorch, transformers, TrainingArguments, tensorflow, spacy, \npython -m spacy download es_core_news_sm\n'

In [13]:
import pandas as pd
import numpy as np
import sklearn 
from transformers import TrainingArguments

In [14]:
# Load the labelled utterances; the 'input' (text) and 'intent' (label) columns
# are the ones consumed by the train/validation/test split later on.
# The first 25 rows are printed as a quick sanity check of the file contents.
df = pd.read_csv('data/icm_dataset.csv')
print(df.head(n=25))

                                                input  intent
0                      Quisiera comprar una camiseta.       1
1                     Me gustaría pedir una camiseta.       1
2                        Necesito una camiseta nueva.       1
3                     Quiero una camiseta de algodón.       1
4        Estoy buscando una camiseta en talla grande.       1
5           Deseo comprar una camiseta con estampado.       1
6            ¿Puedo pedir una camiseta en color azul?       1
7            Necesitaría una camiseta de manga corta.       1
8         Estoy interesado en una camiseta deportiva.       1
9            Quiero comprar una camiseta para correr.       1
10           Me gustaría adquirir una camiseta negra.       1
11                Quisiera una camiseta sin etiqueta.       1
12              Necesito una camiseta para un regalo.       1
13         Deseo pedir una camiseta en talla mediana.       1
14              ¿Podría tener una camiseta en oferta?       1
15      

In [15]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition -- and therefore every
# metric computed from it -- is identical on each Restart & Run All.
SPLIT_SEED = 42

# Hold out 10% of the rows as the final test set.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df['input'], df['intent'], test_size=0.1, random_state=SPLIT_SEED
)

# Split the remaining 90% again: 20% of it for validation, the rest for training
# (roughly 72% / 18% / 10% of the full dataset).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.2, random_state=SPLIT_SEED
)

# NOTE(review): the splits are not stratified, so a rare intent could end up
# missing from the training split. Consider passing stratify=<labels> to both
# calls once it is confirmed that every intent has enough rows for it.

In [16]:
from sklearn.preprocessing import LabelEncoder

# Learn the intent -> integer-id mapping from the training labels only, then
# apply that same mapping to the validation and test labels.
# NOTE(review): the encoder never sees the validation/test labels while fitting,
# so this presumably fails if an intent occurs only in those splits -- confirm
# that every intent is present in the training split.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded, test_labels_encoded = (
    label_encoder.transform(split_labels)
    for split_labels in (val_labels, test_labels)
)

In [17]:
from transformers import BertTokenizer

# Tokenizer of the cased Spanish BERT checkpoint (the same checkpoint name the
# classification model is loaded from); lowercasing is explicitly disabled.
tokenizer = BertTokenizer.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
    do_lower_case=False,
)

In [18]:
# Tokenize the three splits with identical settings: truncate to at most 128
# tokens, pad within each split, and return PyTorch tensors.
train_encodings, val_encodings, test_encodings = (
    tokenizer(
        split_texts.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt',
    )
    for split_texts in (train_texts, val_texts, test_texts)
)

In [19]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Bundle every split as (input_ids, attention_mask, label) so that a batch drawn
# from its loader unpacks into exactly those three tensors.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(encoded_labels),
    )
    for encodings, encoded_labels in (
        (train_encodings, train_labels_encoded),
        (val_encodings, val_labels_encoded),
        (test_encodings, test_labels_encoded),
    )
)

batch_size = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (val_dataset, test_dataset)
)

In [20]:
from transformers import BertForSequenceClassification

# Size of the classification head: the number of distinct encoded labels seen
# in the training split.
num_labels = len(np.unique(train_labels_encoded))

# Load the pretrained Spanish BERT checkpoint from the Hugging Face hub with a
# sequence-classification head of `num_labels` outputs.
model = BertForSequenceClassification.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
import numpy as np

# Fine-tune every model parameter with AdamW for a fixed number of epochs.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

model.train()  # switch the model to training mode before the loop
for epoch in range(num_epochs):
    # Each batch is the (input_ids, attention_mask, labels) triple stored in
    # the training TensorDataset.
    for input_ids, attention_mask, labels in train_loader:
        optimizer.zero_grad()
        # The labels are passed in so the forward pass also yields the loss.
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()


# Save the label-encoder classes once training has finished, so that predicted
# indices can be mapped back to the original intent labels later.
np.save('label_encoder_classes.npy', label_encoder.classes_)

In [22]:
def evaluate_model(model, loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    Args:
        model: Object with an ``eval()`` method that is callable as
            ``model(input_ids, attention_mask=...)`` and whose result exposes a
            ``logits`` attribute. ``model.eval()`` is called before iterating.
        loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        The fraction of examples whose arg-max logit index (along dim 1) equals
        the corresponding label.
    """
    model.eval()
    predictions, true_labels = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=1).numpy())
            true_labels.extend(labels.numpy())

    # Element-wise comparison as NumPy arrays; the mean of the boolean matches
    # is the accuracy.
    matches = np.array(predictions) == np.array(true_labels)
    return matches.mean()

def predict_single_text(text):
    """Predict the intent label for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``label_encoder``.

    Args:
        text: The text to classify; it is tokenized with truncation to at most
            128 tokens.

    Returns:
        The entry of ``label_encoder.classes_`` at the arg-max logit index,
        i.e. the original (un-encoded) intent label.
    """
    inputs = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt')

    # Inference must not depend on whichever mode an earlier cell left the model
    # in: force eval mode for the forward pass and restore the previous mode
    # afterwards so an in-progress training session is not disturbed.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for a prediction.
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    predicted_label_idx = torch.argmax(logits, dim=1).item()
    # Map the class index back to the original label value.
    return label_encoder.classes_[predicted_label_idx]

In [23]:
# Accuracy on the validation split.
val_accuracy = evaluate_model(model=model, loader=val_loader)
print(f'Validation Accuracy: {val_accuracy}')

# Accuracy on the held-out test split.
test_accuracy = evaluate_model(model=model, loader=test_loader)
print(f'Test Accuracy: {test_accuracy}')

# Example prediction for a single, previously unseen sentence.
new_text = "Quiero información de este pantalon"
predicted_label = predict_single_text(text=new_text)
print(f'Predicted Label: {predicted_label}')

Validation Accuracy: 0.9027777777777778
Test Accuracy: 0.925
Predicted Label: 2
