In [1]:
'''
Huggingface BERT Model: dccuchile/bert-base-spanish-wwm-uncased
https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased

Necessary installations: Python, Pandas, Numpy, Sklearn, PyTorch, transformers, TrainingArguments, tensorflow, spacy, 
python -m spacy download es_core_news_sm
'''

'\nHuggingface BERT Model: dccuchile/bert-base-spanish-wwm-uncased\nhttps://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased\n\nNecessary installations: Python, Pandas, Numpy, Sklearn, PyTorch, transformers, TrainingArguments, tensorflow, spacy, \npython -m spacy download es_core_news_sm\n'

In [2]:
import pandas as pd
import numpy as np
import sklearn 
from transformers import TrainingArguments

In [3]:
df = pd.read_csv('data/icm_dataset.csv')
print(df.head(25))

                                                input  intent
0                      Quisiera comprar una camiseta.       1
1                     Me gustaría pedir una camiseta.       1
2                        Necesito una camiseta nueva.       1
3                     Quiero una camiseta de algodón.       1
4        Estoy buscando una camiseta en talla grande.       1
5           Deseo comprar una camiseta con estampado.       1
6            ¿Puedo pedir una camiseta en color azul?       1
7            Necesitaría una camiseta de manga corta.       1
8         Estoy interesado en una camiseta deportiva.       1
9            Quiero comprar una camiseta para correr.       1
10           Me gustaría adquirir una camiseta negra.       1
11                Quisiera una camiseta sin etiqueta.       1
12              Necesito una camiseta para un regalo.       1
13         Deseo pedir una camiseta en talla mediana.       1
14              ¿Podría tener una camiseta en oferta?       1
15      

In [4]:
# Parts of Speech (POS)
import spacy

# Carga modelo de Spanish NLP 
nlp = spacy.load("es_core_news_sm")

def get_pos_tags(text):
    # Process the text with spaCy
    doc = nlp(text)
    pos_tags = [(token.text, token.pos_) for token in doc]  # Extract each word and its POS tag 
    return pos_tags

# ejemplo
sentence = "¿Cómo puedo cambiar mi contraseña?"
pos_tags = get_pos_tags(sentence)
print(pos_tags)


[('¿', 'PUNCT'), ('Cómo', 'PRON'), ('puedo', 'AUX'), ('cambiar', 'VERB'), ('mi', 'DET'), ('contraseña', 'NOUN'), ('?', 'PUNCT')]


In [5]:
# Apply POS tagging to a column of df (no sé si mejora el rendimiento del modelo todavia, podemos studiarlo...)
df['pos_tags'] = df['input'].apply(get_pos_tags)

In [6]:
df.head()

Unnamed: 0,input,intent,pos_tags
0,Quisiera comprar una camiseta.,1,"[(Quisiera, VERB), (comprar, VERB), (una, DET)..."
1,Me gustaría pedir una camiseta.,1,"[(Me, PRON), (gustaría, VERB), (pedir, VERB), ..."
2,Necesito una camiseta nueva.,1,"[(Necesito, VERB), (una, DET), (camiseta, NOUN..."
3,Quiero una camiseta de algodón.,1,"[(Quiero, VERB), (una, DET), (camiseta, NOUN),..."
4,Estoy buscando una camiseta en talla grande.,1,"[(Estoy, AUX), (buscando, VERB), (una, DET), (..."


In [7]:
df['intent'].dtype

dtype('int64')

In [8]:
from sklearn.model_selection import train_test_split

train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(df['input'], df['intent'], test_size=0.1)  # Reserving 10% for testing
train_texts, val_texts, train_labels, val_labels = train_test_split(train_val_texts, train_val_labels, test_size=0.2)  # Split remaining 90% into training and validation

Probamos el **tokenizador de BERT** junto con "dccuchile/bert-base-spanish-wwm-cased"

In [38]:
from sklearn.preprocessing import LabelEncoder

# Encodes labels: train, valuation and test
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

In [39]:
from transformers import BertTokenizer

# PRUEBO NUEVO TOKENIZADOR BERT
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", do_lower_case=False)
#tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

'''train_encodings = tokenizer.encode_plus(
    list(train_texts),
    add_special_tokens=True,
    truncation=True,
    padding=True,
    max_length=128,
    return_attention_mask=True,
    return_tensors='pt'  # 'pt' indica que queremos tensores de PyTorch como salida
)'''

'''# Extraer los tensores de entrada y máscaras de atención del resultado
input_ids = train_encodings['input_ids']
attention_mask = train_encodings['attention_mask']'''

'''val_encodings = tokenizer.encode_plus(
    list(val_texts),
    add_special_tokens=True,
    truncation=True,
    padding=True,
    max_length=128,
    return_attention_mask=True,
    return_tensors='pt'  # 'pt' indica que queremos tensores de PyTorch como salida
)

test_encodings = tokenizer.encode_plus(
    list(test_texts),
    add_special_tokens=True,
    truncation=True,
    padding=True,
    max_length=128,
    return_attention_mask=True,
    return_tensors='pt'  # 'pt' indica que queremos tensores de PyTorch como salida
)'''

"val_encodings = tokenizer.encode_plus(\n    list(val_texts),\n    add_special_tokens=True,\n    truncation=True,\n    padding=True,\n    max_length=128,\n    return_attention_mask=True,\n    return_tensors='pt'  # 'pt' indica que queremos tensores de PyTorch como salida\n)\n\ntest_encodings = tokenizer.encode_plus(\n    list(test_texts),\n    add_special_tokens=True,\n    truncation=True,\n    padding=True,\n    max_length=128,\n    return_attention_mask=True,\n    return_tensors='pt'  # 'pt' indica que queremos tensores de PyTorch como salida\n)"

In [40]:
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") OTRO TOKENIZADOR que probé

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')


In [41]:
import torch
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels_encoded))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels_encoded))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_labels_encoded))

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [42]:
from transformers import BertForSequenceClassification

num_labels = len(set(train_labels_encoded))

# Cargar el modelo BERT preentrenado de hugging
model = BertForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", num_labels=num_labels)
#model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_)) OTRO MODELO que probé

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

model.train()
num_epochs = 3
for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


In [47]:
def evaluate_model(model, loader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.numpy())
            true_labels.extend(labels.numpy())

    predictions = np.array(predictions)  # Convertir a array de NumPy
    true_labels = np.array(true_labels)  # Convertir a array de NumPy

    accuracy = (predictions == true_labels).mean()
    return accuracy

In [48]:
def predict_single_text(text):
    inputs = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt')
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_label_idx = torch.argmax(logits, dim=1).item()
    predicted_label = label_encoder.classes_[predicted_label_idx]
    return predicted_label


In [49]:
# Ejemplo de evaluación sobre el conjunto de validación
val_accuracy = evaluate_model(model, val_loader)
print(f'Validation Accuracy: {val_accuracy}')

# Ejemplo de evaluación sobre el conjunto de prueba
test_accuracy = evaluate_model(model, test_loader)
print(f'Test Accuracy: {test_accuracy}')

Validation Accuracy: 0.8873239436619719
Test Accuracy: 0.95


In [53]:
# Ejemplo de predicción sobre un texto individual
new_text = "Quiero información de este pantalon por favor"
predicted_label = predict_single_text(new_text)
print(f'Predicted Label: {predicted_label}')

Predicted Label: 2


**Matriz de confusión**

In [56]:
from sklearn.metrics import confusion_matrix, classification_report


In [67]:
def evaluate_model(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return true_labels, predictions

def print_cm(true_labels, predictions, label_names):
    cm = confusion_matrix(true_labels, predictions)
    print("Matriz de confusión:")
    print(cm)


In [68]:
# Matriz de confusión de TRAIN
true_labels, predictions = evaluate_model(model, train_loader)

label_names = label_encoder.classes_

print_cm(true_labels, predictions, label_names)

Matriz de confusión:
[[91  2  0]
 [ 4 89  5]
 [ 0  3 88]]
