In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

In [2]:
# Load the CSV file into a pandas DataFrame

file = '..\\data\\dataSocialMediaClean.csv'
df = pd.read_csv(file, delimiter='\t', engine='python', header=0, index_col=0)
df.head(2) # Display the first 2 rows of the DataFrame to verify the correct load

Unnamed: 0,id,message,name,id_user,username,id_post,link,date,user_link,weekday,...,sentiment,reply_screen_name,created_at,owner,shortcode,hour,clean_message,clean_message_noemo,emojis_in_message,sentiment_code
0,63d415fe24f17c7a0a38762a,Lo que tiene que hacer el Miss Venezuela es r...,Lisette Diaz,marazul41,marazul41,718139826350485,http://www.facebook.com/MissVenezuelaOficial/p...,2023-01-14,http://www.facebook.com/marazul41,Sábado,...,negativo,MissVenezuelaOficial,0.0,0,0,00:00:00,hacer miss venezuela retirarse miss uni señal ...,hacer miss venezuela retirarse miss uni señal ...,0,-1
1,63d415fe24f17c7a0a38762b,Me imagino que dentro del plan de acción está ...,Kendra Pérez,kendra.pereztabares,kendra.pereztabares,718139826350485,http://www.facebook.com/MissVenezuelaOficial/p...,2023-01-14,http://www.facebook.com/kendra.pereztabares,Sábado,...,negativo,MissVenezuelaOficial,0.0,0,0,00:00:00,imagino dentro plan acción tener sistema votac...,imagino dentro plan acción tener sistema votac...,0,-1


In [3]:
# Create a copy of the DataFrame to manipulate the data
df2 = df.copy()

# Select only the necessary columns
cols = ['id', 'clean_message','sentiment','sentiment_code']
df2 = df2[cols]

# Remap sentiment codes to new values: -1 -> 0, 0 -> 1, 1 -> 2
df2['sentiment_code_new'] = df2['sentiment_code'].map({
    -1: 0,
    0:  1,
    1:  2
})

# Select a random subset of 1000 rows for training and testing the model
df_subset = df2.sample(n=1000, random_state=42)
df_subset


Unnamed: 0,id,clean_message,sentiment,sentiment_code,sentiment_code_new
15316,63d415fe24f17c7a0a38b1fe,😍😍😍😍😍😍😍,positivo,1,2
6391,63d415fe24f17c7a0a388f21,siempre miss universo ❤❤,positivo,1,2
5946,63d415fe24f17c7a0a388d64,ganadora,positivo,1,2
14128,63d415fe24f17c7a0a38ad5a,amada amamos reina excelente trabajo 😍,positivo,1,2
17218,63d415fe24f17c7a0a38b96c,robaron corona reina 😢,negativo,-1,0
...,...,...,...,...,...
11039,63d415fe24f17c7a0a38a149,mano feeeeee ❤,neutro,0,1
1910,63d415fe24f17c7a0a387da0,reinaaa ❤❤,neutro,0,1
7646,63d415fe24f17c7a0a389408,amanda ❤,neutro,0,1
15119,63d415fe24f17c7a0a38b139,nervios punta dios ❤❤❤🔥🔥🔥🔥🔥,negativo,-1,0


In [4]:
# Split data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df_subset['clean_message'].values, # Training and test texts
                                                                    df_subset['sentiment_code_new'].values, # Training and test label 
                                                                    test_size=0.2, # 20% of the data for testing
                                                                    random_state=42)

In [5]:
# Initialize the BETO (BERT for Spanish) tokenizer
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

In [6]:
# Tokenize the training and test texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

In [7]:
# Create training and test datasets
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), # Input IDs of the tokens
                              torch.tensor(train_encodings['attention_mask']), # Attention masks
                              torch.tensor(train_labels)) # Training labels

test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']), # Input IDs of the tokens
                             torch.tensor(test_encodings['attention_mask']), # Attention masks
                             torch.tensor(test_labels)) # Test labels

In [8]:
# Create DataLoaders for training and testing
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [9]:
# Initialize the pre-trained BERT model for sequence classification in Spanish
model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased',
                                                      num_labels=3)  #3 classes: positive|2, negative|0, neutral|1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Check if a GPU is available and move the model to the GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
# Initialize the AdamW optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Función de pérdida
loss_function = torch.nn.CrossEntropyLoss()

# Define the evaluation function
def evaluate(model, dataloader):
    """
    Evaluate the model on the given dataloader.

    Args:
        model: The BERT model for sequence classification.
        dataloader: DataLoader containing the data to evaluate.

    Returns:
        accuracy: Accuracy of the model on the dataloader.
        precision: Precision of the model on the dataloader.
        recall: Recall of the model on the dataloader.
        f1: F1-score of the model on the dataloader.
        cm: Confusion matrix of the model on the dataloader.
    """
    model.eval()        # Set the model to evaluation mode
    all_labels = []     # List to store true labels
    all_preds = []      # List to store model predictions
    with torch.no_grad(): # Disable gradient calculation
        for batch in dataloader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch] # Move data to GPU
            outputs = model(input_ids, attention_mask=attention_mask) # Make predictions
            _, preds = torch.max(outputs.logits, dim=1)  # Get the predictions with the highest probability
            all_labels.extend(labels.cpu().numpy()) # Store true labels
            all_preds.extend(preds.cpu().numpy()) # Store predictions
    
    # Calculate performance metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    cm = confusion_matrix(all_labels, all_preds)
    
    return accuracy, precision, recall, f1, cm

# Training the model
num_epochs = 3

for epoch in range(num_epochs):
    model.train() # Set the model to training mode
    for batch in train_loader:
        input_ids, attention_mask, labels = [t.to(device) for t in batch] # Move data to GPU
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels) # Make predictions
        loss = outputs.loss # Calculate the loss
        loss.backward() # Perform backpropagation
        optimizer.step() # Update model parameters
        optimizer.zero_grad() # Reset gradients
    
    # Evaluate on the test set after each epoch
    accuracy, precision, recall, f1, cm = evaluate(model, test_loader)
    print(f'Epoch {epoch + 1}:')
    print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}')
    print(f'Confusion Matrix:\n{cm}')

Epoch 1:
Accuracy: 0.7600, Precision: 0.6609, Recall: 0.7600, F1-score: 0.7065
Confusion Matrix:
[[ 18   0  12]
 [  3   0  23]
 [  9   1 134]]
Epoch 2:
Accuracy: 0.8150, Precision: 0.8243, Recall: 0.8150, F1-score: 0.8124
Confusion Matrix:
[[ 15   8   7]
 [  1  15  10]
 [  2   9 133]]
Epoch 3:
Accuracy: 0.7750, Precision: 0.7605, Recall: 0.7750, F1-score: 0.7599
Confusion Matrix:
[[ 21   3   6]
 [  3   6  17]
 [ 13   3 128]]


In [12]:
# Final evaluation of the model on the test set
accuracy, precision, recall, f1, cm = evaluate(model, test_loader)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'Confusion Matrix:\n{cm}')

Accuracy: 0.7750
Precision: 0.7605
Recall: 0.7750
F1-score: 0.7599
Confusion Matrix:
[[ 21   3   6]
 [  3   6  17]
 [ 13   3 128]]


---

**Summary and Recommendations**  
- Model Performance: The model shows good accuracy, but there is room for improvement in precision and recall, especially for the neutral class.

- Class Imbalance: The confusion matrices suggest possible class imbalance, with the model performing significantly better on the positive class.

- Overfitting: The decrease in performance from epoch 2 to epoch 3 could indicate overfitting. Monitoring validation loss and implementing early stopping could help mitigate this.

- Hyperparameter Tuning: We should experiment with different learning rates, batch sizes, and epochs to find the optimal settings for the dataset.

- Data Augmentation: Increase the dataset size with data augmentation techniques or by collecting more labeled data to improve model robustness.