Extraction

In [4]:
import os
import pandas as pd
csv_path = "../../data/spanish/spanish_tweets.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,user,text,date,emotion,sentiment
0,@erreborda,termine bien abrumado despu√©s de hoy,"Jan 6, 2024 ¬∑ 2:53 AM UTC",overwhelmed,scared
1,@shpiderduck,me siento abrumado,"Jan 6, 2024 ¬∑ 2:35 AM UTC",overwhelmed,scared
2,@Alex_R_art,Me siento un poco abrumado por la cantidad de ...,"Jan 6, 2024 ¬∑ 12:20 AM UTC",overwhelmed,scared
3,@anggelinaa97,Salvador la √∫nica persona que no la ha abrumad...,"Jan 5, 2024 ¬∑ 10:38 PM UTC",overwhelmed,scared
4,@diegoreyesvqz,Denme un helado o algo que ando full abrumado.,"Jan 5, 2024 ¬∑ 8:38 PM UTC",overwhelmed,scared


Cleaning data

In [5]:
import pandas as pd
import numpy as np
# removing usernames
df["clean_text"] = df["text"].str.replace(r"@\w+", "", regex=True).str.strip()
# removing urls
df["clean_text"] = df["clean_text"].str.replace(r"http\S+", "", regex=True)
# removing empty texts
df = df[df["clean_text"].str.len() > 0]
df["clean_text"].head()

df["sentiment"].unique()
#mapping sentiment labels to numbers
mapping = {'sad': 0, 'mad': 1, 'scared': 2, 'joyful': 3, 'peaceful': 4, 'powerful': 5}
df["sentiment_labels"] = df['sentiment'].map(mapping)
df.tail()


Unnamed: 0,user,text,date,emotion,sentiment,clean_text,sentiment_labels
2585,@lavivianaleyva,No podemos vivir con miedo: ¬°Manejen borrach...,"Jan 6, 2024 ¬∑ 3:08 AM UTC",daring,joyful,No podemos vivir con miedo: ¬°Manejen borrach...,3
2586,@Carmeen_Alicia,"La vida es un constante, SIN MIEDO AL √âXITO üíÖüèº","Jan 6, 2024 ¬∑ 3:07 AM UTC",daring,joyful,"La vida es un constante, SIN MIEDO AL √âXITO üíÖüèº",3
2587,@homicidios_,Esquizofrenia = mente dividida: Miedo a las re...,"Jan 6, 2024 ¬∑ 2:59 AM UTC",daring,joyful,Esquizofrenia = mente dividida: Miedo a las re...,3
2588,@brigethcoba,"""Lo que m√°s miedo me da, es ver c√≥mo desaparec...","Jan 6, 2024 ¬∑ 2:55 AM UTC",daring,joyful,"""Lo que m√°s miedo me da, es ver c√≥mo desaparec...",3
2589,@nanyfrias11,Saltando de apoco ala pile sin agua xd sin mie...,"Jan 6, 2024 ¬∑ 2:54 AM UTC",daring,joyful,Saltando de apoco ala pile sin agua xd sin mie...,3


In [6]:
from transformers import pipeline, AutoTokenizer, BertTokenizer
import torch
from sklearn.model_selection import train_test_split
#Splitting data into train and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["sentiment_labels"],
    random_state=42
)
#Model setup
model_name = "bert-base-multilingual-uncased"
tok = BertTokenizer.from_pretrained(model_name)

MAX_LEN = 100  # tweets are short
#Converting to tensors
def tokenize(batch):
    return tok(
        batch["clean_text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )

train_encodings = tokenize(train_df)
val_encodings = tokenize(val_df)
# unmasker = pipeline('fill-mask', model='bert-base-multilingual-cased')
# unmasker("Hello I'm a [MASK] model.") 






Conversion to Dataset

In [7]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = SentimentDataset(train_encodings, train_df["sentiment_labels"].values)
val_dataset = SentimentDataset(val_encodings, val_df["sentiment_labels"].values)

Training

In [8]:
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=6
)   
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|‚ñà‚ñà‚ñà‚ñé      | 146/438 [01:40<04:12,  1.16it/s]

{'loss': 1.7523, 'grad_norm': 4.667542457580566, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


                                                 
 33%|‚ñà‚ñà‚ñà‚ñé      | 146/438 [01:43<04:12,  1.16it/s]

{'eval_loss': 1.7419929504394531, 'eval_runtime': 2.9889, 'eval_samples_per_second': 86.655, 'eval_steps_per_second': 5.688, 'epoch': 1.0}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 292/438 [03:23<01:35,  1.52it/s]

{'loss': 1.7017, 'grad_norm': 14.654332160949707, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


                                                 
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 292/438 [03:26<01:35,  1.52it/s]

{'eval_loss': 1.7124170064926147, 'eval_runtime': 2.8693, 'eval_samples_per_second': 90.267, 'eval_steps_per_second': 5.925, 'epoch': 2.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 438/438 [05:16<00:00,  1.32it/s]

{'loss': 1.5553, 'grad_norm': 10.495829582214355, 'learning_rate': 0.0, 'epoch': 3.0}


                                                 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 438/438 [05:19<00:00,  1.32it/s]

{'eval_loss': 1.6941702365875244, 'eval_runtime': 2.6445, 'eval_samples_per_second': 97.939, 'eval_steps_per_second': 6.428, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 438/438 [05:21<00:00,  1.36it/s]

{'train_runtime': 321.4916, 'train_samples_per_second': 21.752, 'train_steps_per_second': 1.362, 'train_loss': 1.6697709244680188, 'epoch': 3.0}





TrainOutput(global_step=438, training_loss=1.6697709244680188, metrics={'train_runtime': 321.4916, 'train_samples_per_second': 21.752, 'train_steps_per_second': 1.362, 'total_flos': 359375330134800.0, 'train_loss': 1.6697709244680188, 'epoch': 3.0})

Evaluation

In [10]:
trainer.evaluate()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17/17 [00:02<00:00,  6.77it/s]


{'eval_loss': 1.6941702365875244,
 'eval_runtime': 2.5869,
 'eval_samples_per_second': 100.12,
 'eval_steps_per_second': 6.572,
 'epoch': 3.0}