<a href="https://colab.research.google.com/github/Frorozcoloa/FinancIA/blob/main/Notebooks/0-2%20Train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers wandb torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4


In [2]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import wandb
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset
from transformers import set_seed

def SEED(seed):
    """Apply the same seed to the PyTorch, NumPy and `transformers` seeding helpers.

    Args:
        seed: Integer seed forwarded unchanged to each helper.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, set_seed):
        seed_fn(seed)


# Seed once, before any model or data object is created.
SEED(42)


# Configuration by model
NUM_VARIABLES = 3  # presumably one per sentiment target column -- TODO confirm
NUM_VARAIBLES = NUM_VARIABLES  # misspelled legacy name, kept because a later cell still reads it
NUM_LABELS = 3  # classes per variable
num_labels = NUM_LABELS * NUM_VARIABLES  # total width of the classification head
model_name = "pysentimiento/roberta-es-sentiment"

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
divice = device  # misspelled legacy name, kept for backward compatibility

In [4]:
from transformers import ( 
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_constant_schedule_with_warmup,
)

# Model setup: the classification head gets NUM_VARAIBLES * NUM_LABELS outputs.
num_labels = NUM_VARAIBLES * NUM_LABELS
# Alternative checkpoint tried earlier: "pysentimiento/roberta-es-sentiment"
model_name = "pysentimiento/robertuito-sentiment-analysis"
auto_tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint's head has 3 outputs (see the warning printed below), so the
# size mismatch is tolerated and the head is freshly initialised with `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/925 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/435M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class FinanciaSentimental(Dataset):
    """Dataset that tokenizes the ``text`` column and multi-hot encodes the sentiment targets.

    Each item is ``(input_ids, attention_mask, label)``. ``label`` is a float32
    vector made of the one-hot encodings of the target columns, concatenated in
    the order given by ``columns``.
    """

    def __init__(self, tokenizer, data, type, label_values=None, max_length=None):
        """Store the inputs and pre-compute the label matrix.

        Args:
            tokenizer: Tokenizer exposing ``encode_plus`` (Hugging Face style).
            data: DataFrame holding a ``text`` column and the three target columns.
            type: Unused; kept so existing call sites remain valid.
            label_values: Optional ordered list of the values every target column
                may take. When given, each column is encoded against this fixed
                list, so the label width does not depend on which values happen
                to appear in ``data``. When omitted, the observed values are used.
            max_length: Padding/truncation length. Defaults to the tokenizer's
                ``model_max_length`` capped at 512.
        """
        self.tokenizer = tokenizer
        self.dataframe = data
        ## Columns to target
        self._columns = ["target_sentiment", "companies_sentiment", "consumers_sentiment"]

        if max_length is None:
            # Never pad beyond what the tokenizer advertises for its model.
            max_length = min(getattr(tokenizer, "model_max_length", 512), 512)
        self._max_length = max_length

        targets = data[self._columns]
        if label_values is not None:
            targets = targets.astype(pd.CategoricalDtype(categories=list(label_values)))
        # Encode the whole frame once: encoding a single row cannot produce a
        # fixed-width vector because it only sees that row's values.
        self._labels = pd.get_dummies(targets, columns=self._columns).to_numpy(dtype=np.float32)

    @property
    def columns(self):
        """Return the columns to target"""
        return self._columns

    def __len__(self):
        """Return the number of rows in the dataset"""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at ``index``."""
        text = self.dataframe.iloc[index]['text']
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            max_length=self._max_length,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Float targets: BCEWithLogitsLoss requires the same dtype as the logits.
        label = torch.tensor(self._labels[index], dtype=torch.float32)
        # return_tensors='pt' adds a leading batch axis of size 1; drop it.
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0), label

In [None]:
from torchmetrics import Accuracy, F1Score, Precision, Recall


class FinanciaMultilabel(pl.LightningModule):
    """Lightning module that fine-tunes a sequence-classification checkpoint as a multi-label model.

    Batches are ``(input_ids, attention_mask, labels)`` triples where ``labels``
    holds one 0/1 entry per output logit.
    """

    def __init__(self, num_labels, model_name, class_weights, learning_rate=2e-5):
        """Build the loss, the backbone and the train/validation metrics.

        Args:
            num_labels: Number of output logits (one per label).
            model_name: Checkpoint name passed to ``from_pretrained``.
            class_weights: Forwarded as ``pos_weight`` to ``BCEWithLogitsLoss``.
            learning_rate: Learning rate for AdamW.
        """
        super().__init__()
        self.num_labels = num_labels
        self.learning_rate = learning_rate
        # The model is multi-label, so we need to use BCEWithLogitsLoss.
        # Default (mean) reduction: Lightning needs a scalar loss to backpropagate and log.
        self.loss = nn.BCEWithLogitsLoss(pos_weight=class_weights)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=self.num_labels, ignore_mismatched_sizes=True
        )
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Multilabel metrics are sized with `num_labels` (not `num_classes`).
        metric_args = {"task": "multilabel", "num_labels": self.num_labels}
        # Metrics for the model for training
        self.train_f1 = F1Score(**metric_args)
        self.train_accuracy = Accuracy(**metric_args)
        self.train_precision = Precision(**metric_args)
        self.train_recall = Recall(**metric_args)
        # Metrics for the model for validation
        self.val_f1 = F1Score(**metric_args)
        self.val_accuracy = Accuracy(**metric_args)
        self.val_precision = Precision(**metric_args)
        self.val_recall = Recall(**metric_args)

        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask):
        """Run the backbone and return its raw logits."""
        output = self.model(input_ids, attention_mask=attention_mask)
        return output.logits

    def _loss_and_predictions(self, batch):
        """Return ``(loss, logits, integer targets)`` for one batch."""
        input_ids, attention_mask, labels = batch
        logits = self(input_ids, attention_mask)
        # The loss needs float targets; the metrics are fed integer targets.
        loss = self.loss(logits, labels.float())
        return loss, logits, labels.int()

    def training_step(self, batch, batch_idx):
        """Compute the training loss and update/log the training metrics."""
        loss, logits, targets = self._loss_and_predictions(batch)
        self.train_accuracy(logits, targets)
        self.train_f1(logits, targets)
        self.train_precision(logits, targets)
        self.train_recall(logits, targets)
        self.log("train/loss", loss, on_step=False, on_epoch=True)
        self.log("train/accuracy", self.train_accuracy, on_step=False, on_epoch=True)
        self.log("train/precision", self.train_precision, on_step=False, on_epoch=True)
        self.log("train/recall", self.train_recall, on_step=False, on_epoch=True)
        self.log("train/f1_score", self.train_f1, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and update/log the validation metrics."""
        val_loss, logits, targets = self._loss_and_predictions(batch)
        self.val_accuracy(logits, targets)
        self.val_f1(logits, targets)
        self.val_recall(logits, targets)
        self.val_precision(logits, targets)

        self.log("val/loss", val_loss, on_step=False, on_epoch=True)
        self.log("val/accuracy", self.val_accuracy, on_step=False, on_epoch=True)
        self.log("val/precision", self.val_precision, on_step=False, on_epoch=True)
        self.log("val/recall", self.val_recall, on_step=False, on_epoch=True)
        self.log("val/f1_score", self.val_f1, on_step=False, on_epoch=True)
        return val_loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all module parameters."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
