<a href="https://colab.research.google.com/github/Frorozcoloa/FinancIA/blob/main/Notebooks/0-2%20Train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install transformers wandb torchmetrics lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [75]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import wandb
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import set_seed
from torch import nn

def SEED(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    set_seed(seed)


SEED(42)


# Configuration by model
NUM_VARAIBLES = 3
NUM_LABELS = 3
num_labels = NUM_LABELS * NUM_VARAIBLES
divice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [76]:
from transformers import ( 
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_constant_schedule_with_warmup,
)

# Configuring the model
num_labels = NUM_LABELS * NUM_VARAIBLES
model_name = "pysentimiento/robertuito-sentiment-analysis"
auto_tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, ignore_mismatched_sizes=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
values = auto_tokenizer.encode_plus("Hola mundo", return_tensors="pt")

In [78]:
values

{'input_ids': tensor([[   0, 1878, 1079,    2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [79]:
model(values["input_ids"], values["attention_mask"])

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2960,  0.1110,  0.4316,  0.4011, -0.3098,  0.1733, -0.2494, -0.0835,
         -0.1442]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [137]:
class FinanciaSentimental(Dataset):
    """This class is used to load the data and tokenize it"""
    def __init__(self, tokenizer, dataframe, columns, max_len=512):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        ## Columns to target
        self._columns = columns
        self.max_len = max_len
    
    @property
    def columns(self):
        """Return the columns to target"""
        return self._columns

    def __len__(self):
        """Return the length of the dataset"""
        return len(self.dataframe)
        
    def __getitem__(self, index):
        """Get the data at the index"""
        values = self.dataframe.iloc[index]
        text = values['text']
        label = values[self._columns].values.astype(np.float32)
        inputs = self.tokenizer.encode_plus(text, max_length=512, pad_to_max_length=True, padding='max_length', truncation=True, return_tensors='pt')
        label = torch.tensor(label, dtype=torch.float)
        input_ids = inputs["input_ids"].squeeze().to(dtype=torch.long)
        attention_mask = inputs["attention_mask"].squeeze().to(dtype=torch.long)
        
        inputs_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label":label
        }

        return inputs_dict

In [116]:
from torchmetrics import Accuracy, F1Score, Precision, Recall
import lightning.pytorch as pl

class FinanciaMultilabel(pl.LightningModule):
    """This class is used to create the model"""
    def __init__(self, num_labels, model_name, class_weights):
        super().__init__()
        self.num_labels = num_labels
        # The models is multi-label, so we need to use BCEWithLogitsLoss
        self.loss = nn.BCEWithLogitsLoss(pos_weight=class_weights, reduction='none')
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels, ignore_mismatched_sizes=True)
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        #Metrics for the model for training
        self.train_f1 = F1Score(task = "multilabel", num_labels=num_labels)
        self.train_accuracy = Accuracy(task ="multilabel", num_labels=num_labels)
        self.train_precision = Precision(task ="multilabel", num_labels=num_labels)
        self.train_recall = Recall(task ="multilabel", num_labels=num_labels)
        # Metrics for the model for validation
        self.val_f1 = F1Score(task = "multilabel", num_labels=num_labels)
        self.val_accuracy = Accuracy(task ="multilabel", num_labels=num_labels)
        self.val_precision = Precision(task ="multilabel", num_labels=num_labels)
        self.val_recall = Recall(task ="multilabel", num_labels=num_labels)
        
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask):
        """This function is used to forward the data through the model"""
        output = self.model(input_ids, input_ids)
        logits = output.logits
        return logits

    def training_step(self, batch, batch_idx):
        """This function is used to train the model"""
        bath = batch
        label = batch["label"]
        attention_mask = batch["attention_mask"]
        inputs_id = batch["input_ids"]
        logits = self(inputs_id, attention_mask)
        train_loss = self.loss(logits, labels)
        train_acc = self.train_accuracy(logits, labels)
        train_f1 = self.train_f1(logits, labels)
        train_score = self.train_precision(logits, labels)
        train_recall = self.train_recall(logits, labels)
        self.log("train/loss", train_loss, on_step=False, on_epoch=True)
        self.log("train/accuracy", self.train_accuracy, on_step=False, on_epoch=True)
        self.log("train/precision", self.train_precision, on_step=False, on_epoch=True)
        self.log("train/recall", self.train_recall, on_step=False, on_epoch=True)
        self.log("train/f1_score", self.train_f1_score, on_step=False, on_epoch=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """This function is used to validate the model"""
        bath = batch
        label = batch["label"]
        attention_mask = batch["attention_mask"]
        inputs_id = batch["input_ids"]
        logits = self(inputs_id, attention_mask)
        val_loss = self.loss(logits, labels)
        val_acc = self.val_accuracy(logits, labels)
        val_f1 = self.val_f1(logits, labels)
        val_recall = self.val_recall(logits, labels)
        val_precision = self.val_precision(logits, labels)
        
        self.log("val/loss", val_loss, on_step=False, on_epoch=True)
        self.log("val/accuracy", self.val_accuracy, on_step=False, on_epoch=True)
        self.log("val/precision", self.val_precision, on_step=False, on_epoch=True)
        self.log("val/recall", self.val_recall, on_step=False, on_epoch=True)
        self.log("val/f1_score", self.val_f1_score, on_step=False, on_epoch=True)
        self.log("val/loss", val_loss, on_step=False, on_epoch=True)
        return val_loss

    def configure_optimizers(self):
        """This function is used to configure the optimizer"""
        optimizer = torch.optim.AdamW(self.parameters())


In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
train_df = "/content/drive/Shareddrives/Redes neuronales/Datasets/df_with_sentiment.csv"
df = pd.read_csv(train_df)
df = df[["id",	"text",	"target",	"target_sentiment",	"companies_sentiment",	"consumers_sentiment", "tag"]]
df = pd.get_dummies(df, columns = ["target_sentiment",	"companies_sentiment","consumers_sentiment"])
df_train = df[df.tag == "train"]
df_test = df[df.tag == "test"]
df_valid, df_test = train_test_split(df_test, test_size=0.5)

In [28]:
df_valid

Unnamed: 0,id,text,target,tag,target_sentiment_negative,target_sentiment_neutral,target_sentiment_positive,companies_sentiment_negative,companies_sentiment_neutral,companies_sentiment_positive,consumers_sentiment_negative,consumers_sentiment_neutral,consumers_sentiment_positive
820,80591,La UE advierte de las consecuencias económicas...,UE,test,1,0,0,1,0,0,1,0,0
775,72293,Empieza el juicio por los 500 seísmos que caus...,Castor,test,1,0,0,0,1,0,1,0,0
830,99766,Los ‘nuevos’ ERTE mantienen la prohibición de ...,ERTE,test,0,0,1,1,0,0,0,0,1
822,40165,Cellnex salta mañana al cuarto puesto por capi...,Cellnex,test,0,0,1,1,0,0,0,1,0
848,32667,"Stripe, el mayor 'unicornio' de Silicon Valley...",Stripe,test,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,7312,Así evoluciona el precio de la vivienda usada,precio de vivienda usada,test,0,1,0,0,1,0,0,1,0
842,25376,Santander aprueba la salida a Bolsa de Getnet ...,Santander,test,0,0,1,0,1,0,0,1,0
750,67465,Calviño aboga por retrasar la reforma fiscal h...,Calviño,test,0,0,1,0,1,0,1,0,0
828,310714,El consejo de Norwegian rechaza las dos oferta...,Norwegian,test,0,0,1,1,0,0,0,1,0


In [29]:
columns_varaibles = ["target_sentiment_negative",	"target_sentiment_neutral",	"target_sentiment_positive"	,"companies_sentiment_negative"	,"companies_sentiment_neutral",	"companies_sentiment_positive", 'consumers_sentiment_negative',
       'consumers_sentiment_neutral', 'consumers_sentiment_positive']

In [30]:
print(df.shape)
print(df_train.shape)

(905, 13)
(736, 13)


In [138]:
train_dataset = FinanciaSentimental(auto_tokenizer, df_train, columns_varaibles)
valid_dataset = FinanciaSentimental(auto_tokenizer, df_valid, columns_varaibles)
test_dataset = FinanciaSentimental(auto_tokenizer, df_test, columns_varaibles)

In [139]:
inputs = next(iter(train_dataset))
print(inputs.keys())

dict_keys(['input_ids', 'attention_mask', 'label'])


In [140]:
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)

In [141]:
inputs = next(iter(train_dataloader))
print(inputs.keys())


dict_keys(['input_ids', 'attention_mask', 'label'])


In [142]:
class_weight = torch.ones(9)
model = FinanciaMultilabel(9, model_name,class_weight )

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [143]:
from pytorch_lightning.loggers import WandbLogger
wandb_logger = WandbLogger(project='FinancIA', name='#1', save_code=True, log_model=False, sync_tensorboard=True, save_dir="./logs")
wandb_logger.watch(model, log="all")

  rank_zero_warn(
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


In [144]:
from  lightning.pytorch import Trainer
from  lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from  lightning.pytorch.callbacks.progress import TQDMProgressBar
from  lightning.pytorch.loggers import WandbLogger
from torch.utils.data import DataLoader, WeightedRandomSampler

checkpoint_callback = ModelCheckpoint(monitor="val/f1_score", mode="max", save_last=True, save_weights_only=True)
tqdm_callback = TQDMProgressBar(refresh_rate=1)
early_stop_callback = EarlyStopping(monitor='val_loss', patience=5)
trainer = pl.Trainer( accelerator="cuda", max_epochs=10, logger=wandb_logger, callbacks=[checkpoint_callback, tqdm_callback,early_stop_callback], precision=16,)
trainer.fit(model, train_dataloader, valid_dataloader)

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
INFO: 
  | Name            | Type                             | Params
---------------------------------------------------------------------
0 | loss            | BCEWithLogitsLoss                | 0     
1 | model           | RobertaForSequenceClassification | 108 M 
2 | train_f1        | MultilabelF1Score                | 0     
3 | train_accuracy  | MultilabelAccuracy               | 0     
4 | train_precision | Mul

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


RuntimeError: ignored