<a href="https://colab.research.google.com/github/Frorozcoloa/FinancIA/blob/main/Notebooks/0-2%20Train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers wandb torchmetrics lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.14.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning
  Downloading lightning-2.0.1.post0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-non

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import wandb
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import set_seed
from torch import nn

def SEED(seed):
    """Seed every random number generator this notebook relies on.

    Calls, in order, ``torch.manual_seed``, ``np.random.seed`` and
    ``transformers.set_seed`` with the same integer.

    Args:
        seed: integer seed forwarded unchanged to each seeding function.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, set_seed):
        seed_fn(seed)


# Fix the seed once, before any model or data split is created.
SEED(42)


# Configuration by model
# Three sentiment columns are modelled (target, companies, consumers) ...
NUM_VARIABLES = 3
# Misspelled legacy name, kept as an alias because other cells still reference it.
NUM_VARAIBLES = NUM_VARIABLES
# ... each with three classes (negative, neutral, positive).
NUM_LABELS = 3
# Size of the flattened multi-label output vector.
num_labels = NUM_LABELS * NUM_VARIABLES
# NOTE(review): a later cell reassigns model_name; this is only the initial value.
model_name = "pysentimiento/roberta-es-sentiment"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Misspelled legacy name, kept as an alias for backward compatibility.
divice = device

In [None]:
from transformers import ( 
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_constant_schedule_with_warmup,
)

# Tokenizer / backbone setup
num_labels = NUM_LABELS * NUM_VARAIBLES
# Checkpoint used from here on; it replaces the value assigned in the
# configuration cell ("pysentimiento/roberta-es-sentiment").
model_name = "pysentimiento/robertuito-sentiment-analysis"
auto_tokenizer = AutoTokenizer.from_pretrained(model_name)
# ignore_mismatched_sizes is passed because num_labels is overridden here.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

In [None]:
# Quick tokenizer check: encode a short phrase and display the encoding.
sample_encoding = auto_tokenizer.encode_plus("Hola mundo")
sample_encoding

In [None]:
class FinanciaSentimental(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per access.

    Args:
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        dataframe: frame holding a ``text`` column plus the label columns.
        columns: names of the label columns that form the target vector.
        max_length: length every sample is padded/truncated to. Defaults to
            512, the value that used to be hard-coded.
            NOTE(review): this should not exceed the maximum sequence length
            supported by the chosen checkpoint — confirm for the model in use.
    """
    def __init__(self, tokenizer, dataframe, columns, max_length=512):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        ## Columns to target
        self._columns = columns
        self.max_length = max_length

    @property
    def columns(self):
        """Return the columns to target"""
        return self._columns

    def __len__(self):
        """Return the number of rows in the wrapped dataframe"""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            Tuple ``(input_ids, attention_mask, label)``. The first two are the
            tokenizer outputs with the leading batch axis squeezed away;
            ``label`` is a float32 tensor with one entry per target column.
        """
        row = self.dataframe.iloc[index]
        encoding = self.tokenizer.encode_plus(
            row['text'],
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Cast through float32 explicitly: the row is a mixed-type Series, so
        # the selected label values do not carry a numeric dtype on their own.
        label_values = row[self._columns].values.astype(np.float32)
        label = torch.tensor(label_values, dtype=torch.float32)
        # return_tensors='pt' adds a leading axis of size 1; drop it so the
        # DataLoader can add the real batch axis when collating.
        return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0), label

In [None]:
from torchmetrics import Accuracy, F1Score, Precision, Recall
import lightning.pytorch as pl

class FinanciaMultilabel(pl.LightningModule):
    """Multi-label classifier wrapping a pretrained sequence-classification model.

    Args:
        num_labels: size of the output vector (one logit per label).
        model_name: checkpoint name passed to ``from_pretrained``.
        class_weights: tensor forwarded as ``pos_weight`` to ``BCEWithLogitsLoss``.
        learning_rate: learning rate handed to AdamW. Defaults to 1e-3, which is
            AdamW's own default, so omitting it matches a bare
            ``AdamW(self.parameters())``.
            NOTE(review): fine-tuning pretrained transformers usually uses a much
            smaller value (around 2e-5) — consider passing one explicitly.
    """

    # Log-key suffix -> attribute suffix of the matching metric object.
    _METRIC_ATTRS = {
        "accuracy": "accuracy",
        "precision": "precision",
        "recall": "recall",
        "f1_score": "f1",
    }

    def __init__(self, num_labels, model_name, class_weights, learning_rate=1e-3):
        super().__init__()
        self.num_labels = num_labels
        self.learning_rate = learning_rate
        # The model is multi-label, so we use BCEWithLogitsLoss. The reduction
        # must produce a scalar: the value returned from training_step is used
        # for backward() and is passed to self.log.
        self.loss = nn.BCEWithLogitsLoss(pos_weight=class_weights, reduction='mean')
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels, ignore_mismatched_sizes=True)
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Metrics for training
        self.train_f1 = F1Score(task="multilabel", num_labels=num_labels)
        self.train_accuracy = Accuracy(task="multilabel", num_labels=num_labels)
        self.train_precision = Precision(task="multilabel", num_labels=num_labels)
        self.train_recall = Recall(task="multilabel", num_labels=num_labels)
        # Metrics for validation (separate objects so their state never mixes
        # with the training metrics)
        self.val_f1 = F1Score(task="multilabel", num_labels=num_labels)
        self.val_accuracy = Accuracy(task="multilabel", num_labels=num_labels)
        self.val_precision = Precision(task="multilabel", num_labels=num_labels)
        self.val_recall = Recall(task="multilabel", num_labels=num_labels)

        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its raw logits."""
        output = self.model(input_ids, attention_mask=attention_mask)
        return output.logits

    def _shared_step(self, batch, stage):
        """Compute and log the loss and metrics for one batch; return the scalar loss.

        ``stage`` is ``"train"`` or ``"val"``; it selects the metric objects
        (``self.<stage>_<metric>``) and prefixes the logged keys (``<stage>/...``).
        """
        input_ids, attention_mask, labels = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss(logits, labels)
        self.log(f"{stage}/loss", loss, on_step=False, on_epoch=True)

        # The loss needs float targets; the metrics are given integer targets.
        # NOTE(review): torchmetrics documents that float predictions outside
        # [0, 1] are treated as logits (sigmoid applied) — confirm for the
        # installed version.
        targets = labels.int()
        for key, attr in self._METRIC_ATTRS.items():
            metric = getattr(self, f"{stage}_{attr}")
            metric(logits, targets)
            # Log the metric object itself so the epoch-level value comes from
            # its accumulated state.
            self.log(f"{stage}/{key}", metric, on_step=False, on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Run one training batch and return the scalar loss."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Run one validation batch and return the scalar loss."""
        return self._shared_step(batch, "val")

    def configure_optimizers(self):
        """Build and return the AdamW optimizer over all module parameters."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)


In [None]:
from google.colab import drive
# Mount Google Drive so the dataset path used below is reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Path of the labelled CSV on the shared drive. The variable holds a path, not a
# DataFrame; its name is kept unchanged for backward compatibility.
train_df = "/content/drive/Shareddrives/Redes neuronales/Datasets/df_with_sentiment.csv"
# Categorical sentiment columns that get one-hot encoded.
sentiment_columns = ["target_sentiment", "companies_sentiment", "consumers_sentiment"]

df = pd.read_csv(train_df)
df = df[["id", "text", "target", *sentiment_columns, "tag"]]
# One indicator column per (sentiment column, category) pair.
df = pd.get_dummies(df, columns=sentiment_columns)
df_train = df[df.tag == "train"]
df_test = df[df.tag == "test"]
# Halve the rows tagged "test" into validation and test. A fixed random_state
# makes the split independent of global RNG state and of the order in which
# cells were run.
df_valid, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

In [None]:
# Display the validation split (half of the rows tagged "test").
df_valid

In [None]:
# One-hot label columns in the order the model's output vector follows:
# three classes for each of the three sentiment columns.
columns_varaibles = [
    "target_sentiment_negative",
    "target_sentiment_neutral",
    "target_sentiment_positive",
    "companies_sentiment_negative",
    "companies_sentiment_neutral",
    "companies_sentiment_positive",
    'consumers_sentiment_negative',
    'consumers_sentiment_neutral',
    'consumers_sentiment_positive',
]

In [None]:
# Shape of the full frame, then of the training subset (one per line).
print(df.shape, df_train.shape, sep="\n")

In [None]:
# Wrap each split in the tokenizing dataset; all three share the same
# tokenizer and label columns.
train_dataset, valid_dataset, test_dataset = (
    FinanciaSentimental(auto_tokenizer, split_frame, columns_varaibles)
    for split_frame in (df_train, df_valid, df_test)
)

In [None]:
# Look at the first sample as the dataset yields it (no batch axis yet).
# Names follow what the dataset returns and avoid shadowing the builtin `input`.
input_ids, attention_mask, labels = next(iter(train_dataset))
print(input_ids.shape)
print(attention_mask.shape)
print(labels.shape)

In [None]:
BATCH_SIZE = 16

# Only the training loader shuffles. Validation and test keep a fixed order so
# evaluation is run on the same batches every time.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Look at one collated training batch (leading axis is the batch).
# Names follow what the dataset returns and avoid shadowing the builtin `input`.
input_ids, attention_mask, labels = next(iter(train_dataloader))
print(input_ids.shape)
print(attention_mask.shape)
print(labels.shape)

In [None]:
# Derive the output size from the label columns so the model head, the loss
# weights and the dataset targets cannot drift apart.
n_outputs = len(columns_varaibles)
# Uniform positive-class weights (all ones).
class_weight = torch.ones(n_outputs)
model = FinanciaMultilabel(n_outputs, model_name, class_weight)

In [None]:
# Import from the same namespace as the LightningModule and Trainer
# (`lightning.pytorch`); objects from the standalone `pytorch_lightning`
# package should not be mixed with it.
from lightning.pytorch.loggers import WandbLogger

wandb_logger = WandbLogger(
    project='FinancIA',
    name='#1',
    save_code=True,
    log_model=False,
    sync_tensorboard=True,
    save_dir="./logs",
)
# Have W&B track the model (log="all").
wandb_logger.watch(model, log="all")

In [None]:
# Use the `lightning.pytorch` namespace throughout, matching the `pl` alias the
# LightningModule and the Trainer below are built from.
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.callbacks.progress import TQDMProgressBar
from lightning.pytorch.loggers import WandbLogger
from torch.utils.data import DataLoader, WeightedRandomSampler

# Checkpoint on the best validation F1 and always keep the last checkpoint.
checkpoint_callback = ModelCheckpoint(monitor="val/f1_score", mode="max", save_last=True, save_weights_only=True)
tqdm_callback = TQDMProgressBar(refresh_rate=1)
# The module logs its validation loss under "val/loss"; the monitored key has
# to match that name exactly.
early_stop_callback = EarlyStopping(monitor="val/loss", mode="min", patience=5)

# Same CUDA-or-CPU fallback as the configuration cell; 16-bit precision is
# requested only when a GPU is available.
use_gpu = torch.cuda.is_available()
trainer = pl.Trainer(
    accelerator="cuda" if use_gpu else "cpu",
    max_epochs=10,
    logger=wandb_logger,
    callbacks=[checkpoint_callback, tqdm_callback, early_stop_callback],
    precision=16 if use_gpu else 32,
)
trainer.fit(model, train_dataloader, valid_dataloader)