In [1]:
import os
import json
import torch
import warnings
import torch
import random
import optuna
import re
import numpy as np
import pandas as pd
import torch.nn as nn
import networkx as nx
from sklearn.metrics import f1_score
from datasets import Dataset, DatasetDict
from typing import Optional, Union, Tuple
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import set_seed
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoConfig, AutoModel, BertPreTrainedModel, RobertaPreTrainedModel, RobertaModel, BertModel,AutoModelForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification
warnings.filterwarnings("ignore")
import torch

torch.cuda.empty_cache()  # Libera memoria en caché
torch.cuda.ipc_collect()  # Recoge memoria inaccesible
# Clear memory in Python
import gc
gc.collect()


95

In [2]:
import peft
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    # https://github.com/huggingface/peft/issues/96#issuecomment-1460080427
    TrainerCallback, TrainerState, TrainerControl, 
    AutoModelForSequenceClassification,
    LlamaForSequenceClassification,
    EarlyStoppingCallback
)
import torch
from peft import LoraConfig, TaskType, PeftModel
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

device = torch.device("cuda")
assert torch.cuda.is_available()

In [3]:
seed=42
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Este notebook contiene:
- la implementación de optuna para buscar los hiperparámetros del clasificador
- optuna usa optimizacion bayesiana (usa un modelo probabilistico para buscar valores que tengan mas probabilidad de mejorar el modelo basándose en resultados de haber aplicado valores anteriores)
- Usa el método de optimización bayesiana TPE (Tree-structured Parzen Estimator),  que divide experimentos previos en grupos que dan buenos y malos resultados, no modela función de búsqueda, crea dos distribuciones de probabilidad y busca en la región donde haya mas probabilidad de mejorar el rendimiento del modelo
- optuna equilibra exploración/explotación

Conceptos:
- trial: es un intento de optuna por encontrar buena combinación de hiperparámetros
- en cada trial genera combinación de hiperparámetros(n cadas, n neuronas, dropout), entrena y evalúa

- experimento: conjunto de trials o intentos ejecutados durante la optimización (proceso completo de encontrar le mejor combinación de hiperparámetros)
- en el experimento se gestioan los trials, se realiza el seguimiento de las evaluaciones y se selecciona la mejor combinación de hiperparámetros

- 15 trials, combinaciones independientes
- 10 experimentos, 10 mejores combinaciones de esas 15 trials en cada uno
- al final de cada experimento se queda con el que se haya obtenido mayor f1 score


In [4]:
def cleaner1(tweet):
    # remove usernames
    # tweet = re.sub("@[A-Za-z0-9]+","",tweet)
    tweet = tweet.lower()
    tweet = re.sub("^rt", "", tweet)
    tweet = re.sub("\s[0-9]+\s", "", tweet)

    # remove usernames
    tweet = re.sub("@[^\s]+", "", tweet)
    tweet = re.sub("at_user", "", tweet)


    # remove urls
    tweet = re.sub("pic.twitter.com/[A-Za-z0-9]+", "", tweet)
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
    tweet = tweet.replace("url", "")

    tweet = tweet.strip()
    tweet = " ".join(tweet.split())

    return tweet

# Optuna

In [None]:
#model names: "meta-llama/Llama-3.2-1B", "bert-base-uncased" , "roberta-base"

model_name = "roberta-base"

#BINARY LABELS
def binary_labels(df):
    df= df.replace({'label': {'neg': 0, 'pos': 1}})
    id2label = {0: "NEG", 1: "POS"}
    label2id = {"NEG": 0, "POS": 1}
    return df, id2label, label2id

# METRICS
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall}

# TOKENIZER
def tokenize_function(examples):
    tokenized_inputs=tokenizer(examples["text"], truncation=True)
    return tokenized_inputs


#https://discuss.huggingface.co/t/combine-bertforsequenceclassificaion-with-additional-features/6523/2
#MODELS


#dimensión del contexto social

tweets = pd.read_pickle('../models/svd_df.pkl')
#tweets
#tweets = pd.read_pickle('../models/deepwalk_df.pkl')
#tweets
#tweets = pd.read_pickle('../models/node2vec_df.pkl')
#tweets
#tweets = pd.read_pickle('../models/tadw_df.pkl')

tweets['text']=tweets['text'].map(cleaner1)
extra_dims = 32

tweets.head()




In [None]:
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

class CustomBertSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_extra_dims,trial):
        # Initialize the base model (BERT)
        super().__init__(config)
        self.config = config
        total_dims = config.hidden_size + num_extra_dims 
        self.bert = BertModel(config)
        
        #classifier
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        #self.classifier = nn.Linear(total_dims, config.num_labels)
        self.config = config
        self.num_extra_dims = num_extra_dims

        layers = []
        #numero de capas que se van a definir, optuna elige entre 1 y 3
        num_layers = trial.suggest_int("num_layers", 1, 3) 
        in_features = total_dims
        print(in_features)
        for i in range(num_layers):
            #numero de neuronas que se van a definir en cada capa, elige un número entre 32 y 256
            #log true permite que buscar tener más probabilidad de elegir valores pequeños 
            out_features = trial.suggest_int(f"n_units_l{i}", 32, 256, log=True) if trial else 128
            #print(out_features)
            layers.append(nn.Linear(in_features, out_features))
            # valor de droppout elegido, elegirá valor decimal entre 0.1 y 0.5
            layers.append(nn.Dropout(trial.suggest_float(f"dropout_l{i}", 0.1, 0.5) if trial else 0.2))
            in_features = out_features
            #print(in_features, out_features)
        
        layers.append(nn.Linear(in_features, config.num_labels))  
        self.classifier = nn.Sequential(*layers)  
        self.post_init()
    

    
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        
        # Get output from the BERT model (transformer)
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        sequence_output = outputs.pooler_output
        sequence_output = self.dropout(sequence_output)
     
        if extra_data is not None:
            output = torch.cat((sequence_output, extra_data), dim=-1)
        else:
            output = sequence_output

        logits = self.classifier(output)
        
        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  # Extract num_extra_dims
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  # Inject extra_dims into config

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  # Pass it explicitly
            **kwargs
        )

In [None]:
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, num_extra_dims,trial):
        super().__init__()
        total_dims = config.hidden_size + num_extra_dims  # Correctly add extra dimensions
        classifier_dropout = (config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob)
        self.dense = nn.Linear(total_dims, total_dims)
        self.dropout = nn.Dropout(classifier_dropout)


        layers = []
        #numero de capas que se van a definir, optuna elige entre 1 y 3
        num_layers = trial.suggest_int("num_layers", 1, 3) 
        in_features = total_dims
        print(in_features)
        for i in range(num_layers):
            #numero de neuronas que se van a definir en cada capa, elige un número entre 32 y 256
            #log true permite que buscar tener más probabilidad de elegir valores pequeños 
            out_features = trial.suggest_int(f"n_units_l{i}", 32, 256, log=True) if trial else 128
            #print(out_features)
            layers.append(nn.Linear(in_features, out_features))
            # valor de droppout elegido, elegirá valor decimal entre 0.1 y 0.5
            layers.append(nn.ReLU())  # Se agregó activación ReLU para mejorar aprendizaje

            layers.append(nn.Dropout(trial.suggest_float(f"dropout_l{i}", 0.1, 0.5) if trial else 0.2))
            in_features = out_features
            #print(in_features, out_features)
        
        layers.append(nn.Linear(in_features, config.num_labels))  
        #self.classifier = nn.Sequential(*layers)  

        
        #classifier_dropout = (config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob)
        #self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Sequential(*layers) # Match total_dims


    def forward(self, features, **kwargs):
        x = self.dropout(features)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class CustomRobertaForSequenceClassification(RobertaPreTrainedModel):
    def __init__(self, config, num_extra_dims,trial):
        super().__init__(config)
        self.config = config
        self.num_extra_dims = num_extra_dims 
        total_dims = config.hidden_size + num_extra_dims 
        
        # classifier 
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config, num_extra_dims,trial)

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,  
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        cls_output = sequence_output[:, 0, :] 

        if extra_data is not None:
            cls_output = torch.cat((cls_output, extra_data), dim=-1)

        logits = self.classifier(cls_output)
            
        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  # Extract num_extra_dims
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  # Inject extra_dims into config

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  # Pass it explicitly
            **kwargs
        )


In [None]:
import torch
import torch.nn as nn
from typing import Optional, Tuple, List, Union
from transformers import LlamaPreTrainedModel, LlamaModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
from transformers.utils import logging

class CustomLlamaForSequenceClassification(LlamaPreTrainedModel):
    def __init__(self, config, num_extra_dims,trial):
        super().__init__(config)
        self.config = config
        self.num_labels = 2
        self.num_extra_dims = num_extra_dims
        total_dims = config.hidden_size + num_extra_dims
        self.config.pad_token_id = self.config.eos_token_id  # Asegurar que el padding sea el token de fin de secuencia

        self.model = LlamaModel(config)

        # Construcción de la capa de clasificación con Optuna
        layers = []
        num_layers = trial.suggest_int("num_layers", 1, 3) if trial else 2  # Default: 2 capas si no hay trial
        in_features = total_dims

        for i in range(num_layers):
            out_features = trial.suggest_int(f"n_units_l{i}", 32, 256, log=True) if trial else 128
            layers.append(nn.Linear(in_features, out_features))
            #layers.append(nn.ReLU())  # Se agregó activación ReLU para mejorar aprendizaje
            layers.append(nn.Dropout(trial.suggest_float(f"dropout_l{i}", 0.1, 0.5) if trial else 0.2))
            in_features = out_features

        layers.append(nn.Linear(in_features, self.num_labels))
        self.score = nn.Sequential(*layers)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,  
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        hidden_states = transformer_outputs[0]
        if extra_data is not None:
            extra_data = extra_data.unsqueeze(1)  # Reshape to (batch_size, 1, num_extra_dims)
            extra_data = extra_data.expand(-1, hidden_states.shape[1], -1)  # Expand to (batch_size, sequence_length, num_extra_dims)
            hidden_states = torch.cat((hidden_states, extra_data), dim=-1)
            pooled_representation = hidden_states[:, -1, :]# Shape: (batch_size, sequence_length, hidden_size + num_extra_dims)
            logits = self.score(pooled_representation.to(self.score.weight.dtype))
            #print(f"Logits shape: {logits.shape}")

        else:
            logits = self.score(hidden_states)
            print(f"Logits shape: {logits.shape}")

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]
        
        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        
        # Determine the last non-pad token position (for padding handling)
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
        #print(f"Last non-pad token indices: {last_non_pad_token}")
        
        if extra_data is None:
            pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
        else:
            pooled_logits = logits    
        #print(f"Pooled logits shape before checking: {pooled_logits.shape}")

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  # Extract num_extra_dims
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  # Inject extra_dims into config

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  # Pass it explicitly
            **kwargs
        )


In [None]:
#data

df, id2label, label2id = binary_labels(tweets)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
datasets = {'train': Dataset.from_pandas(train_df), 'val': Dataset.from_pandas(val_df), 'test': Dataset.from_pandas(test_df)}


datasets = DatasetDict(datasets)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
model_name

In [None]:
#model names: "meta-llama/Llama-3.2-1B", "bert-base-uncased" , "roberta-base"

def objective(trial):
    if model_name == "meta-llama/Llama-3.2-1B":
        truncation = True
        max_length = 2000
        tokenized_datasets = datasets.map(tokenize_function, batched=True)
        

        # Quantization Config
        quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.float16)
        
        peft_config = LoraConfig(
            lora_alpha=16,
            lora_dropout=0.1,
            r=64,
            bias="none",
            task_type="SEQ_CLS",
        )
        
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = "[PAD]"
        
        
        
        
        config = AutoConfig.from_pretrained(
            model_name,
            num_labels=2,
            id2label=id2label,
            label2id=label2id
        )
        
        # Load Custom Model with Quantization
        model = CustomLlamaForSequenceClassification.from_pretrained(
            model_name,
            config=config,
            quantization_config=quantization_config,
            low_cpu_mem_usage=True,
            num_extra_dims=extra_dims,  trial=trial
        ).to(torch.float32)




        
        training_args = TrainingArguments(
            output_dir = "out/",
            learning_rate=2e-4,
            num_train_epochs=10,
            per_device_train_batch_size = 1,
            per_device_eval_batch_size = 1,
            weight_decay=0.01,
            eval_strategy = "epoch",
            push_to_hub=False,
            save_strategy='no',
            #save_safetensors=True,
            #load_best_model_at_end = True,
            #report_to="none",
        )

        
    elif model_name == "bert-base-uncased":

        tokenized_datasets = datasets.map(tokenize_function, batched=True)
    
        config = AutoConfig.from_pretrained('bert-base-uncased', num_labels=2, 
            id2label=id2label, 
            label2id=label2id
        )
        
        model = CustomBertSequenceClassification.from_pretrained(
            'bert-base-uncased',  
            config=config,
            num_extra_dims=extra_dims, trial=trial)
    
        training_args = TrainingArguments(
            output_dir = '/model/',
            learning_rate=2e-5,
            num_train_epochs=10,
            per_device_train_batch_size = 16,
            per_device_eval_batch_size=16,
            weight_decay=0.01,
            eval_strategy = "epoch",
            push_to_hub=False,
            save_strategy='no',
            seed=42)

    
        
    elif model_name == "roberta-base":
        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
        tokenized_datasets = datasets.map(tokenize_function, batched=True)
    
        config = AutoConfig.from_pretrained(
            "FacebookAI/roberta-base",
            num_labels=2,
            id2label=id2label,
            label2id=label2id
        )
    
        model = CustomRobertaForSequenceClassification.from_pretrained(
            "FacebookAI/roberta-base",
            config=config,
            num_extra_dims=extra_dims, trial=trial
        )
                
        training_args = TrainingArguments(
            output_dir = '/model/',
            learning_rate=2e-5,
            num_train_epochs=10,
            per_device_train_batch_size = 16,
            per_device_eval_batch_size=16,
            weight_decay=0.01,
            eval_strategy = "epoch",
            push_to_hub=False,
            save_strategy='no',
            seed=42)
        


    trainer = Trainer(
        model=model,           
        args=training_args,              
        train_dataset=tokenized_datasets['train'],  
        eval_dataset=tokenized_datasets['val'],     
        tokenizer=tokenizer,             
        compute_metrics=compute_metrics  
    )

    trainer.train()

    #para evaluar el estudio se usan los datos de evaluación se usa el valor de f1 score
    eval_result = trainer.evaluate()
    return eval_result["eval_f1"]  
    
# se indica que se quiere maximizar el f1 score
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print("Best trial:", study.best_trial)


In [None]:
best_params = study.best_trial

print("Mejores hiperparámetros encontrados:", best_params)

# Test

In [5]:
#model names: "meta-llama/Llama-3.2-1B", "bert-base-uncased" , "roberta-base"
model_name = "roberta-base"


def binary_labels(df):
    df= df.replace({'label': {'neg': 0, 'pos': 1}})
    id2label = {0: "NEG", 1: "POS"}
    label2id = {"NEG": 0, "POS": 1}
    return df, id2label, label2id


#tweets = pd.read_pickle('../models/svd_df.pkl')
#tweets
#tweets = pd.read_pickle('../models/deepwalk_df.pkl')
#tweets
#tweets = pd.read_pickle('../models/node2vec_df.pkl')
#tweets
tweets = pd.read_pickle('../models/tadw_df.pkl')
tweets['text']=tweets['text'].map(cleaner1)
extra_dims = 80

In [6]:
# METRICS
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall}

# TOKENIZER
def tokenize_function(examples):
    tokenized_inputs=tokenizer(examples["text"], truncation=True)
    return tokenized_inputs


from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

class CustomBertSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_extra_dims):
        # Initialize the base model (BERT)
        super().__init__(config)
        self.config = config
        total_dims = config.hidden_size + num_extra_dims 
        self.bert = BertModel(config)
        
        #classifier
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        '''
        # Aplicar los mejores hiperparámetros encontrados
        self.num_layers = 1
        self.dropout_prob0 = 0.4308857828688666
        self.n_units_l0 = 221
        

        layers = []
        in_features = total_dims
        layers.append(nn.Linear(in_features, self.n_units_l0))
        layers.append(nn.Dropout(self.dropout_prob0))
        layers.append(nn.Linear(self.n_units_l0, config.num_labels))  
        self.classifier = nn.Sequential(*layers)
        self.post_init()

        '''
        # Aplicar los mejores hiperparámetros encontrados
        self.num_layers = 3
        self.dropout_prob0 = 0.23176856907840251
        self.n_units_l0 = 54
        self.dropout_prob1 = 0.18476776697400715
        self.n_units_l1 = 85
        self.dropout_prob2 = 0.382516054666877
        self.n_units_l2 = 122

        # Define layers
        layers = []
        in_features = total_dims
        layers.append(nn.Linear(in_features, self.n_units_l0))
        layers.append(nn.Dropout(self.dropout_prob0))
        layers.append(nn.Linear(self.n_units_l0, self.n_units_l1)) 
        layers.append(nn.Dropout(self.dropout_prob1))
        layers.append(nn.Linear(self.n_units_l1, config.num_labels))
        #layers.append(nn.Dropout(self.dropout_prob2))
        #layers.append(nn.Linear(self.n_units_l2, config.num_labels))
        self.classifier = nn.Sequential(*layers)
        self.post_init()
        


    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        
        # Get output from the BERT model (transformer)
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        sequence_output = outputs.pooler_output
        sequence_output = self.dropout(sequence_output)
     
        if extra_data is not None:
            output = torch.cat((sequence_output, extra_data), dim=-1)
        else:
            output = sequence_output

        logits = self.classifier(output)

        
        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  # Extract num_extra_dims
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  # Inject extra_dims into config

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  # Pass it explicitly
            **kwargs
        )



In [7]:
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, num_extra_dims):
        super().__init__()
        total_dims = config.hidden_size + num_extra_dims  # Correctly add extra dimensions
        classifier_dropout = (config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob)
        self.dense = nn.Linear(total_dims, total_dims)
        self.dropout = nn.Dropout(classifier_dropout)


        self.num_layers = 3
        self.dropout_prob0 = 0.2902443815942271
        self.n_units_l0 = 203
        self.dropout_prob1 = 0.21294012560852782
        self.n_units_l1 = 134
        self.dropout_prob2 = 0.2101716211711512
        self.n_units_l2 = 239

        # Define layers
        layers = []
        in_features = total_dims
        layers.append(nn.Linear(in_features, self.n_units_l0))
        layers.append(nn.Dropout(self.dropout_prob0))
        layers.append(nn.Linear(self.n_units_l0, self.n_units_l1)) 
        layers.append(nn.Dropout(self.dropout_prob1))
        layers.append(nn.Linear(self.n_units_l1, self.n_units_l2))
        layers.append(nn.Dropout(self.dropout_prob2))
        layers.append(nn.Linear(self.n_units_l2, config.num_labels))
        #self.classifier = nn.Sequential(*layers)  

        
        #classifier_dropout = (config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob)
        #self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Sequential(*layers) # Match total_dims


    def forward(self, features, **kwargs):
        x = self.dropout(features)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class CustomRobertaForSequenceClassification(RobertaPreTrainedModel):
    def __init__(self, config, num_extra_dims):
        super().__init__(config)
        self.config = config
        self.num_extra_dims = num_extra_dims 
        total_dims = config.hidden_size + num_extra_dims 
        
        # classifier 
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config, num_extra_dims)

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,  
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        cls_output = sequence_output[:, 0, :] 

        if extra_data is not None:
            cls_output = torch.cat((cls_output, extra_data), dim=-1)

        logits = self.classifier(cls_output)
            
        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  # Extract num_extra_dims
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  # Inject extra_dims into config

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  # Pass it explicitly
            **kwargs
        )


In [8]:
#data

df, id2label, label2id = binary_labels(tweets)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
datasets = {'train': Dataset.from_pandas(train_df), 'val': Dataset.from_pandas(val_df), 'test': Dataset.from_pandas(test_df)}


datasets = DatasetDict(datasets)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_name='roberta-base'

In [9]:

if model_name=="meta-llama/Llama-3.2-1B":
    truncation = True
    max_length = 2000

    quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.float16)
    
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="SEQ_CLS",
    )
        
    new_model = CustomLlamaForSequenceClassification(
        model_name=model_name,
        quantization_config=quantization_config,
        peft_config=peft_config,
        num_extra_dims=extra_dims,
        num_labels=2)
    
    tokenizer.pad_token = "[PAD]"
    tokenizer.pad_token_id = new_model.config.pad_token_id
    tokenized_datasets = datasets.map(tokenize_function, batched=True)
    
    new_model.config.pad_token_id = new_model.config.eos_token_id
    
    training_args = TrainingArguments(
        output_dir = "out/",
        learning_rate=2e-4,
        num_train_epochs=10,
        per_device_train_batch_size = 1,
        per_device_eval_batch_size = 1,
        weight_decay=0.01,
        eval_strategy = "epoch",
        push_to_hub=False,
        save_strategy='no',
        #save_safetensors=True,
        #load_best_model_at_end = True,
        #report_to="none",
    )

elif model_name == "bert-base-uncased":
    tokenized_datasets = datasets.map(tokenize_function, batched=True)

    config = AutoConfig.from_pretrained('bert-base-uncased', num_labels=2, 
        id2label=id2label, 
        label2id=label2id
    )
    
    new_model = CustomBertSequenceClassification.from_pretrained(
        'bert-base-uncased',  
        config=config,
        num_extra_dims=extra_dims) 
    #args
    training_args = TrainingArguments(
        output_dir = '/model/',
        learning_rate=2e-5,
        num_train_epochs=10,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        eval_strategy = "epoch",
        push_to_hub=False,
        save_strategy='no',
        seed=42)
    
    

elif model_name =="roberta-base":
    tokenized_datasets = datasets.map(tokenize_function, batched=True)


    config = AutoConfig.from_pretrained('roberta-base', num_labels=2, 
        id2label=id2label, 
        label2id=label2id
    )
    
    new_model = CustomRobertaForSequenceClassification.from_pretrained(
        'roberta-base',  
        config=config,
        num_extra_dims=extra_dims
    )
    #args
    training_args = TrainingArguments(
        output_dir = '/model/',
        learning_rate=2e-5,
        num_train_epochs=10,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        eval_strategy = "epoch",
        push_to_hub=False,
        save_strategy='no',
        seed=42)



trainer = Trainer(
        model=new_model,           
        args=training_args,              
        train_dataset=tokenized_datasets['train'],  
        eval_dataset=tokenized_datasets['val'],     
        tokenizer=tokenizer,             
        compute_metrics=compute_metrics  
)

trainer.train()


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.0.bias', 'classifier.out_proj.0.weight', 'classifier.out_proj.2.bias', 'classifier.out_proj.2.weight', 'classifier.out_proj.4.bias', 'classifier.out_proj.4.weight', 'classifier.out_proj.6.bias', 'classifier.out_proj.6.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.693127,0.5,0.333333,0.25,0.5
2,No log,0.693104,0.5,0.333333,0.25,0.5
3,No log,0.693062,0.5,0.333333,0.25,0.5
4,No log,0.692874,0.5,0.333333,0.25,0.5
5,No log,0.691817,0.5,0.333333,0.25,0.5
6,No log,0.692261,0.5,0.333333,0.25,0.5
7,No log,0.690629,0.5,0.333333,0.25,0.5
8,No log,0.691562,0.5,0.333333,0.25,0.5
9,No log,0.691639,0.5,0.333333,0.25,0.5
10,No log,0.691594,0.5,0.333333,0.25,0.5


TrainOutput(global_step=70, training_loss=0.6867175510951451, metrics={'train_runtime': 7.6218, 'train_samples_per_second': 132.515, 'train_steps_per_second': 9.184, 'total_flos': 19897866696240.0, 'train_loss': 0.6867175510951451, 'epoch': 10.0})

In [10]:

predictions = trainer.predict(tokenized_datasets['test'])
predicted_class_ids = predictions.predictions.argmax(axis=1)
actual_labels = tokenized_datasets['test']['label']
results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True) 
results

{'0': {'precision': 0.53125,
  'recall': 1.0,
  'f1-score': 0.6938775510204082,
  'support': 17.0},
 '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15.0},
 'accuracy': 0.53125,
 'macro avg': {'precision': 0.265625,
  'recall': 0.5,
  'f1-score': 0.3469387755102041,
  'support': 32.0},
 'weighted avg': {'precision': 0.2822265625,
  'recall': 0.53125,
  'f1-score': 0.36862244897959184,
  'support': 32.0}}

In [12]:
new_model

CustomRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   