In [1]:
import os
import re
import json
import torch
import warnings
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.metrics import f1_score
from datasets import Dataset, DatasetDict
from typing import Optional, Union, Tuple
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import set_seed
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoConfig, BertPreTrainedModel, BertModel,AutoModel,LlamaPreTrainedModel, RobertaPreTrainedModel,AutoModelForSequenceClassification,RobertaModel, BertForSequenceClassification, RobertaForSequenceClassification
warnings.filterwarnings("ignore")

torch.cuda.empty_cache()  
torch.cuda.ipc_collect()  
from typing import Callable, List, Optional, Tuple, Union

In [2]:
seed=42
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Este notebook contiene:
- el baseline de clasificación de sentimientos 
- la adición del contexto social con modelos (svd, deepwalk, node2vec, tadw)
- la adición haciendo uso de la concatenación con el texto original


  


## Funciones

In [None]:
#CLEAN DATA
def cleaner1(tweet):
    # remove usernames
    # tweet = re.sub("@[A-Za-z0-9]+","",tweet)
    tweet = tweet.lower()
    tweet = re.sub("^rt", "", tweet)
    tweet = re.sub("\s[0-9]+\s", "", tweet)
    # remove usernames
    tweet = re.sub("@[^\s]+", "", tweet)
    tweet = re.sub("at_user", "", tweet)
    # remove urls
    tweet = re.sub("pic.twitter.com/[A-Za-z0-9]+", "", tweet)
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
    tweet = tweet.replace("url", "")
    tweet = tweet.strip()
    tweet = " ".join(tweet.split())
    return tweet



#BINARY LABELS SENT
def binary_labels(df):
    df= df.replace({'label': {'neg': 0, 'pos': 1}})
    id2label = {0: "NEG", 1: "POS"}
    label2id = {"NEG": 0, "POS": 1}
    return df, id2label, label2id

#MULTICLASS LABELS SENT
def multi_labels(df):
    df= df.replace({"label": {"negative": 0, "positive": 1,"neutral":2}})
    id2label = {0: "NEGATIVE", 1: "POSITIVE", 2:"NEUTRAL"}
    label2id = {"NEGATIVE": 0, "POSITIV": 1, "NEUTRAL":2}
    return df, id2label, label2id


#MULTICLASS LABELS MORAL
def label_multiclass6(df):
    df= df.replace({'label': {'care': 1, 'harm': 1,
                                'fairness': 2,'cheating': 2,
                                'loyalty': 3,'betrayal': 3,
                                'authority': 4,'subversion': 4,
                                 'purity': 5,'degradation': 5,'nonmoral': 0,'nomoral': 0
                                }})
    
    id2label = {0:"NONMORAL", 1:"CARE" ,1:"HARM",2:"FAIRNESS",2:"CHEATING",3:"LOYALTY",3:"BETRAYAL",4:"AUTHORITY",4:"SUBVERSION",5:"PURITY",5:"DEGRADATION"}
    label2id = {"NONMORAL":0, "CARE": 1,"HARM":1,"FAIRNESS":2,"CHEATING":2,"LOYALTY":3,"BETRAYAL":3,"AUTHORITY":4,"SUBVERSION":4,"PURITY":5,"DEGRADATION":5}

    return df, id2label,label2id 

def label_multiclass11(df):
    df= df.replace({'label': {'care': 1, 'harm': 2,
                                'fairness': 3,'cheating': 4,
                                'loyalty': 5,'betrayal': 6,
                                'authority': 7,'subversion': 8,
                                 'purity': 9,'degradation': 10,'nonmoral': 0,'no moral': 0,'nomoral': 0
                                }})
    
    id2label = {0:"NONMORAL", 1:"CARE" ,2:"HARM",3:"FAIRNESS",4:"CHEATING",5:"LOYALTY",6:"BETRAYAL",7:"AUTHORITY",8:"SUBVERSION",9:"PURITY",10:"DEGRADATION"}
    label2id = {"NONMORAL":0, "CARE": 1,"HARM":2,"FAIRNESS":3,"CHEATING":4,"LOYALTY":5,"BETRAYAL":6,"AUTHORITY":7,"SUBVERSION":8,"PURITY":9,"DEGRADATION":10}

    return df, id2label,label2id 
    
# METRICS
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #print(predictions)
    predictions = np.argmax(predictions, axis=1)
    #print(predictions)
    #print(labels)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall}


# TOKENIZER
def tokenize_function(examples):
    tokenized_inputs=tokenizer(examples["text"], truncation=True)
    return tokenized_inputs



In [None]:
#BERT MODEL (TEXT EMBEDDING + USER EMBEDDING)
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoConfig, BertPreTrainedModel, BertModel,AutoModel, AutoModelForSequenceClassification, BertForSequenceClassification
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
class BertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, num_extra_dims):
        super().__init__()
        total_dims = config.hidden_size + num_extra_dims 
        self.dense = nn.Linear(total_dims, total_dims)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(total_dims, config.num_labels) 


    def forward(self, features, **kwargs):
        x = self.dropout(features)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
class CustomBertSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_extra_dims):
        # Initialize the base model (BERT)
        super().__init__(config)
        self.config = config
        total_dims = config.hidden_size + num_extra_dims 
        self.bert = BertModel(config)
        
        #classifier
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        #self.classifier = nn.Linear(total_dims, config.num_labels)
        self.classifier = BertClassificationHead(config, num_extra_dims)

        self.config = config
        self.num_extra_dims = num_extra_dims
        self.post_init()


    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        
        # Get output from the BERT model (transformer)
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        sequence_output = outputs.pooler_output
        sequence_output = self.dropout(sequence_output)
     
        if extra_data is not None:
            output = torch.cat((sequence_output, extra_data), dim=-1)
        else:
            output = sequence_output

        logits = self.classifier(output)
        
        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  
            **kwargs
        )

In [None]:
#ROBERTA MODEL (TEXT EMBEDDING + USER EMBEDDING)
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoConfig, AutoModel,RobertaPreTrainedModel,AutoModelForSequenceClassification,RobertaModel,RobertaForSequenceClassification
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, num_extra_dims):
        super().__init__()
        total_dims = config.hidden_size + num_extra_dims  
        self.dense = nn.Linear(total_dims, total_dims)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(total_dims, config.num_labels) 


    def forward(self, features, **kwargs):
        x = self.dropout(features)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class CustomRobertaForSequenceClassification(RobertaPreTrainedModel):
    def __init__(self, config, num_extra_dims):
        super().__init__(config)
        self.config = config
        self.num_extra_dims = num_extra_dims 
        total_dims = config.hidden_size + num_extra_dims 
        
        # classifier 
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config, num_extra_dims)

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,  
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        cls_output = sequence_output[:, 0, :] 

        if extra_data is not None:
            cls_output = torch.cat((cls_output, extra_data), dim=-1)

        logits = self.classifier(cls_output)
            
        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  
            **kwargs
        )


In [None]:
#LLAMA MODEL (TEXT EMBEDDING + USER EMBEDDING)
from typing import Optional, Tuple, List, Union
from transformers import LlamaPreTrainedModel, LlamaModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutputWithPast

class CustomLlamaForSequenceClassification(LlamaPreTrainedModel):
    def __init__(self, config, num_extra_dims):
        super().__init__(config)
        self.config = config
        self.num_labels = 6
        self.num_extra_dims = num_extra_dims 
        total_dims = config.hidden_size + num_extra_dims 
        self.config.pad_token_id = self.config.eos_token_id  

        self.model = LlamaModel(config)
        self.score = nn.Linear(config.hidden_size + num_extra_dims, self.num_labels, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,  
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        hidden_states = transformer_outputs[0]
        if extra_data is not None:
            extra_data = extra_data.unsqueeze(1)  # Reshape to (batch_size, 1, num_extra_dims)
            extra_data = extra_data.expand(-1, hidden_states.shape[1], -1)  # Expand to (batch_size, sequence_length, num_extra_dims)
            hidden_states = torch.cat((hidden_states, extra_data), dim=-1)
            pooled_representation = hidden_states[:, -1, :]# Shape: (batch_size, sequence_length, hidden_size + num_extra_dims)
            logits = self.score(pooled_representation.to(self.score.weight.dtype))
            #print(f"Logits shape: {logits.shape}")

        else:
            logits = self.score(hidden_states)
            #print(f"Logits shape: {logits.shape}")

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]
        
        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        
        # Determine the last non-pad token position (for padding handling)
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
        #print(f"Last non-pad token indices: {last_non_pad_token}")
        
        if extra_data is None:
            pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
        else:
            pooled_logits = logits    
        #print(f"Pooled logits shape before checking: {pooled_logits.shape}")

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0)  
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,  
            **kwargs
        )


In [None]:
#DEEPSEEPK Model
from typing import Optional, Tuple, List, Union
from transformers import DeepseekV3PreTrainedModel, DeepseekV3Model, DeepseekV3Config
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)



class CustomDeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
    def __init__(self, config,num_extra_dims):
        super().__init__(config)
        self.config = config
        self.num_labels = 2
        self.num_extra_dims = num_extra_dims 
        total_dims = config.hidden_size + num_extra_dims
        self.config.pad_token_id = self.config.eos_token_id  
        
        self.model = DeepseekV3Model(config)
        self.score = nn.Linear(total_dims, self.num_labels, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,  
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        
        if extra_data is not None:
            extra_data = extra_data.unsqueeze(1)  
            extra_data = extra_data.expand(-1, hidden_states.shape[1], -1)  
            hidden_states = torch.cat((hidden_states, extra_data), dim=-1)
            pooled_representation = hidden_states[:, -1, :]
            logits = self.score(pooled_representation.to(self.score.weight.dtype))
            #print(f"Logits shape: {logits.shape}")

        else:
            logits = self.score(hidden_states)
            #print(f"Logits shape: {logits.shape}")

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (
                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                ).to(logits.device)
            else:
                sequence_lengths = -1
        if extra_data is None:
            pooled_logits = logits[
                torch.arange(batch_size, device=logits.device), sequence_lengths
            ]
        else:
            pooled_logits = logits    

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        *model_args,
        **kwargs
    ):
        num_extra_dims = kwargs.pop("num_extra_dims", 0)
        config = kwargs.pop("config", None)

        if config is None:
            config = DeepseekV3Config.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,
            **kwargs,
        )

In [None]:
#QWEN2 
from typing import Optional, Tuple, List, Union
#from transformers.models.deepseek_v3 import *
from transformers import Qwen2PreTrainedModel, Qwen2Model, Qwen2Config
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
from transformers.cache_utils import Cache
class CustomQwen2ForSequenceClassification(Qwen2PreTrainedModel):
    def __init__(self, config,num_extra_dims):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels
        self.num_extra_dims = num_extra_dims
        total_dims = config.hidden_size + num_extra_dims
        self.config.pad_token_id = self.config.eos_token_id  
        
        self.model = Qwen2Model(config)
        self.score = nn.Linear(total_dims,self.num_labels, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value


    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,  
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        transformer_outputs: BaseModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        hidden_states = transformer_outputs.last_hidden_state
        
        if extra_data is not None:
            extra_data = extra_data.unsqueeze(1)  
            extra_data = extra_data.expand(-1, hidden_states.shape[1], -1)  
            hidden_states = torch.cat((hidden_states, extra_data), dim=-1)
            pooled_representation = hidden_states[:, -1, :]
            logits = self.score(pooled_representation.to(self.score.weight.dtype))
            #print(f"Logits shape: {logits.shape}")

        else:
            logits = self.score(hidden_states)
            #print(f"Logits shape: {logits.shape}")

            
        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        if extra_data is None:
            pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
        else:
            pooled_logits = logits
            
        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

        
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Override `from_pretrained` to handle `num_extra_dims` when loading a pre-trained model.
        """
        num_extra_dims = kwargs.pop("num_extra_dims", 0) 
        config = kwargs.pop("config", None)

        if config is None:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        config.num_extra_dims = num_extra_dims  

        return super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            num_extra_dims=num_extra_dims,              **kwargs
        )


## **Fine Tuning (TEXTO + CS) Bert Roberta**

In [None]:
### CONTEXTO SOCIAL DATASET 1 (POZZI)

model_name='roberta-base'
extra_dims=32

tweets = pd.read_pickle("../data/OMC/final_omc_morality.pkl")
tweets['text']=tweets['text'].map(cleaner1)
tweets['extra_data']=tweets['deepwalk']
tweets['label']=tweets['moral_label2']


print(tweets.label.unique())
tweets.label.value_counts()

['nonmoral' 'authority' 'loyalty' 'purity' 'care' 'fairness']
['nonmoral' 'authority' 'loyalty' 'purity' 'care' 'fairness']


Unnamed: 0,tweet.id,user.id,dataset.author.name,label,text,moralbert,roberta_mmp,roberta_mm,liwc_mfd,persp_bert,moral_label,moral_label_polarity,svd,deepwalk,node2vec,tadw,moral_label2,extra_data
0,936469851,10323542,drgilpin,nonmoral,watching by myself #tweetdebate not drinking :...,nonmoral,nomoral,no moral,no moral,nonmoral,nonmoral,no moral,"[-0.9321853169454714, -0.014290048491863475, -...","[-0.43476266, 0.4637962, 1.0409873, -0.2093216...","[-0.27371866, -0.19295375, -0.37779337, 0.6663...","[-0.15936547082592128, 0.23132371423577064, -0...",nonmoral,"[-0.43476266, 0.4637962, 1.0409873, -0.2093216..."
1,936470432,11752272,starweaver,nonmoral,"yeah, slime was actually my second choice, can...",nonmoral,nomoral,no moral,sanctity.vice,loyalty,nonmoral,no moral,"[-2.3407145936409695, 0.29875694807026504, 1.1...","[-2.061748, 0.2598792, -0.2871443, -1.006268, ...","[0.077606544, 0.21957941, -0.21372578, 0.02413...","[-0.13507492718699363, 0.22315517862073722, -0...",nonmoral,"[-2.061748, 0.2598792, -0.2871443, -1.006268, ..."
2,936472030,716543,kyeung808,nonmoral,preparing to have a heart attack #tweetdebate,degradation,nomoral,no moral,care.vice,nonmoral,nonmoral,degradation,"[-0.4696246816045738, 0.011188949802830955, -0...","[-0.06279127, 0.038863122, 0.12867297, 0.29538...","[0.43654078, 0.22683652, 0.2081876, -0.0945255...","[-0.1729719604261917, 0.19232023246154403, -0....",nonmoral,"[-0.06279127, 0.038863122, 0.12867297, 0.29538..."
3,936472042,14759482,rebot,nonmoral,"no debate moderators under 50, sorry #tweetdebate",nonmoral,nomoral,no moral,no moral,nonmoral,nonmoral,no moral,"[-0.4748503994385164, 0.16479432293878057, -0....","[1.3948498, 0.08554719, -0.14505437, -1.072701...","[0.31162417, 0.17204364, 0.14859162, 0.2996835...","[-0.09415871666324986, 0.22696396302001293, -0...",nonmoral,"[1.3948498, 0.08554719, -0.14505437, -1.072701..."
4,936472907,6035262,Karoli,nonmoral,now staring at black screen on grrrrrrrrrrrrrr...,nonmoral,nomoral,no moral,no moral,fairness,nonmoral,no moral,"[-0.9144128643032162, 0.09508063320665167, -0....","[-0.35507488, -0.33618665, 0.93690664, 2.07505...","[-0.15241532, 0.32681182, 0.1377003, 0.2165719...","[-0.13134790549379227, 0.23772991422683537, -0...",nonmoral,"[-0.35507488, -0.33618665, 0.93690664, 2.07505..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1762,936727686,5752932,crysharris,nonmoral,you missed the bit where mccain talked over ob...,nonmoral,nomoral,no moral,fairness.vice,authority,nonmoral,no moral,"[-0.17597257154484572, -0.0780247591016859, 0....","[0.94359154, -2.4664073, -0.352156, 0.9408136,...","[-0.2640397, -0.0024188492, -0.2750953, 0.8507...","[-0.19121226784674555, 0.12374432476479204, -0...",nonmoral,"[0.94359154, -2.4664073, -0.352156, 0.9408136,..."
1763,936731119,812825,MandianaJones,nonmoral,has hashtagged the debate #debate08. same tag ...,nonmoral,nomoral,no moral,no moral,fairness,nonmoral,no moral,"[-2.225933705819569, 0.53964464069966, -0.7008...","[-0.3015819, -0.7410528, 0.46232414, 0.0280605...","[0.23174001, -0.27921534, 0.2237174, 0.6746251...","[-0.1838567057770515, 0.2297698874173019, -0.2...",nonmoral,"[-0.3015819, -0.7410528, 0.46232414, 0.0280605..."
1764,936732650,666913,Autumm,nonmoral,#debate08 fox says mccain won the debate. msnb...,nonmoral,nomoral,no moral,no moral,nonmoral,nonmoral,no moral,"[-1.030078339024446, 0.22364630412766384, -0.9...","[-0.013444448, -1.1800328, -0.1991519, 2.45609...","[0.20520344, 0.29482052, -0.14640543, -0.16537...","[-0.18213415458960597, 0.22508570453273133, -0...",nonmoral,"[-0.013444448, -1.1800328, -0.1991519, 2.45609..."
1765,936733111,7769402,Laurie2,nonmoral,"#debate08 did your favorite candidate ""beat"" t...",nonmoral,cheating,no moral,no moral,fairness,nonmoral,no moral,"[-0.10846916464087902, 0.02237691547738228, -0...","[0.86933273, 1.3993437, -1.373814, -0.7129098,...","[0.1895242, 0.26178938, -0.05714034, 0.1517531...","[-0.020512648254196127, 0.23753842475034861, -...",nonmoral,"[0.86933273, 1.3993437, -1.373814, -0.7129098,..."


In [None]:
#args
training_args = TrainingArguments(
    output_dir = '/model/',
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    eval_strategy = "epoch",
    push_to_hub=False,
    save_strategy='no',
    seed=42)


#data
#df, id2label, label2id = binary_labels(tweets)

#df, id2label, label2id = multi_labels(tweets)

df, id2label, label2id = label_multiclass6(tweets)

#label_5_rows = df[df['label'] == 5]
#index_data = list(label_5_rows.index[0:2])
#selected_rows = df.loc[index_data]
#df = df.drop(index_data)
#selected_rows
df.label.unique()

array([0, 4, 3, 5, 1, 2])

In [9]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
#test_df = pd.concat([test_df, selected_rows]).reset_index(drop=True)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
datasets = {'train': Dataset.from_pandas(train_df), 'val': Dataset.from_pandas(val_df), 'test': Dataset.from_pandas(test_df)}
datasets = DatasetDict(datasets)
test_df.label.value_counts()

label
0    299
4     22
1     18
2      9
3      5
5      1
Name: count, dtype: int64

In [10]:
model_name="roberta-base"
model_name

'roberta-base'

In [11]:
# Load configuration
if model_name == "bert-base-uncased":

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', truncation=True)
    tokenized_datasets = datasets.map(tokenize_function, batched=True)

    config = AutoConfig.from_pretrained('bert-base-uncased', 
        num_labels=2, 
        id2label=id2label, 
        label2id=label2id
    )
    
    new_model = CustomBertSequenceClassification.from_pretrained(
        'bert-base-uncased',  
        config=config,
        num_extra_dims=extra_dims)


elif model_name == "roberta-base":
    
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", truncation=True)
    tokenized_datasets = datasets.map(tokenize_function, batched=True)

    config = AutoConfig.from_pretrained(
        "FacebookAI/roberta-base",
        num_labels=6,
        id2label=id2label,
        label2id=label2id
    )

    new_model = CustomRobertaForSequenceClassification.from_pretrained(
        "FacebookAI/roberta-base",
        config=config,
        num_extra_dims=extra_dims
    )
trainer = Trainer(
    model=new_model,           
    args=training_args,              
    train_dataset=tokenized_datasets['train'],  
    eval_dataset=tokenized_datasets['val'],     
    tokenizer=tokenizer,             
    compute_metrics=compute_metrics  
)

trainer.train()

#pred
predictions = trainer.predict(tokenized_datasets['test'])
predicted_class_ids = predictions.predictions.argmax(axis=1)
actual_labels = tokenized_datasets['test']['label']
results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True)

results


Map:   0%|          | 0/1130 [00:00<?, ? examples/s]

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

Map:   0%|          | 0/354 [00:00<?, ? examples/s]

Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.56715,0.865724,0.154672,0.144287,0.166667
2,No log,0.529777,0.865724,0.154672,0.144287,0.166667
3,No log,0.539804,0.840989,0.275146,0.278696,0.277675
4,No log,0.586189,0.844523,0.248262,0.335481,0.234941
5,No log,0.604537,0.85159,0.251147,0.294452,0.236302
6,No log,0.669093,0.844523,0.326771,0.448663,0.327451
7,No log,0.702249,0.826855,0.360269,0.404391,0.365036
8,0.446800,0.717164,0.823322,0.285883,0.305792,0.279955
9,0.446800,0.736695,0.837456,0.383174,0.376711,0.392911
10,0.446800,0.716722,0.844523,0.385614,0.379123,0.394272


In [None]:
#save results
import json
results_file = '../data/MIND/F1_results/bert-base-sentiment2'
experiment = "baseline"
dataset_name= 'pozzi'

if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        data = json.load(f)
else:
    data = {}

new_experiment = {
    'experiment': experiment,
    'dataset': dataset_name,
    'results': results
}

if 'experiments' not in data:
    data['experiments'] = []

data['experiments'].append(new_experiment)

with open(results_file, 'w') as f:
    json.dump(data, f, indent=2)

print(f"Training complete. Results for experiment {experiment} added to {results_file}")

Training complete. Results for experiment baseline added to ../data/MIND/F1_results/bert-base-sentiment2


In [None]:
#save results

results_file = '../data/OMC/F1_results/roberta-base-moral-polarity'
experiment='tadw'

with open(results_file, "a") as f:
    f.write(f"\nExperimento: {experiment}\n")
    f.write(f"\nDataset OMC: 1\n")
    f.write(json.dumps(results, indent=2))

print("Training complete. Results saved in", results_file)

results

## **Llama Fine Tuning (TEXTO + CS)**

In [8]:
import peft
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    TrainerCallback, TrainerState, TrainerControl, 
    AutoModelForSequenceClassification,
    LlamaForSequenceClassification,
    EarlyStoppingCallback
)
import torch
from peft import LoraConfig, TaskType, PeftModel
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

device = torch.device("cuda")
assert torch.cuda.is_available()

In [9]:
#CONTEXTO SOCIAL DATASET1 (POZZI)
model_name = "meta-llama/Llama-3.2-1B"
extra_dims=80

tweets = pd.read_pickle("../data/OMC/final_omc_morality.pkl")
tweets['text']=tweets['text'].map(cleaner1)
tweets['extra_data']=tweets['tadw']
tweets['label']=tweets['moral_label2']


print(tweets.label.unique())
tweets.label.value_counts()

['nonmoral' 'authority' 'loyalty' 'purity' 'care' 'fairness']


label
nonmoral     1499
authority      97
care           69
fairness       52
loyalty        39
purity         11
Name: count, dtype: int64

In [None]:
#args
truncation = True
max_length = 2000

training_args = TrainingArguments(
    output_dir = "out/",
    learning_rate=2e-4,
    num_train_epochs=10,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    weight_decay=0.01,
    eval_strategy = "epoch",
    push_to_hub=False,
    save_strategy='epoch',
    #save_safetensors=True,
    #load_best_model_at_end = True,
    #report_to="none",
)

#data
#df, id2label, label2id = binary_labels(tweets)
df, id2label, label2id= label_multiclass6(tweets)

#df, id2label, label2id = multi_labels(tweets)
#label_5_rows = df[df['label'] == 5]
#index_data = list(label_5_rows.index[0:2])
#selected_rows = df.loc[index_data]
#df = df.drop(index_data)
#selected_rows
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
#test_df = pd.concat([test_df, selected_rows]).reset_index(drop=True)

datasets = {'train': Dataset.from_pandas(train_df), 'val': Dataset.from_pandas(val_df), 'test': Dataset.from_pandas(test_df)}
datasets = DatasetDict(datasets)
tokenizer = AutoTokenizer.from_pretrained(model_name)
#selected_rows
test_df.label.value_counts()

label
0    299
4     22
1     18
2      9
3      5
5      1
Name: count, dtype: int64

In [11]:
# Model Name
model_name = "meta-llama/Llama-3.2-1B"
# Quantization Config
quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.float16)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="SEQ_CLS",
)


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = "[PAD]"

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=6,
    id2label=id2label,
    label2id=label2id
)

new_model = CustomLlamaForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    num_extra_dims=extra_dims,  
)
new_model.config.pad_token_id = new_model.config.eos_token_id


Some weights of CustomLlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
#train
tokenizer.pad_token_id = new_model.config.pad_token_id
tokenized_datasets = datasets.map(tokenize_function, batched=True)
new_model.add_adapter(peft_config, adapter_name="adapter_1")

trainer = Trainer(
    model=new_model,           
    args=training_args,              
    train_dataset=tokenized_datasets['train'],  
    eval_dataset=tokenized_datasets['val'],     
    tokenizer=tokenizer,             
    compute_metrics=compute_metrics,
)

trainer.train()


Map:   0%|          | 0/1130 [00:00<?, ? examples/s]

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

Map:   0%|          | 0/354 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.632324,0.865724,0.154672,0.144287,0.166667
2,No log,0.621094,0.865724,0.154672,0.144287,0.166667
3,No log,0.589355,0.865724,0.154672,0.144287,0.166667
4,0.680400,0.551758,0.865724,0.182404,0.311388,0.181138
5,0.680400,0.618164,0.855124,0.179242,0.227818,0.179097
6,0.680400,0.576172,0.816254,0.250366,0.306909,0.235181
7,0.680400,0.60791,0.862191,0.202153,0.256719,0.194929
8,0.349800,0.627441,0.855124,0.202006,0.270985,0.193568
9,0.349800,0.657715,0.848057,0.199481,0.243056,0.192208
10,0.349800,0.683594,0.85159,0.219587,0.280754,0.207359


In [None]:
#pred
predictions = trainer.predict(tokenized_datasets['test'])
predicted_class_ids = predictions.predictions.argmax(axis=1)
actual_labels = tokenized_datasets['test']['label']
results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True)

results

In [None]:
#save results
import json
results_file = '../data/MIND/F1_results/llama-3.2-1b-morality'
experiment = "baseline"
dataset_name= 'hcr'

if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        data = json.load(f)
else:
    data = {}

new_experiment = {
    'experiment': experiment,
    'dataset': dataset_name,
    'results': results
}

if 'experiments' not in data:
    data['experiments'] = []

# Añadimos el nuevo experimento
data['experiments'].append(new_experiment)

with open(results_file, 'w') as f:
    json.dump(data, f, indent=2)

print(f"Training complete. Results for experiment {experiment} added to {results_file}")

In [None]:
#save results

import json
results_file = '../data/HCR/F1_results/llama-3.2-1b-moral'
experiment= "baseline"

with open(results_file, "a") as f:
    f.write(f"\nExperimento: {experiment}\n")
    f.write(f"\nDataset HCR: \n")
    f.write(json.dumps(results, indent=2))

print("Training complete. Results saved in", results_file)

results

## DeepSeek Fine Tunning


In [None]:
import peft
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    TrainerCallback, TrainerState, TrainerControl, 
    AutoModelForSequenceClassification,
    LlamaForSequenceClassification,
    EarlyStoppingCallback
)
import torch
from peft import LoraConfig, TaskType, PeftModel
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

device = torch.device("cuda")
assert torch.cuda.is_available()

In [3]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
print(config.model_type)  

qwen2


In [None]:
#CONTEXTO SOCIAL DATASET1 (POZZI)
model_name= "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
extra_dims=0
tweets = pd.read_pickle("../data/MIND/final_omc_morality.pkl")
tweets['text']=tweets['text'].map(cleaner1)
tweets['extra_data']=tweets['tadw']
tweets['label']=tweets['moral_label']


print(tweets.label.unique())
tweets.label.value_counts()

In [None]:
#args
truncation = True
max_length = 2000

training_args = TrainingArguments(
    output_dir = "out/",
    learning_rate=2e-4,
    num_train_epochs=10,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    weight_decay=0.01,
    eval_strategy = "epoch",
    push_to_hub=False,
    save_strategy='epoch',
    #save_safetensors=True,
    #load_best_model_at_end = True,
    #report_to="none",
)

#data
#df, id2label, label2id = binary_labels(tweets)

df, id2label, label2id= label_multiclass6(tweets)

#df, id2label, label2id = multi_labels(tweets)
#label_5_rows = df[df['label'] == 5]
#index_data = list(label_5_rows.index[0:2])
#selected_rows = df.loc[index_data]
#df = df.drop(index_data)
#selected_rows
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
#test_df = pd.concat([test_df, selected_rows]).reset_index(drop=True)

datasets = {'train': Dataset.from_pandas(train_df), 'val': Dataset.from_pandas(val_df), 'test': Dataset.from_pandas(test_df)}
datasets = DatasetDict(datasets)
tokenizer = AutoTokenizer.from_pretrained(model_name)
#selected_rows
test_df.label.value_counts()

In [None]:
# Model Name
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Quantization Config
quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.float16)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="SEQ_CLS",
)


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = "[PAD]"

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=6,
    id2label=id2label,
    label2id=label2id
)

new_model = CustomQwen2ForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    num_extra_dims=extra_dims,  
)
new_model.config.pad_token_id = new_model.config.eos_token_id


In [None]:
#train
tokenizer.pad_token_id = new_model.config.pad_token_id
tokenized_datasets = datasets.map(tokenize_function, batched=True)
new_model.add_adapter(peft_config, adapter_name="adapter_1")

trainer = Trainer(
    model=new_model,           
    args=training_args,              
    train_dataset=tokenized_datasets['train'],  
    eval_dataset=tokenized_datasets['val'],     
    tokenizer=tokenizer,             
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
#pred
predictions = trainer.predict(tokenized_datasets['test'])
predicted_class_ids = predictions.predictions.argmax(axis=1)
actual_labels = tokenized_datasets['test']['label']
results = classification_report(actual_labels, predicted_class_ids, digits=5, output_dict=True)

results

In [None]:
#save results
import json
results_file = '../data/OMC/F1_results/DeepSeek-Qwen-morality'
experiment= "tadw"

with open(results_file, "a") as f:
    f.write(f"\nExperimento: {experiment}\n")
    f.write(f"\nDataset OMC: \n")
    f.write(json.dumps(results, indent=2))

print("Training complete. Results saved in", results_file)
