In [None]:
import ast
import json
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    matthews_corrcoef,
    precision_recall_fscore_support,
)
from sklearn.model_selection import KFold, StratifiedKFold
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertConfig,
    BertModel,
    BertTokenizer,
    RobertaModel,
    RobertaTokenizer,
)

Para la tarea de clasificación en inglés, nos enfocamos en estrategias de enriquecimiento de datos, aprovechando las capacidades de autoatención de los modelos transformers, utilizando exclusivamente los encoders de arquitecturas como BERT y RoBERTa.

Preprocesamiento de Datos:
Se limpiaron los conjuntos de entrenamiento y prueba eliminando elementos como URLs para facilitar la interpretación correcta de los textos.
Este preprocesamiento se aplicó tanto a datos en inglés como en español, asegurando consistencia en ambos conjuntos.
Manejo del Desequilibrio de Clases:
Se identificó un desequilibrio significativo en los datos, con una clara subrepresentación de la clase CONSPIRACY.
Para abordar este problema, se implementó una estrategia de oversampling, pero con un enfoque diferenciador: los textos originales fueron traducidos a otro idioma utilizando modelos preentrenados de Hugging Face y luego traducidos de vuelta al idioma original. Esta técnica generó datos similares pero no idénticos, mitigando riesgos de overfitting.
Adición de Información Contextual:
Dada la subjetividad de la tarea, se exploraron estrategias para enriquecer los datos con información adicional. Inicialmente, se consideró utilizar modelos preentrenados para extraer características textuales como emociones, pero esta opción se descartó debido a su dependencia del sesgo del modelo seleccionado.
Optamos por incorporar léxicos en inglés (emocionales y morales) que relacionan palabras clave en los textos con emociones y valores morales. La información extraída de estos léxicos se incluyó como un prompt añadido al final del texto original, indicando qué emoción y valor moral reflejaba el contenido.
Experimentos y Ajustes:
Se realizaron múltiples experimentos para identificar el modelo preentrenado más efectivo (comparando entre BERT base uncased y RoBERTa), evaluar el impacto del aumento de datos y ajustar hiperparámetros clave.
También se exploraron configuraciones avanzadas, como el aumento del tamaño de batch, para mejorar los resultados del modelo.
Resultados y Métricas:
Los resultados fueron evaluados utilizando métricas como F1-score macro y micro, y el coeficiente MCC, según lo solicitado en la competencia original.
El mejor desempeño se obtuvo con el modelo RoBERTa, en combinación con el aumento de datos mediante traducciones y la adición del contexto emocional y moral extraído de los léxicos. Este enfoque superó el baseline de la competencia PAN.
Experimentación Adicional:
Como parte de la exploración de técnicas avanzadas, modificamos la estructura base de RoBERTa añadiendo capas de atención múltiple y convoluciones, siguiendo un enfoque basado en el artículo de Cheruku, R. et al. (2023), «Sentiment classification with modified RoBERTa and recurrent neural networks», Multimedia Tools and Applications, SpringerLink. Disponible en: https://link.springer.com/article/10.1007/s11042-023-16833-5 (consultado el 22 de noviembre de 2024). Aunque no se lograron mejoras significativas, este experimento permitió aprender cómo alterar arquitecturas para tareas con alta subjetividad, como clasificar emociones y añadir metadatos textuales.
También se realizó un experimento combinando las representaciones [CLS] de BERT y RoBERTa. Los embeddings de ambas arquitecturas (768 dimensiones cada uno) fueron concatenados para formar un vector de 1536 dimensiones, que se introdujo en una capa totalmente conectada para la clasificación binaria.

# Datos

In [None]:
df = pd.read_csv('data/train/dataset_en_train_completed.csv')
# Number of rows per label. The original line read `df-value_counts()`, which
# subtracts an undefined name from the frame and raises NameError.
df['category'].value_counts()

# Datos Aumentados

In [None]:
# Read the training file once and split it by position: the first 4000 rows
# become `df`, every later row becomes `df_augmented`.
# NOTE(review): the 4000 boundary is taken from the original cell; confirm it
# matches the layout of the CSV.
N_ORIGINAL_ROWS = 4000

full_train_df = pd.read_csv('data/train/dataset_en_train_completed.csv')
df = full_train_df.iloc[:N_ORIGINAL_ROWS]
df_augmented = full_train_df.iloc[N_ORIGINAL_ROWS:]
df_augmented

In [None]:
# Rows per label in `df`.
original_counts = df['category'].value_counts()

# CONSPIRACY rows of each split.
df_conspiracy = df.loc[df['category'].eq('CONSPIRACY')]
df_conspiracy_augmented = df_augmented.loc[df_augmented['category'].eq('CONSPIRACY')]

# .get(..., 0) falls back to 0 when a label does not occur at all.
conspiracy_count = original_counts.get('CONSPIRACY', 0)
critical_count = original_counts.get('CRITICAL', 0)

In [None]:
# Candidate rows for oversampling: CONSPIRACY rows of the augmented split.
df_conspiracy = df_augmented.loc[df_augmented['category'].eq('CONSPIRACY')]

# Pick 1242 random rows (fixed seed) to balance the dataset.
df_conspiracy_sampled = df_conspiracy.sample(n=1242, random_state=42)

# Append them to `df` and renumber the rows from 0.
df_combined = pd.concat([df, df_conspiracy_sampled]).reset_index(drop=True)
df = df_combined.copy()

# Añadir información del texto (emoción y moral)

In [None]:
def add_context(df):
    """Append the emotion / moral-value sentence to every row's text.

    The ``text`` column of ``df`` is overwritten in place with the original
    text followed by a sentence built from the ``max_emotion`` and
    ``max_moral`` columns. The same frame is returned.
    """
    context_sentence = (
        '. The text reflects the emotion: '
        + df['max_emotion']
        + ' and the moral value: '
        + df['max_moral']
    )
    df['text'] = df['text'] + context_sentence
    return df

# Enrich every text, then display the first one as a quick visual check.
df = add_context(df)
df.loc[0, 'text']

# Parámetros

In [None]:
# Hyper-parameters and tokenizer, chosen per experiment.
#
# The original cell assigned the three configurations one after another, so
# only the last one ever took effect and all three tokenizers were loaded.
# Set EXPERIMENT to the configuration you want to run; the default reproduces
# the values the original cell ended up with.
EXPERIMENT = 'roberta_modified'  # one of: 'bert', 'roberta', 'roberta_modified'

# key -> (max_len, train_batch, valid_batch, epochs, learning_rate, tokenizer factory)
# The factory is a lambda so that only the selected tokenizer is loaded.
_EXPERIMENT_SETTINGS = {
    'bert': (
        512, 32, 32, 3, 2e-5,
        lambda: BertTokenizer.from_pretrained('bert-base-uncased'),
    ),
    'roberta': (
        512, 8, 8, 3, 2e-5,
        lambda: RobertaTokenizer.from_pretrained('roberta-large'),
    ),
    'roberta_modified': (
        512, 16, 16, 3, 2e-5,
        lambda: AutoTokenizer.from_pretrained('roberta-base'),
    ),
}

(
    MAX_LEN,
    TRAIN_BATCH_SIZE,
    VALID_BATCH_SIZE,
    EPOCHS,
    LEARNING_RATE,
    _load_tokenizer,
) = _EXPERIMENT_SETTINGS[EXPERIMENT]
tokenizer = _load_tokenizer()

# NOTE(review): kept from the original cell because the optimizer cell below
# needs a `model` to exist; every experiment section later rebinds `model`
# to its own network.
model = AutoModelForSequenceClassification.from_pretrained('roberta-base')

In [None]:
# Turn the categorical labels into numbers:
# CRITICAL -> 1, anything else (CONSPIRACY) -> 0.
df['class'] = [1 if label == 'CRITICAL' else 0 for label in df['category']]

# Keep only the two columns the datasets below read.
new_df = df.loc[:, ['text', 'class']].copy()
new_df

In [None]:
def loss_fn(outputs, targets):
    """Return the binary cross-entropy (mean over elements) of raw logits.

    ``outputs`` are un-normalised logits; the sigmoid is applied inside the
    loss. ``targets`` must have the same shape as ``outputs``.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)
# Adam over the parameters of whichever `model` is bound when this cell runs.
# NOTE(review): rebinding `model` afterwards does not update this optimizer.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# DataLoaders y Modelos Para los Diferentes Experimentos

# Experimentos Bert

## Modelo

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one row of a text/label frame per item.

    Args:
        dataframe: frame with a ``text`` column and a numeric ``class`` column.
        tokenizer: callable tokenizer (Hugging Face style) returning
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors.
        max_len: every sequence is padded / truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.targets = self.data['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional access: a DataLoader asks for 0..len-1, which label-based
        # lookup only satisfies when the frame has a fresh RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        # Tokenize the text.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis added by return_tensors='pt'.
        ids = inputs['input_ids'].squeeze(0)
        mask = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs["token_type_ids"].squeeze(0)

        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

class BERTClass(torch.nn.Module):
    """``bert-base-uncased`` encoder followed by dropout and a one-logit head."""

    def __init__(self):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        # A single output unit because the task is binary.
        self.l3 = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw logit per sample, shape ``(batch,)``."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used here (the pooled output per the transformers docs).
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2).squeeze(1)
        return output


# `device` was used here before any cell had defined it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClass().to(device)

# Rebuild the optimizer for this network. The one created earlier holds the
# parameters of the previously bound `model`, so stepping it would never
# change the weights trained below.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

## Fine Tuning

In [None]:
def train(epoch, model, train_loader):
    """Run one pass over ``train_loader`` and update ``model``.

    Relies on the module-level ``device``, ``optimizer`` and ``loss_fn``.
    The loss of every 5000th batch, starting with the first, is printed.
    """
    model.train()  # training mode
    for step, batch in enumerate(train_loader):
        input_ids = batch['ids'].to(device, dtype=torch.long)
        attention_mask = batch['mask'].to(device, dtype=torch.long)
        segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        labels = batch['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()  # clear gradients of the previous step
        logits = model(input_ids, attention_mask, segment_ids)
        loss = loss_fn(logits, labels)

        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()
        optimizer.step()

def validation(model, data_loader, device):
    """Run ``model`` over ``data_loader`` without tracking gradients.

    Returns:
        ``(probabilities, targets)``: two flat lists holding, for every
        sample in loader order, the sigmoid of the model output and the
        true label.
    """
    model.eval()
    fin_targets = []  # true labels
    fin_outputs = []  # predicted probabilities

    with torch.no_grad():
        for data in data_loader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)

            outputs = model(ids, mask, token_type_ids)
            # Sigmoid turns the logits into probabilities.
            outputs = torch.sigmoid(outputs)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets


# The `def` line of this function was missing, which left its body as dead
# code after validation()'s return and `evaluate_metrics` undefined.
def evaluate_metrics(outputs, targets):
    """Threshold probabilities at 0.5, then print and return the fold metrics.

    Args:
        outputs: predicted probabilities.
        targets: true 0/1 labels.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and ``mcc``.
    """
    # A probability strictly above 0.5 counts as class 1.
    outputs = [1 if x > 0.5 else 0 for x in outputs]

    accuracy = accuracy_score(targets, outputs)
    precision, recall, f1, _ = precision_recall_fscore_support(targets, outputs, average='binary')
    mcc = matthews_corrcoef(targets, outputs)

    # Report before returning; these prints used to sit after the return.
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"MCC: {mcc:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'mcc': mcc
    }

def cross_validate_model(model, dataframe, tokenizer, title, epochs=3, batch_size=16, k_folds=5,
                         reset_each_fold=True):
    """Run shuffled K-fold cross-validation and save one metrics row per fold.

    Args:
        model: network to train; it is modified in place.
        dataframe: frame with ``text`` and ``class`` columns.
        tokenizer: tokenizer handed to ``CustomDataset``.
        title: suffix of the output file ``metrics{title}.csv``.
        epochs: training epochs per fold.
        batch_size: batch size of both loaders.
        k_folds: number of folds.
        reset_each_fold: when True (default) every fold starts from the
            weights ``model`` had on entry and from a fresh Adam optimizer,
            so a fold is never validated on rows the network was trained on
            in an earlier fold. Pass False to train one network continuously
            across folds, as the original code did.

    Note:
        With ``reset_each_fold=True`` the model left behind after the call
        holds the weights trained on the last fold only.
    """
    # train() reads the module-level optimizer, so it is rebound per fold.
    global optimizer

    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    initial_state = None
    if reset_each_fold:
        # CPU snapshot of the starting weights.
        initial_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    metrics_list = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(dataframe)):
        print(f"\nFold {fold + 1}/{k_folds}")

        if reset_each_fold:
            model.load_state_dict(initial_state)
            optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

        # Split into this fold's training and validation rows.
        train_df = dataframe.iloc[train_idx].reset_index(drop=True)
        val_df = dataframe.iloc[val_idx].reset_index(drop=True)

        train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
        val_set = CustomDataset(val_df, tokenizer, MAX_LEN)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

        for epoch in range(epochs):
            train(epoch, model, train_loader)

        outputs, targets = validation(model, val_loader, device)

        fold_metrics = evaluate_metrics(outputs, targets)
        metrics_list.append(fold_metrics)

    metrics_df = pd.DataFrame(metrics_list)

    # Persist the per-fold metrics.
    metrics_df.to_csv(f'metrics{title}.csv', index=False)

    print('Cross-validation complete')

# Change the title to rename the metrics file that gets written.
cross_validate_model(model, new_df, tokenizer, 'moral_emotions_512_32_2e5_bbu', epochs=3, batch_size=TRAIN_BATCH_SIZE, k_folds=5)


## Test

In [None]:
# Load the test split and encode its labels like the training labels
# (CRITICAL -> 1, anything else -> 0); keep only text and label.
test_df = pd.read_csv("data/test/dataset_en_test_cleaned.csv")
test_df['class'] = [1 if label == 'CRITICAL' else 0 for label in test_df['category']]
test_df = test_df.loc[:, ['text', 'class']].copy()
test_df

# Inference mode for the trained network.
model.eval()

#usar modelo y tokenizer
def predict(text, model, tokenizer):
    """Classify a single text.

    Returns:
        ``(prediction, probability)`` where ``probability`` is the sigmoid of
        the model output and ``prediction`` is 1 when it is at least 0.5,
        otherwise 0.
    """
    encoded = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    segment_ids = encoded['token_type_ids'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask, segment_ids)
        probabilities = torch.sigmoid(logits).cpu().numpy()

    score = probabilities[0]
    if score >= 0.5:
        prediction = 1
    else:
        prediction = 0

    return prediction, score

def test_and_evaluate(model, tokenizer, test_df, filename="test_results.json"):
    """Predict every row of ``test_df`` and write the test metrics as JSON.

    Adds ``predictions`` and ``probabilities`` columns to ``test_df`` in
    place, then writes the Matthews correlation coefficient and the
    scikit-learn classification report to ``filename``.

    Args:
        model: trained network passed through to ``predict``.
        tokenizer: tokenizer passed through to ``predict``.
        test_df: frame with ``text`` and ``class`` columns.
        filename: path of the JSON file to write.
    """
    predictions = []
    probabilities = []
    for text in test_df['text']:
        prediction, probability = predict(text, model, tokenizer)
        predictions.append(prediction)
        probabilities.append(probability)

    test_df['predictions'] = predictions
    test_df['probabilities'] = probabilities
    mcc = matthews_corrcoef(test_df['class'], test_df['predictions'])
    results = classification_report(
        test_df['class'],
        test_df['predictions'],
        # Class 0 / class 1. The first name was misspelled 'CONSPIRANCY',
        # which did not match the dataset label.
        target_names=['CONSPIRACY', 'CRITICAL'],
        digits=5,
        output_dict=True)

    # Save
    output_data = {
        "mcc": mcc,
        "classification_report": results}

    with open(filename, "w") as f:
        json.dump(output_data, f, indent=4)

    print(f"Test results saved to {filename}")
    print(f"Test MCC = {mcc}")

# Change the file name used for the metrics.
test_and_evaluate(model, tokenizer, test_df, filename="test_evaluation_results_512_2e5_emotions_morals.json")

# Experimentos Roberta

## Modelo

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one row of a text/label frame per item.

    Same as the BERT dataset but without ``token_type_ids``.

    Args:
        dataframe: frame with a ``text`` column and a numeric ``class`` column.
        tokenizer: callable tokenizer (Hugging Face style) returning
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: every sequence is padded / truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.targets = self.data['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional access: a DataLoader asks for 0..len-1, which label-based
        # lookup only satisfies when the frame has a fresh RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')

        # Drop only the leading batch axis added by return_tensors='pt'.
        ids = inputs['input_ids'].squeeze(0)
        mask = inputs['attention_mask'].squeeze(0)

        return {
            'ids': ids,
            'mask': mask,
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

class RoBERTaClass(torch.nn.Module):
    """``roberta-large`` encoder followed by dropout and a one-logit head."""

    def __init__(self):
        super().__init__()
        self.l1 = transformers.RobertaModel.from_pretrained('roberta-large')
        self.l2 = torch.nn.Dropout(0.3)
        # A single output unit because the task is binary;
        # 1024 input features for roberta-large.
        self.l3 = torch.nn.Linear(1024, 1)

    def forward(self, ids, mask):
        """Return one raw logit per sample, shape ``(batch,)``."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used here (the pooled output per the transformers docs).
        _, output_1 = self.l1(ids, attention_mask=mask, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2).squeeze(1)
        return output


# Defined here so this section does not depend on a later cell for `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RoBERTaClass().to(device)

# Rebuild the optimizer for this network. An optimizer created for a
# previously bound `model` would never update these weights.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

## Fine Tuning

In [None]:
def train(epoch, model, train_loader):
    """Run one pass over ``train_loader`` and update ``model``.

    Relies on the module-level ``device``, ``optimizer`` and ``loss_fn``.
    The loss of every 5000th batch, starting with the first, is printed.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        input_ids = batch['ids'].to(device, dtype=torch.long)
        attention_mask = batch['mask'].to(device, dtype=torch.long)
        labels = batch['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()  # clear gradients of the previous step
        logits = model(input_ids, attention_mask)  # forward pass
        loss = loss_fn(logits, labels)

        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()  # backward pass
        optimizer.step()  # update the weights

def validation(model, data_loader, device):
    """Run ``model`` over ``data_loader`` without tracking gradients.

    Returns:
        ``(probabilities, targets)``: two flat lists holding, for every
        sample in loader order, the sigmoid of the model output and the
        true label.
    """
    model.eval()
    true_labels = []
    probabilities = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['ids'].to(device)
            attention_mask = batch['mask'].to(device)
            labels = batch['targets'].to(device)

            batch_probs = torch.sigmoid(model(input_ids, attention_mask))

            true_labels += labels.cpu().detach().numpy().tolist()
            probabilities += batch_probs.cpu().detach().numpy().tolist()

    return probabilities, true_labels
    
def evaluate_metrics(outputs, targets):
    """Threshold probabilities at 0.5, then print and return the fold metrics.

    Args:
        outputs: predicted probabilities.
        targets: true 0/1 labels.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and ``mcc``.
    """
    # A probability strictly above 0.5 counts as class 1.
    outputs = [1 if x > 0.5 else 0 for x in outputs]

    accuracy = accuracy_score(targets, outputs)
    precision, recall, f1, _ = precision_recall_fscore_support(targets, outputs, average='binary')
    mcc = matthews_corrcoef(targets, outputs)

    # Report before returning; these prints used to sit after the return
    # statement and therefore never ran.
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"MCC: {mcc:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'mcc': mcc
    }


def cross_validate_model(model, dataframe, tokenizer, title, epochs=3, batch_size=16, k_folds=5,
                         reset_each_fold=True):
    """Run stratified K-fold cross-validation and save one metrics row per fold.

    Args:
        model: network to train; it is modified in place.
        dataframe: frame with ``text`` and ``class`` columns.
        tokenizer: tokenizer handed to ``CustomDataset``.
        title: suffix of the output file ``metrics{title}.csv``.
        epochs: training epochs per fold.
        batch_size: batch size of both loaders.
        k_folds: number of folds.
        reset_each_fold: when True (default) every fold starts from the
            weights ``model`` had on entry and from a fresh Adam optimizer,
            so a fold is never validated on rows the network was trained on
            in an earlier fold. Pass False to train one network continuously
            across folds, as the original code did.

    Note:
        With ``reset_each_fold=True`` the model left behind after the call
        holds the weights trained on the last fold only.
    """
    # train() reads the module-level optimizer, so it is rebound per fold.
    global optimizer

    # Stratified folds keep the class ratio in every split, which is the
    # better option when the classes are imbalanced.
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    initial_state = None
    if reset_each_fold:
        # CPU snapshot of the starting weights.
        initial_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    metrics_list = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(dataframe, dataframe['class'])):
        print(f"\nFold {fold + 1}/{k_folds}")

        if reset_each_fold:
            model.load_state_dict(initial_state)
            optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

        train_df = dataframe.iloc[train_idx].reset_index(drop=True)
        val_df = dataframe.iloc[val_idx].reset_index(drop=True)

        train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
        val_set = CustomDataset(val_df, tokenizer, MAX_LEN)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

        for epoch in range(epochs):
            train(epoch, model, train_loader)

        outputs, targets = validation(model, val_loader, device)
        fold_metrics = evaluate_metrics(outputs, targets)
        metrics_list.append(fold_metrics)

    metrics_df = pd.DataFrame(metrics_list)

    metrics_df.to_csv(f'metrics{title}.csv', index=False)

    print('Cross-validation complete')

# Change the title to rename the metrics file that gets written.
cross_validate_model(model, new_df, tokenizer, 'moral_emo_512_8_2e5_rl_skfold_augmented',epochs=3, batch_size=TRAIN_BATCH_SIZE, k_folds=5)

## Test

In [None]:
# Load the test split and encode its labels like the training labels
# (CRITICAL -> 1, anything else -> 0); keep only text and label.
test_df = pd.read_csv("data/test/dataset_en_test_cleaned.csv")
test_df['class'] = [1 if label == 'CRITICAL' else 0 for label in test_df['category']]
test_df = test_df.loc[:, ['text', 'class']].copy()
test_df

# Inference mode for the trained network.
model.eval()
def predict(text, model, tokenizer):
    """Classify a single text.

    Returns:
        ``(prediction, probability)`` where ``probability`` is the sigmoid of
        the model output and ``prediction`` is 1 when it is at least 0.5,
        otherwise 0.
    """
    encoded = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probabilities = torch.sigmoid(logits).cpu().numpy()

    score = probabilities[0]
    if score >= 0.5:
        prediction = 1
    else:
        prediction = 0

    return prediction, score

def test_and_evaluate(model, tokenizer, test_df, filename="test_results.json"):
    """Predict every row of ``test_df`` and write the test metrics as JSON.

    Adds ``predictions`` and ``probabilities`` columns to ``test_df`` in
    place, then writes the Matthews correlation coefficient and the
    scikit-learn classification report to ``filename``.

    Args:
        model: trained network passed through to ``predict``.
        tokenizer: tokenizer passed through to ``predict``.
        test_df: frame with ``text`` and ``class`` columns.
        filename: path of the JSON file to write.
    """
    predictions = []
    probabilities = []

    for text in test_df['text']:
        prediction, probability = predict(text, model, tokenizer)
        predictions.append(prediction)
        probabilities.append(probability)

    test_df['predictions'] = predictions
    test_df['probabilities'] = probabilities

    mcc = matthews_corrcoef(test_df['class'], test_df['predictions'])
    results = classification_report(
        test_df['class'],
        test_df['predictions'],
        # Class 0 / class 1. The first name was misspelled 'CONSPIRANCY',
        # which did not match the dataset label.
        target_names=['CONSPIRACY', 'CRITICAL'],
        digits=5,
        output_dict=True
    )

    output_data = {
        "mcc": mcc,
        "classification_report": results
    }

    with open(filename, "w") as f:
        json.dump(output_data, f, indent=4)

    print(f"Test results saved to {filename}")
    print(f"Test MCC = {mcc}")

test_and_evaluate(model, tokenizer, test_df, filename="test_evaluation_results_512_2e5_moral_emotions_robertalarge_skfold_augmented.json")


# Experimentos Roberta Estructura Modificada

## Modelo

In [None]:
# CustomDataset is the same one used in the RoBERTa section.

class RoBERTaClass(torch.nn.Module):
    """``roberta-base`` encoder with an extra multi-head attention layer.

    The token representations of the encoder go through one
    ``nn.MultiheadAttention`` layer; the vector at position 0 is then passed
    through dropout and a one-logit linear head.
    """

    def __init__(self):
        super().__init__()
        # Pre-trained roberta-base encoder.
        self.l1 = transformers.RobertaModel.from_pretrained('roberta-base')

        # Additional multi-head self-attention on top of the encoder.
        self.multihead_attention = nn.MultiheadAttention(embed_dim=768, num_heads=8, dropout=0.3, batch_first=True)

        # Dropout for regularisation.
        self.l2 = nn.Dropout(0.3)

        # Linear layer for binary classification.
        self.l3 = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one raw logit per sample, shape ``(batch,)``."""
        # Step 1: encode the tokens.
        output = self.l1(ids, attention_mask=mask, return_dict=True)
        hidden_states = output.last_hidden_state  # (batch_size, seq_len, hidden_dim)

        # Step 2: self-attention over the token states. key_padding_mask is
        # True where a key must be ignored, hence the inverted attention mask.
        attn_output, _ = self.multihead_attention(hidden_states, hidden_states, hidden_states, key_padding_mask=~mask.bool())

        # Step 3: take the vector at position 0 (the first, [CLS]-style token).
        cls_embedding = attn_output[:, 0, :]  # (batch_size, hidden_dim)

        # Step 4: dropout.
        output_2 = self.l2(cls_embedding)

        # Step 5: classification layer.
        output = self.l3(output_2).squeeze(1)

        return output

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RoBERTaClass().to(device)

# Rebuild the optimizer for this network. An optimizer created for a
# previously bound `model` would never update these weights.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Experimento uniendo Bert y Roberta

## Modelo

In [None]:

# Load the encoders and their tokenizers.
tokenizer_roberta = transformers.RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = transformers.RobertaModel.from_pretrained('roberta-base')

tokenizer_bert = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = transformers.BertModel.from_pretrained('bert-base-uncased')

# Both encoders are only used as fixed feature extractors (the optimizer of
# this experiment is built from the classifier's parameters alone).
model_roberta.eval()
model_bert.eval()


def process_text_with_metadatas(text, emotion, moral, max_length=128):
    """Return the concatenated first-token vectors of RoBERTa and BERT.

    Args:
        text: raw input text.
        emotion: emotion label appended to the text.
        moral: moral-value label appended to the text.
        max_length: token limit used for truncation by both tokenizers.

    Returns:
        Tensor of shape ``(1, roberta_hidden + bert_hidden)``.
    """
    # Step 1: build the full text including the metadata sentence.
    # NOTE(review): this sentence is Spanish while add_context() uses an
    # English one, and process_text_test() adds none; confirm this is intended.
    full_text = f"{text} Este texto refleja la emoción {emotion} y la moral {moral}"

    # The encoders are never trained here, so building an autograd graph for
    # every sample only costs memory and backward time.
    with torch.no_grad():
        # Step 2: encode the full text with RoBERTa.
        inputs_roberta = tokenizer_roberta(full_text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        outputs_roberta = model_roberta(**inputs_roberta)
        cls_roberta = outputs_roberta.last_hidden_state[:, 0, :]  # first-token vector of RoBERTa

        # Step 3: encode the same full text with BERT.
        inputs_bert = tokenizer_bert(full_text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        outputs_bert = model_bert(**inputs_bert)
        cls_bert = outputs_bert.last_hidden_state[:, 0, :]  # [CLS] vector of BERT

    # Step 4: concatenate along the feature axis.
    combined_cls = torch.cat((cls_roberta, cls_bert), dim=1)
    return combined_cls

In [None]:
# Settings for the combined BERT + RoBERTa experiment.
LEARNING_RATE = 2e-5
EPOCHS = 3
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32
MAX_LEN = 512
tokenizer = tokenizer_roberta

In [None]:
class CustomDataset(Dataset):
    """Dataset yielding the concatenated encoder vector and label of a row.

    Args:
        dataframe: frame with ``text``, ``class``, ``max_emotion`` and
            ``max_moral`` columns.
        tokenizer: stored but not used; tokenization happens inside
            ``process_text_with_metadatas``.
        max_len: stored but not used by this dataset.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.targets = self.data['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional access: a DataLoader asks for 0..len-1, which label-based
        # lookup only satisfies when the frame has a fresh RangeIndex.
        row = self.data.iloc[index]
        text = str(row['text'])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        # Encode the text together with its emotion and moral context and
        # get the concatenated first-token vector.
        inputs = process_text_with_metadatas(text, row['max_emotion'], row['max_moral'])

        return {
            'cls': inputs.squeeze(),
            'targets': torch.tensor(row['class'], dtype=torch.float)}

# Clasificador
class ClassifierModel(torch.nn.Module):
    """One dense layer mapping ``input_dim`` features to ``output_dim`` logits."""

    def __init__(self, input_dim, output_dim=1):
        super().__init__()
        # The only trainable layer of this classifier.
        self.fc = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        logits = self.fc(x)
        return logits

# Classifier over the concatenated RoBERTa + BERT vectors (768 features each).
# A stray '¡' in front of this assignment made the cell a SyntaxError.
model = ClassifierModel(input_dim=768 * 2)

# Loss function and optimizer.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


## Train

In [None]:
def train(epoch, model, train_loader):
    """Run one pass over ``train_loader`` and update ``model``.

    Relies on the module-level ``optimizer`` and ``loss_fn``. The loss of
    every 5000th batch, starting with the first, is printed.
    """
    model.train()
    for step, data in enumerate(train_loader):
        cls = data['cls']
        targets = data['targets']
        optimizer.zero_grad()
        outputs = model(cls)
        # Remove only the trailing logit axis. A bare squeeze() also removes
        # the batch axis of a 1-sample batch, after which the loss rejects
        # the shape mismatch with `targets`.
        loss = loss_fn(outputs.squeeze(-1), targets)

        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()
        optimizer.step()

def validation(model, data_loader):
    """Run ``model`` over ``data_loader`` without tracking gradients.

    Returns:
        ``(probabilities, targets)``: two flat lists holding, for every
        sample in loader order, the sigmoid of the model output and the
        true label.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for data in data_loader:
            cls = data['cls']
            targets = data['targets']

            outputs = model(cls)
            # Remove only the trailing logit axis. A bare squeeze() turns a
            # 1-sample batch into a 0-d tensor, whose tolist() is a float
            # that list.extend() cannot consume.
            outputs = torch.sigmoid(outputs).squeeze(-1)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets


def evaluate_metrics(outputs, targets):
    """Threshold probabilities at 0.5 and return the classification metrics.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and ``mcc``.
    """
    # A probability strictly above 0.5 counts as class 1.
    predicted = [int(score > 0.5) for score in outputs]

    accuracy = accuracy_score(targets, predicted)
    precision, recall, f1, _support = precision_recall_fscore_support(
        targets, predicted, average='binary'
    )
    mcc = matthews_corrcoef(targets, predicted)

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1, mcc=mcc)


def cross_validate_model(model, dataframe, tokenizer, title, epochs=3, batch_size=16, k_folds=5,
                         reset_each_fold=True):
    """Run stratified K-fold cross-validation and save one metrics row per fold.

    Args:
        model: classifier to train; it is modified in place.
        dataframe: frame with ``text``, ``class``, ``max_emotion`` and
            ``max_moral`` columns (the dataset reads all four).
        tokenizer: handed to ``CustomDataset``.
        title: suffix of the output file ``metrics_{title}.csv``.
        epochs: training epochs per fold.
        batch_size: batch size of both loaders.
        k_folds: number of folds.
        reset_each_fold: when True (default) every fold starts from the
            weights ``model`` had on entry and from a fresh Adam optimizer,
            so a fold is never validated on rows the network was trained on
            in an earlier fold. Pass False to train one network continuously
            across folds, as the original code did.

    Note:
        With ``reset_each_fold=True`` the model left behind after the call
        holds the weights trained on the last fold only.
    """
    # train() reads the module-level optimizer, so it is rebound per fold.
    global optimizer

    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    initial_state = None
    if reset_each_fold:
        # CPU snapshot of the starting weights.
        initial_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    metrics_list = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(dataframe, dataframe['class'])):
        print(f"\nFold {fold + 1}/{k_folds}")

        if reset_each_fold:
            model.load_state_dict(initial_state)
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

        train_df = dataframe.iloc[train_idx].reset_index(drop=True)
        val_df = dataframe.iloc[val_idx].reset_index(drop=True)

        train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
        val_set = CustomDataset(val_df, tokenizer, MAX_LEN)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

        for epoch in range(epochs):
            train(epoch, model, train_loader)  # training

        outputs, targets = validation(model, val_loader)
        fold_metrics = evaluate_metrics(outputs, targets)
        metrics_list.append(fold_metrics)

    metrics_df = pd.DataFrame(metrics_list)
    metrics_df.to_csv(f'metrics_{title}.csv', index=False)

    print('Cross-validation complete')

# `new_df` only has text/class, but this experiment's dataset also reads the
# emotion and moral columns, so pass a frame that carries them.
# NOTE(review): df['text'] already ends with the sentence added by
# add_context(); confirm the second metadata sentence is wanted.
combined_df = df[['text', 'class', 'max_emotion', 'max_moral']].copy()
cross_validate_model(model, combined_df, tokenizer, "roberta_bert_combined", epochs=EPOCHS, batch_size=TRAIN_BATCH_SIZE)

## Test


In [None]:
# Load the test split and encode its labels like the training labels
# (CRITICAL -> 1, anything else -> 0); keep only text and label.
test_df = pd.read_csv("data/test/dataset_en_test_cleaned.csv")
test_df['class'] = [1 if label == 'CRITICAL' else 0 for label in test_df['category']]
test_df = test_df.loc[:, ['text', 'class']].copy()
test_df

def process_text_test(text, max_length=128):
    """Return the concatenated first-token vectors of RoBERTa and BERT.

    Unlike ``process_text_with_metadatas`` no metadata sentence is appended.

    Args:
        text: raw input text.
        max_length: token limit used for truncation by both tokenizers.

    Returns:
        Tensor of shape ``(1, roberta_hidden + bert_hidden)``.
    """
    # Pure inference: no autograd graph is needed for the encoders.
    with torch.no_grad():
        # Encode the text with RoBERTa.
        inputs_roberta = tokenizer_roberta(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        outputs_roberta = model_roberta(**inputs_roberta)
        cls_roberta = outputs_roberta.last_hidden_state[:, 0, :]  # first-token vector of RoBERTa

        # Encode the same text with BERT.
        inputs_bert = tokenizer_bert(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        outputs_bert = model_bert(**inputs_bert)
        cls_bert = outputs_bert.last_hidden_state[:, 0, :]  # [CLS] vector of BERT

    # Concatenate along the feature axis.
    combined_cls = torch.cat((cls_roberta, cls_bert), dim=1)
    return combined_cls
    
def predict(text, model, tokenizer):
    """Classify a single text with the combined-encoder classifier.

    Args:
        text: raw input text.
        model: classifier applied to the concatenated encoder vector.
        tokenizer: unused; kept so the signature matches the other
            experiments.

    Returns:
        ``(prediction, probability)`` where ``probability`` is the sigmoid of
        the model output and ``prediction`` is 1 when it is at least 0.5,
        otherwise 0.
    """
    # Concatenated first-token vector of RoBERTa and BERT, batch axis removed.
    inputs = process_text_test(text)
    inputs = inputs.squeeze()

    with torch.no_grad():
        outputs = model(inputs)
        outputs = torch.sigmoid(outputs).squeeze()

        probability = outputs.item()
        prediction = 1 if probability >= 0.5 else 0

    # The original fell off the end and returned None, which broke the
    # tuple unpacking in test_and_evaluate().
    return prediction, probability

def test_and_evaluate(model, tokenizer, test_df, filename="test_results.json"):
    """Predict every row of ``test_df`` and write the test metrics as JSON.

    Adds ``predictions`` and ``probabilities`` columns to ``test_df`` in
    place, then writes the Matthews correlation coefficient and the
    scikit-learn classification report to ``filename``.
    """
    # One (prediction, probability) pair per text, in row order.
    scored = [predict(text, model, tokenizer) for text in test_df['text']]
    predictions = [prediction for prediction, probability in scored]
    probabilities = [probability for prediction, probability in scored]

    test_df['predictions'] = predictions
    test_df['probabilities'] = probabilities
    mcc = matthews_corrcoef(test_df['class'], test_df['predictions'])

    results = classification_report(
        test_df['class'],
        test_df['predictions'],
        target_names=['CONSPIRACY', 'CRITICAL'],
        digits=5,
        output_dict=True
    )

    # Save the report together with the MCC.
    output_data = {"mcc": mcc, "classification_report": results}
    with open(filename, "w") as f:
        json.dump(output_data, f, indent=4)

    print(f"Test results saved to {filename}")
    print(f"Test MCC = {mcc}")

# Reload the test split and encode its labels (CRITICAL -> 1, otherwise 0).
test_df = pd.read_csv("data/test/dataset_en_test_cleaned.csv")
test_df['class'] = [1 if label == 'CRITICAL' else 0 for label in test_df['category']]
test_df = test_df.loc[:, ['text', 'class']].copy()

test_and_evaluate(model, tokenizer_roberta, test_df, filename="test_evaluation_results_bert_roberta_combined.json")