In [1]:
import json
import torch
import transformers
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, classification_report

  warn(f"Failed to load image Python extension: {e}")


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Datos

In [3]:
df = pd.read_csv('data/train/dataset_en_train_completed.csv')


# Datos Balanceados

In [4]:
df= pd.read_csv('data/train/dataset_en_train_completed.csv')
df= df.iloc[:4000]
print(df.category.value_counts())

df_augmented= pd.read_csv('data/train/dataset_en_train_completed.csv')
df_augmented= df_augmented.iloc[4000:]


category
CRITICAL      2621
CONSPIRACY    1379
Name: count, dtype: int64


In [5]:
original_counts = df['category'].value_counts()
df_conspiracy = df[df['category'] == 'CONSPIRACY']
df_conspiracy_augmented = df_augmented[df_augmented['category'] == 'CONSPIRACY']
critical_count = original_counts.get('CRITICAL', 0)  
conspiracy_count = original_counts.get('CONSPIRACY', 0)


In [6]:
df_conspiracy = df_augmented[df_augmented['category'] == 'CONSPIRACY']

# Seleccionar 1242 filas aleatorias para balancear el dataset 
df_conspiracy_sampled = df_conspiracy.sample(n=1242, random_state=42)
df_combined = pd.concat([df, df_conspiracy_sampled])

df_combined.reset_index(drop=True, inplace=True)
df=df_combined.copy()


# Add Context

In [7]:
def add_context(df):
    # Añade información contextual usando corchetes
    df['text'] =   df['text'] + '. The text reflects the emotion: ' + df['max_emotion'] + ' and the moral value: ' + df['max_moral'] 
    return df

df=add_context(df)
df.loc[0].text

'this is massive australian senator malcolm roberts exposes nanotech found in the covid vaccines and says they are genocide . he is the first politician to expose this share . me lauraabolichannel. The text reflects the emotion: inspiration and the moral value: no moral'

# Transformar etiqueta de categórica a numérica 

Critical = 1
Conspirancy = 0

In [8]:
df['class'] = df['category'].apply(lambda x: 1 if x == 'CRITICAL' else 0)

new_df = df[['text', 'class']].copy()
new_df

Unnamed: 0,text,class
0,this is massive australian senator malcolm rob...,0
1,i m deeply concerned that the push to vaccinat...,1
2,they wanted to know your vaccination status an...,1
3,anthony fauci once again defended brutal chine...,1
4,proof has emerged showing that death from wuha...,1
...,...,...
5237,. zickute . com video ubxufaftemjq uk police c...,0
5238,get a claim in for medical battery. nbcnews. c...,0
5239,We warn against ... vaccine mandates health pa...,0
5240,Vaccination center in leaks Crime against huma...,0


# Parámetros

In [9]:
MAX_LEN = 512 
TRAIN_BATCH_SIZE = 32 
VALID_BATCH_SIZE = 32
EPOCHS = 3 
LEARNING_RATE = 2e-5 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.targets = self.data['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        ids = inputs['input_ids'].squeeze()
        mask = inputs['attention_mask'].squeeze()
        token_type_ids = inputs["token_type_ids"].squeeze()


        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Modelo

In [11]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        #1 es porque en este caso es una  tarea binaria
        self.l3 = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2).squeeze(1)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [12]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [13]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Fine Tuning (Cross Validation)

In [14]:
def train(epoch, model, train_loader):
    model.train()  # Poner el modelo en modo de entrenamiento
    for _, data in enumerate(train_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()  # Limpiar los gradientes previos
        outputs = model(ids, mask, token_type_ids)  
        loss = loss_fn(outputs, targets)  # pérdida

        if _ % 5000 == 0:  
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()  
        optimizer.step()  

def validation(model, data_loader, device):
    model.eval()  
    fin_targets = []  # etiqueta verdadera
    fin_outputs = []  # predicciones

    with torch.no_grad():  
        for data in data_loader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)

            # Get model outputs
            outputs = model(ids, mask, token_type_ids)  
            # funcion sigmoide para la salida como probabilidades
            outputs = torch.sigmoid(outputs)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets


def evaluate_metrics(outputs, targets):
    # Convertir las salidas a 0 o 1 (clases predichas) basadas en el umbral 0.5
    outputs = [1 if x > 0.5 else 0 for x in outputs]

    # Calcular las métricas
    accuracy = accuracy_score(targets, outputs)
    precision, recall, f1, _ = precision_recall_fscore_support(targets, outputs, average='binary')
    mcc = matthews_corrcoef(targets, outputs)  # Calcular MCC

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'mcc': mcc
    }
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"MCC: {mcc:.4f}")


def cross_validate_model(model, dataframe, tokenizer,title, epochs=3, batch_size=16, k_folds=5):
    #kfold
    #kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
    metrics_list = [] 
    for fold, (train_idx, val_idx) in enumerate(kf.split(dataframe, dataframe['class'])):
        print(f"\nFold {fold + 1}/{k_folds}")
        train_df = dataframe.iloc[train_idx].reset_index(drop=True)
        val_df = dataframe.iloc[val_idx].reset_index(drop=True)

        # Dividir datos en entrenamiento y validación
        train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
        val_set = CustomDataset(val_df, tokenizer, MAX_LEN)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

        # Se entrena para cada fold
        for epoch in range(epochs):
            train(epoch, model, train_loader)  

        # Se valida
        outputs, targets = validation(model, val_loader, device)
        
        fold_metrics=evaluate_metrics(outputs, targets) 
        metrics_list.append(fold_metrics)
        
    metrics_df = pd.DataFrame(metrics_list)

    # Se guardan las métricas del entreno
    metrics_df.to_csv(f'metrics{title}.csv', index=False)

    print('Cross-validation complete')

In [15]:
# Cambiar el nombre del archivo que se guarda
cross_validate_model(model, new_df, tokenizer, 'moral_emotions_512_32_2e5_bbu_skfold_dataaugmentation', epochs=3, batch_size=TRAIN_BATCH_SIZE, k_folds=5)



Fold 1/5
Epoch: 0, Loss: 0.6655012369155884
Epoch: 1, Loss: 0.201736718416214
Epoch: 2, Loss: 0.10958785563707352

Fold 2/5
Epoch: 0, Loss: 0.2270304411649704
Epoch: 1, Loss: 0.023486953228712082
Epoch: 2, Loss: 0.004546192474663258

Fold 3/5
Epoch: 0, Loss: 0.004241167567670345
Epoch: 1, Loss: 0.16472357511520386
Epoch: 2, Loss: 0.001608029706403613

Fold 4/5
Epoch: 0, Loss: 0.001138518680818379

Fold 5/5
Epoch: 0, Loss: 0.0008994214003905654
Epoch: 1, Loss: 0.005568553693592548
Epoch: 2, Loss: 0.0004384811036288738
Cross-validation complete


# Guardar Modelo (opcional)

In [None]:
#MODEL_PATH = "/content/drive/MyDrive/MASTER/NLP/model/bert_finetuned_model_en.pth"

#torch.save(model.state_dict(), MODEL_PATH)

# Test

In [16]:
test_df = pd.read_csv("data/test/dataset_en_test_cleaned.csv")
test_df['class'] = test_df['category'].apply(lambda x: 1 if x == 'CRITICAL' else 0)
test_df = test_df[['text', 'class']].copy()
test_df

Unnamed: 0,text,class
0,elon now confirming what we ve been suspecting...,0
1,keeping the pressure on the police to uphold t...,0
2,safe effective the greatest lie ever told . th...,1
3,cdc report admits . million people in the usa ...,1
4,how to use health to acquire totalitarian cont...,0
...,...,...
995,john d. rockefeller wiped out natural cures to...,0
996,fact check biden white house falsely accuses d...,1
997,w onset acral hand lesions following mrna vacc...,1
998,we will fire unvaccinated workers cohen hadad ...,1


In [17]:
model.eval()

#usar modelo
def predict(text, model, tokenizer):
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)

    with torch.no_grad():
        outputs = model(ids, mask, token_type_ids) 
        probabilities = torch.sigmoid(outputs).cpu().numpy()
        prediction = 1 if probabilities[0] >= 0.5 else 0

    return prediction, probabilities[0]


    
def test_and_evaluate(model, tokenizer, test_df, filename="test_results.json"):
    predictions = []
    probabilities = []
    for index, row in test_df.iterrows():
        text = row['text']
        prediction, probability = predict(text, model, tokenizer) 
        predictions.append(prediction)
        probabilities.append(probability)

    test_df['predictions'] = predictions
    test_df['probabilities'] = probabilities

    mcc = matthews_corrcoef(test_df['class'], test_df['predictions'])
    
    results = classification_report(
        test_df['class'], 
        test_df['predictions'], 
        target_names=['CONSPIRANCY', 'CRITICAL'], 
        digits=5, 
        output_dict=True
    )



    # Guardar
    output_data = {
        "mcc": mcc,
        "classification_report": results
    }

    with open(filename, "w") as f:
        json.dump(output_data, f, indent=4)

    print(f"Test results saved to {filename}")
    print(f"Test MCC = {mcc}")

test_and_evaluate(model, tokenizer, test_df, filename="test_evaluation_results_512_2e5_emotions_morals_dataaugmentation2.json")

Test results saved to test_evaluation_results_512_2e5_emotions_morals_dataaugmentation2.json
Test MCC = 0.7912231864845695
