# Fine Tuning Transformer for Text Classification

In [6]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import StratifiedKFold
import json
from transformers import AutoModel, AutoTokenizer


In [7]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Data

In [12]:
df = pd.read_csv('data/train/dataset_en_train_completed.csv')
df=df.iloc[:4000,:]
df.category.value_counts()

category
CRITICAL      2621
CONSPIRACY    1379
Name: count, dtype: int64

# Balance Data

In [None]:
df= pd.read_csv('data/train/dataset_en_train_completed.csv')
df= df.iloc[:4000]

df_augmented= pd.read_csv('data/train/dataset_en_train_completed.csv')
df_augmented= df_augmented.iloc[4000:]
#df_augmented

In [None]:
original_counts = df['category'].value_counts()
df_conspiracy = df[df['category'] == 'CONSPIRACY']
df_conspiracy_augmented = df_augmented[df_augmented['category'] == 'CONSPIRACY']
critical_count = original_counts.get('CRITICAL', 0) 
conspiracy_count = original_counts.get('CONSPIRACY', 0)


In [None]:
df_conspiracy = df_augmented[df_augmented['category'] == 'CONSPIRACY']

# Seleccionar 1242 filas aleatorias para balancear el dataset 
df_conspiracy_sampled = df_conspiracy.sample(n=1242, random_state=42)
df_combined = pd.concat([df, df_conspiracy_sampled])

df_combined.reset_index(drop=True, inplace=True)
df=df_combined.copy()

# Context

In [None]:
def add_context(df):
    # Añade información contextual usando corchetes
    df['text'] =   df['text'] + '. The text reflects the emotion: ' + df['max_emotion'] + ' and the moral value: ' + df['max_moral']
    return df


df=add_context(df)
df.loc[0].text

'how the cia is directly involved in every aspect of the creation of the vaccine passports . groups created by the cia like palantir , mitre , oracle , and google , funded through the cias venture capital firm , in q tel , are are involved . every one of them are listed on the organizational member lists of the private companies in charge of the creation of all worldwide passports . full article . dailyveracity . com the shadowy cia data firms behind the creation of digital vaccine passport ids full video . bitchute . com video ufysjysoe donna. The text reflects the emotion: inspiration and the moral value loyalty'

# Transform label from categoric to numeric

Critical = 1
Conspirancy = 0

In [None]:
df['class'] = df['category'].apply(lambda x: 1 if x == 'CRITICAL' else 0)

new_df = df[['text', 'class']].copy()
new_df

Unnamed: 0,text,class
0,how the cia is directly involved in every aspe...,0
1,elon musk admits i felt like i was dying after...,1
2,the uk gov. quietly published data confirming ...,1
3,"the global economic terror regime, which is lo...",0
4,confirmed pharma fags crashed into a covid vac...,1
...,...,...
5237,coming soon doctors for covid ethics fourth sy...,1
5238,the vaccine mandates as i said last year have ...,1
5239,Minority report in real life a group of social...,0
5240,This is the reason why the Cabal does not want...,0


# Preparing the Dataset and Dataloader



In [15]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5", trust_remote_code=True)

In [16]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.targets = self.data['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        # Tokenize the text
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        ids = inputs['input_ids'].squeeze()
        mask = inputs['attention_mask'].squeeze()


        return {
            'ids': ids,
            'mask': mask,
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Creating the Neural Network for Fine Tuning


In [21]:
pip install xformers

Collecting xformers
  Using cached xformers-0.0.28.post3.tar.gz (7.8 MB)
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\paula\\AppData\\Local\\Temp\\pip-install-oh4xa3sc\\xformers_fba3dc0ef01d47d5ab6988f6ff516857\\third_party\\flash-attention\\csrc\\composable_kernel\\client_example\\24_grouped_conv_activation\\grouped_convnd_bwd_data_bilinear\\grouped_conv_bwd_data_bilinear_residual_fp16.cpp'



In [20]:
model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5", trust_remote_code=True)
model.to(device)

AssertionError: please install xformers

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


In [None]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Fine Tuning and Validating the Model (Cross Validation)

In [None]:
def train(epoch, model, train_loader):
    model.train()  
    for _, data in enumerate(train_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()  # Limpiar los gradientes previos
        outputs = model(ids, mask)  # Propagación hacia adelante
        loss = loss_fn(outputs, targets) 

        if _ % 5000 == 0: 
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()  # Propagación hacia atrás
        optimizer.step()  # Actualizar los pesos del modelo

def validation(model, data_loader, device):
    model.eval()  
    fin_targets = []  # etiquetas verdaderas
    fin_outputs = []  # probabilidades de las predicciones

    with torch.no_grad():  
        for data in data_loader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            targets = data['targets'].to(device)

            outputs = model(ids, mask) 
            outputs = torch.sigmoid(outputs)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets
    


def evaluate_metrics(outputs, targets):
    # Convertir las salidas a 0 o 1 (clases predichas) basadas en el umbral 0.5
    outputs = [1 if x > 0.5 else 0 for x in outputs]

    # Calcular las métricas
    accuracy = accuracy_score(targets, outputs)
    precision, recall, f1, _ = precision_recall_fscore_support(targets, outputs, average='binary')
    mcc = matthews_corrcoef(targets, outputs)  # Calcular MCC
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'mcc': mcc
    }
    # Imprimir los resultados
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"MCC: {mcc:.4f}")


def cross_validate_model(model, dataframe, tokenizer,title, epochs=3, batch_size=16, k_folds=5):
    #kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    #skfold es mejor opción cuando se tiene desbalanceo de datos
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    metrics_list = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(dataframe, dataframe['class'])):
        print(f"\nFold {fold + 1}/{k_folds}")
        train_df = dataframe.iloc[train_idx].reset_index(drop=True)
        val_df = dataframe.iloc[val_idx].reset_index(drop=True)

        train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
        val_set = CustomDataset(val_df, tokenizer, MAX_LEN)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

        for epoch in range(epochs):
            train(epoch, model, train_loader)  

        outputs, targets = validation(model, val_loader, device)
        fold_metrics=evaluate_metrics(outputs, targets)
        metrics_list.append(fold_metrics)

    metrics_df = pd.DataFrame(metrics_list)

    # Guardar el DataFrame en un archivo CSV
    metrics_df.to_csv(f'metrics{title}.csv', index=False)

    print('Cross-validation complete')

In [None]:
# cambiar nombre de archivo que se guarda
cross_validate_model(model, new_df, tokenizer, 'moral_emo_512_8_2e5_rl_skfold_augmented',epochs=3, batch_size=TRAIN_BATCH_SIZE, k_folds=5)



Fold 1/5
Epoch: 0, Loss: 0.7348228096961975
Epoch: 1, Loss: 0.4871571660041809
Epoch: 2, Loss: 0.018146932125091553

Fold 2/5
Epoch: 0, Loss: 0.007466912269592285
Epoch: 1, Loss: 0.008322915062308311
Epoch: 2, Loss: 0.0579494945704937

Fold 3/5
Epoch: 0, Loss: 0.003274182789027691
Epoch: 1, Loss: 0.0032785036601126194
Epoch: 2, Loss: 0.0026664207689464092

Fold 4/5
Epoch: 0, Loss: 0.008489318192005157
Epoch: 1, Loss: 0.004815469030290842
Epoch: 2, Loss: 0.0007440482149831951

Fold 5/5
Epoch: 0, Loss: 0.00466996431350708
Epoch: 1, Loss: 0.004115194547921419
Epoch: 2, Loss: 0.001830673310905695
Cross-validation complete


# Saving the Trained Model

In [None]:
MODEL_PATH = "roberta_large_finetuned_model_en.pth"

torch.save(model.state_dict(), MODEL_PATH)


# Load the Saving Model

In [None]:
# Define the paths to save the model and tokenizer
'''
MODEL_PATH = "./model/roberta_finetuned_model.pth"

loaded_model = RoBERTaClass()
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.to(device)
loaded_model.eval()'''

# Test

In [None]:
test_df = pd.read_csv("data/test/dataset_en_test_cleaned.csv")
test_df['class'] = test_df['category'].apply(lambda x: 1 if x == 'CRITICAL' else 0)
test_df = test_df[['text', 'class']].copy()
test_df

Unnamed: 0,text,class
0,elon now confirming what we ve been suspecting...,0
1,keeping the pressure on the police to uphold t...,0
2,safe effective the greatest lie ever told . th...,1
3,cdc report admits . million people in the usa ...,1
4,how to use health to acquire totalitarian cont...,0
...,...,...
995,john d. rockefeller wiped out natural cures to...,0
996,fact check biden white house falsely accuses d...,1
997,w onset acral hand lesions following mrna vacc...,1
998,we will fire unvaccinated workers cohen hadad ...,1


In [None]:
model.eval()
def predict(text, model, tokenizer):
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(ids, mask)
        probabilities = torch.sigmoid(outputs).cpu().numpy()
        prediction = 1 if probabilities[0] >= 0.5 else 0

    return prediction, probabilities[0]


def test_and_evaluate(model, tokenizer, test_df, filename="test_results.json"):
    predictions = []
    probabilities = []

    for index, row in test_df.iterrows():
        text = row['text']
        prediction, probability = predict(text, model, tokenizer)  
        predictions.append(prediction)
        probabilities.append(probability)

    test_df['predictions'] = predictions
    test_df['probabilities'] = probabilities

    mcc = matthews_corrcoef(test_df['class'], test_df['predictions'])
    results = classification_report(
        test_df['class'],
        test_df['predictions'],
        target_names=['CONSPIRANCY', 'CRITICAL'], 
        digits=5,
        output_dict=True
    )

    output_data = {
        "mcc": mcc,
        "classification_report": results
    }

    with open(filename, "w") as f:
        json.dump(output_data, f, indent=4)

    print(f"Test results saved to {filename}")
    print(f"Test MCC = {mcc}")

# Example usage:
test_and_evaluate(model, tokenizer, test_df, filename="test_evaluation_results_512_2e5_moral_emotions_robertalarge_skfold_augmented.json")

Test results saved to test_evaluation_results_512_2e5_moral_emotions_robertalarge_skfold_augmented.json
Test MCC = 0.8244584282457748
