In [None]:
! pip install -U git+https://github.com/huggingface/transformers.git
! pip install -U git+https://github.com/huggingface/accelerate.git
!pip install datasets

In [18]:
# -- Libraries
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from datasets import Dataset
from tqdm import tqdm
import pandas   as pd
import numpy    as np
import torch.nn as nn
import itertools
import torch
import glob
import ast

# Read data

In [3]:
# -- Load train data
files_list  = glob.glob('/kaggle/input/mentalris-original-data/train_data/task2_train/*')
subjects_id = [[file.split('/')[-1].replace('.json', '')] * pd.read_json(file).shape[0] for file in files_list]
subjects_id = list(itertools.chain(*subjects_id))

train_df    = pd.concat([pd.read_json(file) for file in files_list],
                         axis=0)
train_df['Subject'] = subjects_id
train_df_labels     = pd.read_table('/kaggle/input/mentalris-original-data/train_data/task2_gold_a.txt', sep=',')
train_df = train_df.merge(train_df_labels, on='Subject')

# -- Load trial data
files_list  = glob.glob('/kaggle/input/mentalris-original-data/trial_data/task2_trial/*')
subjects_id = [[file.split('/')[-1].replace('.json', '')] * pd.read_json(file).shape[0] for file in files_list]
subjects_id = list(itertools.chain(*subjects_id))

trial_df            = pd.concat([pd.read_json(file) for file in files_list],
                                 axis=0)
trial_df['Subject'] = subjects_id
trial_df_labels     = pd.read_table('/kaggle/input/mentalris-original-data/trial_data/task2_gold_a.txt', sep=',')
trial_df            = trial_df.merge(trial_df_labels, on='Subject')

train_df = pd.concat([trial_df, train_df], axis=0)
train_df = train_df[['Subject', 'message', 'label']]

# -- Load test data
test_df = pd.read_excel('/kaggle/input/test-data-excel-formatted/test_data.xlsx')
test_df_labels = pd.read_table('/kaggle/input/test-gold-labels/task2_gold_a.csv', sep=',')
test_df        = test_df.merge(test_df_labels, left_on='nick', right_on='Subject')
test_df = test_df[['nick', 'message', 'label']]


train_df.rename(columns={'Subject': 'Subject_ID', 'message': 'Text', 'label': 'type'}, inplace=True)
test_df.rename(columns={'nick': 'Subject_ID', 'message': 'Text', 'label': 'type'}, inplace=True)

In [4]:
train_df['Class'] = 'train'
test_df['Class'] = 'test'
erisk_df_no_blank_posts = pd.concat([train_df, test_df], axis=0)
erisk_df_no_blank_posts.head(5)

Unnamed: 0,Subject_ID,Text,type,Class
0,subject10,umm pues como te explico ... mal : ´ (,1,train
1,subject10,"pues jajaj seria muy bueno , me pasan demasiad...",1,train
2,subject10,"ojala y solo fuese ese el problema , en mi cas...",1,train
3,subject10,"pues son varias , me gusta bailar de hecho soy...",1,train
4,subject10,"eso es lo que trato ahora , me cansé de presta...",1,train


__Group data__

In [5]:
erisk_df_no_blank_posts = erisk_df_no_blank_posts.groupby(['Subject_ID', 'type', 'Class'])['Text'].apply(lambda x: ' '.join(x)).reset_index()
erisk_df_no_blank_posts

Unnamed: 0,Subject_ID,type,Class,Text
0,subject1,0,train,Bien ... técnicamente debería irme a dormir ca...
1,subject10,1,train,umm pues como te explico ... mal : ´ ( pues ja...
2,subject100,1,train,"Hola , estoy realmente mal , no se que hacer c..."
3,subject101,1,train,volvi me extrañaron signo de interrogación y t...
4,subject102,1,train,"¿ Cuanto tiempo duraron así ? sí , pero ya nos..."
...,...,...,...,...
329,subject86,0,test,La soledad y el tiempo son el mejor aleado par...
330,subject9,0,test,Hola necesito consejos tengo una hija con depr...
331,subject91,0,test,El insomnio no me deja Hay un virus dando cuen...
332,subject92,0,test,Alguna chica me consuela ? Mejor llorar en dos...


In [6]:
# Verificar si hay una GPU disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
# Definir el modelo y el tokenizer
model_name = "PlanTL-GOB-ES/roberta-base-bne"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Definir los datos de entrenamiento y prueba (como DataFrames)
# Reemplaza train_df y test_df con tus propios DataFrames
erisk_df_no_blank_posts.rename(columns={'type': 'labels', 'Text': 'text'}, inplace=True)
train_df, test_df = erisk_df_no_blank_posts[erisk_df_no_blank_posts['Class'] == 'train'],\
                    erisk_df_no_blank_posts[erisk_df_no_blank_posts['Class'] == 'test']

test_df  = test_df[['labels', 'text']]
train_df = train_df[['labels', 'text']]

# Función para la tokenización y codificación de los datos
def tokenize_data(data):
    texts = data["text"].tolist()
    labels = data["labels"].tolist()  # Ajusta el nombre de la columna que contiene las etiquetas
    encoded_inputs = tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt")
    encoded_inputs["labels"] = torch.tensor(labels)
    return encoded_inputs

train_encoded = tokenize_data(train_df)
val_encoded = tokenize_data(test_df)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Convertir los datos codificados en objetos Dataset
train_dataset = Dataset.from_dict(train_encoded)
val_dataset = Dataset.from_dict(val_encoded)

In [9]:
# Configuración del entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_f1",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=500,
)

In [10]:
# Métricas personalizadas
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    
    f1 = f1_score(labels, predictions, average="binary")
    f1_weighted = f1_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="binary")
    recall_macro = recall_score(labels, predictions, average="macro")
    precision = precision_score(labels, predictions, average="binary")
    precision_macro = precision_score(labels, predictions, average="macro")
    
    return {
        "eval_f1": f1,
        "eval_f1_weighted": f1_weighted,
        "eval_recall": recall,
        "eval_recall_macro": recall_macro,
        "eval_precision": precision,
        "eval_precision_macro": precision_macro,
    }

In [11]:
# Entrenamiento del modelo
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,F1,F1 Weighted,Recall,Recall Macro,Precision,Precision Macro
1,0.6808,0.686718,0.626728,0.286024,1.0,0.5,0.456376,0.228188
2,0.6469,0.599662,0.702703,0.599725,0.955882,0.656954,0.555556,0.730903
3,0.4462,0.400871,0.814815,0.832108,0.808824,0.830338,0.820896,0.831179
4,0.3408,0.481755,0.792208,0.784363,0.897059,0.794208,0.709302,0.799096
5,0.17,0.444674,0.786667,0.785099,0.867647,0.791848,0.719512,0.792592


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=60, training_loss=0.41928551395734154, metrics={'train_runtime': 131.7242, 'train_samples_per_second': 7.022, 'train_steps_per_second': 0.455, 'total_flos': 243377726208000.0, 'train_loss': 0.41928551395734154, 'epoch': 5.0})

# Test phase

In [13]:
# -- Load best model obtained from training phase
model = RobertaForSequenceClassification.from_pretrained("/kaggle/working/results/checkpoint-36")

In [20]:
# Configuración del entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_f1",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=500,
    disable_tqdm=True
)

In [21]:
# Entrenamiento del modelo
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics
)

In [22]:
# Función para la tokenización y codificación de los datos
def tokenize_data(data):
    texts = data["text"].tolist()
    encoded_inputs = tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt")
    return encoded_inputs

In [23]:
train_df_original = pd.read_excel('/kaggle/input/test-data-excel-formatted/test_data.xlsx')
train_df_original.rename(columns={'message': 'text'}, inplace=True)
unique_nicks = train_df_original['nick'].unique()
final_df = []
with torch.no_grad():
    for nick in tqdm(unique_nicks):
        unique_rounds = train_df_original[train_df_original['nick'] == nick]['round'].unique()
        for round_ in unique_rounds:
            temp = train_df_original[(train_df_original['nick'] == nick) &\
                                     (train_df_original['round'] <= round_)][['nick' ,'text']]
            temp = temp.groupby(['nick'])['text'].apply(lambda x: ' '.join(x)).reset_index()[['text']]
            test_encoded = tokenize_data(temp)
            test_dataset = Dataset.from_dict(test_encoded)
            preds = trainer.predict(test_dataset).predictions[0]
            pred  = list(preds).index(max(preds))
            final_df.append([nick, round_, pred])

100%|██████████| 149/149 [04:50<00:00,  1.95s/it]


In [24]:
final_df = pd.DataFrame(final_df)
final_df.rename(columns={0: 'nick', 1: 'round', 2: 'pred'}, inplace=True)

In [25]:
# Find the maximum prediction and the round where it occurs for each nick
result = final_df.groupby('nick').agg({'pred': 'max', 'round': 'idxmax'})
result.reset_index(inplace=True)
result

Unnamed: 0,nick,pred,round
0,subject184,1,21
1,subject185,1,47
2,subject186,1,64
3,subject188,1,97
4,subject190,1,141
...,...,...,...
144,subject86,0,4980
145,subject9,1,5026
146,subject91,1,5067
147,subject92,0,5113


In [27]:
###########################################################################
import sklearn.metrics as metrics
from scipy.stats import pearsonr
import os
import statistics
# Read gold labels for binary classification (task1a, task2a, and task3a)
def read_qrels(qrels_file):
    qrels={}
    df_golden_truth = pd.read_csv(qrels_file)
    for index, r in df_golden_truth.iterrows():
        qrels[ r['Subject'] ] = int(r['label'])
    print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
    return(qrels)

# Calculation of Binary classification metrics for Binary classification tasks
class BinaryClassification():
    def __init__(self, task, data, qrels):
        self.run_results = data 
        self.qrels_b = read_qrels(qrels)
        self.task = task
    pass

    def penalty(self,delay):
        if self.task == "1": # TCA
            p = 0.0292 # trial
            p = 0.0411 # test
        elif self.task == "2": # Depression
            p = 0.0179 # trial
            p = 0.0326 # test
        else: # Unkown
            p = 0.0308 # test
        pen = -1.0 + 2.0/(1+np.exp(-p*(delay-1)))
        return(pen)

    def n_pos(self):
        total_pos = 0
        for key in self.qrels_b:
            total_pos += self.qrels_b[key]
        return(total_pos)

    def eval_performance(self):
        print("===================================================")
        print("EVALUATION:")
        self.run_results = self.run_results.sort_values(by=['nick'])
        total_pos=self.n_pos()
        erdes5 = np.zeros(len(self.run_results))
        erdes30 = np.zeros(len(self.run_results))
        ierdes = 0
        true_pos = 0
        false_pos = 0
        latency_tps = list()
        penalty_tps = list()

        # Latency-based metrics
        for index, r in self.run_results.iterrows():
            try:
                if ( self.qrels_b[ r['nick'] ] ==  r['pred'] ):
                    if ( r['pred'] == 1 ):
                        true_pos+=1
                        erdes5[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 5.0)))
                        erdes30[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 30.0)))
                        latency_tps.append(r["round"]+1)
                        penalty_tps.append(self.penalty(r["round"]+1))
                    else:
                        erdes5[ierdes]=0
                        erdes30[ierdes]=0
                else:
                    if ( r['pred'] == 1 ):
                        false_pos+=1
                        erdes5[ierdes]=float(total_pos)/float(len(self.qrels_b))
                        erdes30[ierdes]=float(total_pos)/float(len(self.qrels_b))
                    else:
                        erdes5[ierdes]=1
                        erdes30[ierdes]=1
            except KeyError:
                print("User does not appear in the qrels:"+r['nick'])
            ierdes+=1

        _speed = 1-np.median(np.array(penalty_tps))
        if true_pos != 0 :
            precision = float(true_pos) / float(true_pos+false_pos)    
            recall = float(true_pos) / float(total_pos)
            f1_erde = 2 * (precision * recall) / (precision + recall)
            _latencyweightedF1 = f1_erde*_speed
        else:
            _latencyweightedF1 = 0
            _speed = 0
            
        y_pred_b = self.run_results['pred'].tolist() 
        y_true = list(self.qrels_b.values()) 

        # Binary metrics
        accuracy = metrics.accuracy_score(y_true, y_pred_b)
        macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
        macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
        macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro') 
        micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
        micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
        micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')

        print("BINARY METRICS: =============================")
        print("Accuracy:"+str(accuracy))
        print("Macro precision:"+str(macro_precision))
        print("Macro recall:"+str(macro_recall))
        print("Macro f1:"+str(macro_f1))
        print("Micro precision:"+str(micro_precision))
        print("Micro recall:"+str(micro_recall))
        print("Micro f1:"+str(micro_f1))

        print("LATENCY-BASED METRICS: =============================")
        print("ERDE_5:"+str(np.mean(erdes5))) 
        print("ERDE_30:"+str(np.mean(erdes30))) 
        print("Median latency:"+str(np.median(np.array(latency_tps)))) 
        print("Speed:"+str(_speed)) 
        print("latency-weightedF1:"+str(_latencyweightedF1)) 
        
        return {'Acuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,'Macro_F1': macro_f1,'Micro_P': micro_precision, 'Micro_R': micro_recall,
        'Micro_F1': micro_f1, 'ERDE5':np.mean(erdes5),'ERDE30': np.mean(erdes30), 'latencyTP': np.median(np.array(latency_tps)), 
        'speed': _speed, 'latency-weightedF1': _latencyweightedF1}

In [28]:
metrics_ = BinaryClassification("2", result, '/kaggle/input/test-gold-labels/task2_gold_a.csv')


149 lines read in qrels file!




In [29]:
metrics_.eval_performance()

EVALUATION:
Accuracy:0.7785234899328859
Macro precision:0.7941230997404524
Macro recall:0.7880355846042121
Macro f1:0.7781637717121588
Micro precision:0.7785234899328859
Micro recall:0.7785234899328859
Micro f1:0.7785234899328859
ERDE_5:0.536011891078397
ERDE_30:0.529302732530074
Median latency:1138.0
Speed:0.0
latency-weightedF1:0.0


  erdes5[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 5.0)))
  erdes30[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 30.0)))


{'Acuracy': 0.7785234899328859,
 'Macro_P': 0.7941230997404524,
 'Macro_R': 0.7880355846042121,
 'Macro_F1': 0.7781637717121588,
 'Micro_P': 0.7785234899328859,
 'Micro_R': 0.7785234899328859,
 'Micro_F1': 0.7785234899328859,
 'ERDE5': 0.536011891078397,
 'ERDE30': 0.529302732530074,
 'latencyTP': 1138.0,
 'speed': 0.0,
 'latency-weightedF1': 0.0}

In [30]:
# -- Zip best checkpoint
!zip /kaggle/working/results/checkpoint-36.zip /kaggle/working/results/checkpoint-36

  adding: kaggle/working/results/checkpoint-36/ (stored 0%)
