# Data Preparation

Vamos a leer los datos de test y a montarlos en un CSV para poder trabajar con ellos.

In [None]:
# The XML parser API (parse / getroot / findall) lives in the ``lxml.etree``
# submodule; the bare ``lxml`` package exposes no ``parse``, so ET must alias
# etree rather than the package itself.
from lxml import etree as ET
import pandas as pd
import numpy as np
import os
import re
# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

Hay caracteres especiales (&) y caracteres no imprimibles en el árbol XML, por lo que hay que tratarlos antes.

In [None]:
def corregir_caracteres_especiales(archivo):
    """Rewrite ``archivo`` in place so it can be parsed as one XML document.

    The file content is wrapped in a single ``<ROOT>`` element, bare ``&``
    characters are escaped as ``&amp;`` and control characters that XML 1.0
    forbids are removed.

    The function is safe to run more than once on the same file: content that
    already starts with ``<ROOT>`` is not wrapped again, and existing entities
    (named or numeric, e.g. ``&amp;`` / ``&#38;``) are left untouched.

    Args:
        archivo: Path of the UTF-8 text file to fix. It is overwritten.
    """
    with open(archivo, 'r', encoding='utf8') as entrada:
        contenido = entrada.read()

    # Wrap only once; a second wrapper would push the documents one level
    # deeper and a lookup of direct children of the root would find nothing.
    if not contenido.lstrip().startswith('<ROOT>'):
        contenido = '<ROOT>' + contenido + '</ROOT>'

    # Escape every '&' that does not already start a named or numeric entity.
    contenido_corregido = re.sub(
        r'&(?!(?:amp|lt|gt|apos|quot|#\d+|#[xX][0-9A-Fa-f]+);)', '&amp;', contenido
    )
    # Drop control characters, but keep tab, newline and carriage return:
    # they are legal XML whitespace and removing them glues words together.
    contenido_corregido = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', contenido_corregido)

    with open(archivo, 'w', encoding='utf8') as salida:
        salida.write(contenido_corregido)

In [None]:
def procesar_directorio(directorio):
    """Apply ``corregir_caracteres_especiales`` to every file in ``directorio``.

    Only regular files directly inside the directory are processed, in sorted
    name order; sub-directories are skipped. ``corregir_caracteres_especiales``
    must be defined before this function is called.

    Args:
        directorio: Path of the directory whose files are fixed in place.
    """
    for nombre_archivo in sorted(os.listdir(directorio)):
        ruta = os.path.join(directorio, nombre_archivo)
        # os.listdir also returns sub-directories, which cannot be opened as files.
        if os.path.isfile(ruta):
            corregir_caracteres_especiales(ruta)


# One-off preprocessing step, intentionally left disabled because it rewrites
# the raw files in place. Uncomment to run it.
# procesar_directorio('T1/test/data/')

In [None]:
#data_list = []

In [None]:
# Disabled one-off extraction step: the code is wrapped in a string literal so
# it does not execute. When enabled it parses every file under T1/test/data/,
# reads the DOCNO and TEXT of each DOC element and appends [docid, text] rows
# to data_list; files that fail are reported and skipped.
# NOTE(review): presumably classify.csv, which is loaded further down, is the
# saved result of this step — confirm before deleting this cell.
"""directory = 'T1/test/data/'
# Recorrer todos los archivos en el directorio
for path in os.listdir(directory):
    filename = os.path.join(directory, path)
    try:
        tree = ET.parse(filename)
        root = tree.getroot()
        for doc in root.findall('DOC'):
            docid = doc.find('DOCNO').text
            text = str(doc.find('TEXT').text).strip()
            new_data= [docid,text]
            data_list.append(new_data)
    except Exception as e:
        print(f"Error processing file {filename}: {e}")"""

In [None]:
#classify = pd.DataFrame(data_list, columns=['docid', 'text'])

In [None]:
#classify["length"]=classify['text'].apply(lambda x: len(x.split()))

In [None]:
#classify['text'] = classify['text'].fillna('')

In [None]:
#classify.to_csv('classify.csv', index=False)

In [None]:
# keep_default_na=False stops pandas from turning empty texts (and literal
# strings such as "NA" or "null") into NaN when the CSV is read back, so the
# 'text' column holds only strings.
classify = pd.read_csv('./classify.csv', keep_default_na=False)

In [None]:
classify.head()

In [None]:
classify.sample(10)

In [None]:
classify['length'].describe(percentiles=[0, 0.25, 0.50, 0.75, 0.95])

Podemos ver cómo el máximo corresponde a elementos que, por lo general, no aportan demasiada información.

In [None]:
# Pull out the row(s) whose word count is exactly 6588
# (presumably the maximum reported by describe() above — confirm).
fila_interes = classify.loc[classify['length'].eq(6588)]
print(fila_interes)

In [None]:
#relevant = pd.read_csv('T1/train/relevant_texts.csv')

In [None]:
#relevant.drop(columns=['tokens','length_tokens'], inplace=True)

In [None]:
#relevant = relevant.rename(columns={'length_text':'length', 'symptom':'label'})

In [None]:
#relevant.head()

In [None]:
#relevant.to_csv('train.csv', index=False)

In [None]:
train = pd.read_csv('train.csv')

In [None]:
train.head()

In [None]:
train.sample(10)

In [None]:
train['length'].describe(percentiles=[0, 0.25, 0.50, 0.75, 0.95])

# Models

In [None]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split

In [None]:
import random
import torch

SEED = 42

# Seed every RNG used by the notebook before any stochastic step runs.
# Seeding NumPy alone leaves PyTorch unseeded, and PyTorch is what drives
# DataLoader shuffling, dropout and the initialisation of new layers.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Stratified 85/15 split so both partitions keep the label distribution.
X_train, X_val = train_test_split(train, test_size=0.15, random_state=SEED, stratify=train['label'])

In [None]:
# Wrap both pandas partitions as Hugging Face datasets under named splits.
dict_dataset = DatasetDict({
    'train': Dataset.from_pandas(X_train),
    'validation': Dataset.from_pandas(X_val),
})
print(dict_dataset)

In [None]:
dict_dataset = dict_dataset.remove_columns(['__index_level_0__', 'length', 'docid'])

In [None]:
dict_dataset

## SamLowe/roberta-base-go_emotions

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Longest tokenised training text, special tokens included. Texts are not
# truncated here so that the true maximum is measured.
MAX_LENGTH = max(len(tokenizer(text).input_ids) for text in dict_dataset['train']['text'])
print("La longitud máxima de la secuencia es: ", MAX_LENGTH)

# Cap the working length at 512 tokens.
MAX_LENGTH = min(512, MAX_LENGTH)
print("max_length", MAX_LENGTH)

# The tokenizer attribute is `model_max_length`; assigning `model_max_len`
# only creates an unused attribute and changes nothing.
tokenizer.model_max_length = MAX_LENGTH

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def tokenize(batch):
    """Tokenise the ``text`` field of a batch, truncating to ``MAX_LENGTH``.

    No padding is applied here: each DataLoader batch is padded to its own
    longest sequence by the ``DataCollatorWithPadding`` used as ``collate_fn``.
    Padding inside ``map`` would instead stretch every example to the longest
    text of its map batch, which only adds pad tokens to compute over.

    Args:
        batch: Mapping with a ``"text"`` entry holding a list of strings.

    Returns:
        The tokenizer output for the batch (truncated, unpadded).
    """
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)


encoded_dataset = dict_dataset.map(tokenize, batched=True)
# Expose only the model inputs and the label, as torch tensors.
encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
encoded_dataset

In [None]:
from transformers import AutoModel, AutoConfig
# TokenClassifierOutput is no longer used by CustomModel; the import is kept
# in case other cells still reference the name.
from transformers.modeling_outputs import SequenceClassifierOutput, TokenClassifierOutput
import torch
import torch.nn as nn       # layers for NN


class CustomModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    The head is applied to the hidden state of the first token of each
    sequence and produces one logit per label.

    Args:
        checkpoint: Name or path of the pretrained encoder to load.
        num_labels: Number of target classes; sets the width of the head.
    """

    def __init__(self, checkpoint, num_labels):
        super().__init__()
        self.num_labels = num_labels

        # The encoder is configured to also return all hidden states and
        # attention maps; they are passed through in the forward output.
        config = AutoConfig.from_pretrained(
            checkpoint, output_attentions=True, output_hidden_states=True
        )
        self.model = AutoModel.from_pretrained(checkpoint, config=config)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder config and the requested label count
        # instead of the hard-coded 768 -> 21, so other checkpoints or label
        # sets do not silently produce mis-shaped logits.
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the head; compute the loss when labels are given.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional class indices; they are flattened and must lie in
                ``[0, num_labels)`` for the cross-entropy loss.

        Returns:
            SequenceClassifierOutput with ``loss`` (``None`` without labels),
            ``logits`` of shape (batch, num_labels), and the encoder's
            ``hidden_states`` and ``attentions``.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is the encoder's last hidden state.
        sequence_output = self.dropout(outputs[0])
        # Classify from the first-token representation only.
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=CustomModel(checkpoint= model_name, num_labels=21).to(device)

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(encoded_dataset["train"], shuffle=True, batch_size=32, collate_fn=data_collator)

eval_dataloader = DataLoader(encoded_dataset["validation"], batch_size=32, collate_fn=data_collator)

In [None]:
from transformers import get_scheduler
import evaluate 

In [None]:
# Optimisation setup: AdamW plus a "linear" learning-rate schedule without
# warm-up, stepped once per training batch over all epochs.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
print(num_training_steps)

In [None]:
# Load the three `evaluate` metrics that are accumulated during validation.
f1, precision, recall = (
    evaluate.load(metric_id) for metric_id in ("f1", "precision", "recall")
)

In [None]:
from tqdm.auto import tqdm

# One progress bar per phase, each sized for the whole run (all epochs).
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))

# Lowest mean validation loss so far; +inf guarantees the first epoch is saved.
best_loss = float('inf')

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Labels are shifted down by one before the loss
        # (presumably they are 1-based in the data — confirm).
        batch["labels"] = batch["labels"] - 1
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    # ---- validation pass ----
    model.eval()
    total_loss, num_batches = 0, 0
    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            batch["labels"] = batch["labels"] - 1
            outputs = model(**batch)

            logits = outputs.logits
            predictions = logits.argmax(dim=-1)
            # Feed the same predictions/references to every metric.
            for metric in (f1, precision, recall):
                metric.add_batch(predictions=predictions, references=batch["labels"])
            progress_bar_eval.update(1)
            total_loss += outputs.loss.item()
            num_batches += 1

    # Mean validation loss, then micro-averaged F1, precision and recall;
    # compute() is called once per metric per epoch.
    avg_loss = total_loss / num_batches
    print(avg_loss)
    for metric in (f1, precision, recall):
        print(metric.compute(average='micro'))

    # Checkpoint only when the validation loss improves.
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), "model_state.bin")