In [None]:
import pandas as pd
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn  
import numpy as np

In [None]:
class CustomModel(nn.Module):
  """Sequence classifier built from a pretrained encoder plus a linear head.

  The encoder is loaded with ``AutoModel`` and configured to also return
  hidden states and attentions. Classification uses the encoder output at
  sequence position 0, passed through dropout and a single linear layer.
  """

  def __init__(self, checkpoint, num_labels):
    """Build the encoder and the classification head.

    Args:
      checkpoint: Name or path handed to ``AutoConfig`` / ``AutoModel``.
      num_labels: Number of target classes; sets the width of the head.
    """
    super().__init__()
    self.num_labels = num_labels

    config = AutoConfig.from_pretrained(
        checkpoint, output_attentions=True, output_hidden_states=True
    )
    self.model = AutoModel.from_pretrained(checkpoint, config=config)
    self.dropout = nn.Dropout(0.1)
    # Size the head from the encoder config and the requested label count
    # instead of hard-coding 768 x 21, so the head always agrees with
    # ``self.num_labels`` (which the loss below relies on).
    self.classifier = nn.Linear(config.hidden_size, num_labels)

  def forward(self, input_ids=None, attention_mask=None, labels=None):
    """Run the encoder and the head, optionally computing the loss.

    Args:
      input_ids: Token ids forwarded to the encoder.
      attention_mask: Attention mask forwarded to the encoder.
      labels: Optional class indices; when given, a cross-entropy loss is
        computed against the logits.

    Returns:
      A ``TokenClassifierOutput`` carrying ``loss`` (``None`` when no labels
      were passed), ``logits`` (one row per input sequence), and the
      encoder's ``hidden_states`` and ``attentions``. Despite the container
      name, the logits are per sequence, not per token.
    """
    # Use the encoder to produce the output.
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] is the last hidden state; apply dropout, then classify from
    # the representation at position 0 of every sequence.
    sequence_output = self.dropout(outputs[0])
    logits = self.classifier(sequence_output[:, 0, :])

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

In [None]:
# Build the classifier around the pretrained encoder, restore the fine-tuned
# weights from disk, and switch to inference mode.
model_name = "SamLowe/roberta-base-go_emotions"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomModel(checkpoint=model_name, num_labels=21)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU session still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load("model_state.bin", map_location=device))
model.to(device)
model.eval()

In [None]:
import numpy as np
def clasificar_texto(texto):
    """Classify one text and return its label together with a relevance score.

    Args:
        texto: The text to classify (a single string).

    Returns:
        A tuple ``(label, relevance)`` where ``label`` is the index of the most
        probable class plus one (1-based), and ``relevance`` is the probability
        of that class rescaled from [0, 1] to [0, 10].
    """
    # Tokenize the text. Truncate at 512 tokens (the same limit check_length
    # uses) so that over-long inputs do not crash the encoder.
    inputs = tokenizer(
        texto, return_tensors='pt', truncation=True, max_length=512
    ).to(device)

    # Inference only: skip autograd bookkeeping so that no graph is kept for
    # the logits, hidden states and attentions the model returns.
    with torch.no_grad():
        outputs = model(**inputs)

    # Probability of each class.
    probabilidades = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Class with the highest probability for the (single) input row.
    clase_predicha = torch.argmax(probabilidades[0])

    # Relevance is the probability assigned to the predicted class.
    relevancia = probabilidades[0, clase_predicha].item()

    relevancia_escalada = np.interp(relevancia, (0, 1), (0, 10))

    return clase_predicha.item() + 1, relevancia_escalada



In [None]:
def check_length(text):
    """Tell whether ``text`` encodes to no more than 512 token ids.

    The text is encoded with the notebook's tokenizer without truncation, so
    the count reflects the full input.
    """
    token_ids = tokenizer.encode(text, truncation=False)
    token_count = len(token_ids)
    return 512 >= token_count

In [None]:
# Disabled batch-classification step, kept as a string literal so that it is
# not executed. When enabled it would read one CSV slice, replace missing
# texts with '', keep only rows accepted by check_length, and add 'label' and
# 'relevance' columns from clasificar_texto.
"""df = pd.read_csv('/kaggle/input/classifier-model-data/classify_sliced/parte_1.csv')
df['text'] = df['text'].fillna('')
mask = df['text'].apply(check_length)
df = df[mask]
df['label'], df['relevance'] = zip(*df['text'].map(clasificar_texto))"""

In [None]:
# Disabled export step, kept as a string literal so that it is not executed.
# When enabled it would sort the frame by 'relevance' in descending order and
# write it to a CSV file without the index.
"""# Ordena los textos por relevancia y los guarda en un CSV
df.sort_values(by='relevance', ascending=False).to_csv('/kaggle/working/results_1.csv', index=False)"""

# Métricas de Evaluación

In [None]:
from sklearn.metrics import classification_report
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled data and keep only the 15 % hold-out part of a split that
# is stratified on the 'label' column; the other part is discarded.
train = pd.read_csv('train.csv')
_, X_val = train_test_split(
    train,
    test_size=0.15,
    random_state=42,
    stratify=train['label'],
)
np.random.seed(42)

In [None]:
# Wrap the hold-out frame as the single 'test' split and show its layout.
dict_dataset = DatasetDict({'test': Dataset.from_pandas(X_val)})
print(dict_dataset)

In [None]:
# Drop bookkeeping columns that are not needed for evaluation. Only columns
# still present in the 'test' split are removed, so re-running this cell after
# the columns are already gone does not raise.
_unused_columns = ['__index_level_0__', 'length', 'docid']
_present_unused = [
    column for column in _unused_columns
    if column in dict_dataset['test'].column_names
]
if _present_unused:
    dict_dataset = dict_dataset.remove_columns(_present_unused)
# NOTE(review): MAX_LENGTH is not referenced by any cell visible here; confirm
# whether it is still needed.
MAX_LENGTH = 180

In [None]:
def get_prediction(texto):
    """Return the predicted label for one text.

    Args:
        texto: The text to classify (a single string).

    Returns:
        The index of the most probable class plus one (1-based), the same
        label convention that clasificar_texto uses.
    """
    # Tokenize the text. Truncate at 512 tokens (the same limit check_length
    # uses) so that over-long inputs do not crash the encoder.
    inputs = tokenizer(
        texto, return_tensors='pt', truncation=True, max_length=512
    ).to(device)

    # Inference only: skip autograd bookkeeping so that no graph is kept for
    # the logits, hidden states and attentions the model returns.
    with torch.no_grad():
        outputs = model(**inputs)

    # Probability of each class.
    probabilidades = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Class with the highest probability for the (single) input row.
    clase_predicha = torch.argmax(probabilidades[0])

    return clase_predicha.item() + 1


In [None]:
# Predict a label for every text of the 'test' split and fetch the reference
# labels in the same order.
test_split = dict_dataset['test']
y_pred = list(map(get_prediction, test_split['text']))
y_true = test_split['label']

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score


In [None]:
# Per-class report first, then the three micro-averaged scores.
print(classification_report(y_true=y_true, y_pred=y_pred))
micro_precision, micro_recall, micro_f1 = (
    score_fn(y_true, y_pred, average='micro')
    for score_fn in (precision_score, recall_score, f1_score)
)
print(f'Micro Precision: {micro_precision}')
print(f'Micro Recall: {micro_recall}')
print(f'Micro F1-Score: {micro_f1}')