https://huggingface.co/hackathon-pln-es/electricidad-small-discriminator-finetuned-clasificacion-comentarios-suicidas

https://huggingface.co/mrm8488/electricidad-small-discriminator

https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb

In [1]:
# Notebook shell escape: install the `transformers` package before it is imported below.
!pip install transformers

In [2]:
import os
import numpy as np
import pandas as pd

import torch
import transformers

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch.utils.data import Dataset
from transformers import (ElectraForSequenceClassification, ElectraTokenizerFast,
                          InputFeatures, Trainer, TrainingArguments)

# Bare expression: a notebook only displays the last expression of a cell,
# so this line has no visible effect in its current position.
transformers.__version__
# Presumably read by the transformers Weights & Biases integration to switch
# off experiment logging — confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# Load the pretrained discriminator checkpoint with a two-label
# sequence-classification head.
model = ElectraForSequenceClassification.from_pretrained(
    "mrm8488/electricidad-small-discriminator",
    num_labels=2,
)

# Load the tokenizer from the same checkpoint: lower-casing enabled and a
# maximum model length of 512.
tokenizer = ElectraTokenizerFast.from_pretrained(
    "mrm8488/electricidad-small-discriminator",
    do_lower_case=True,
    model_max_length=512,
)

In [8]:
# Read the dataset CSV into a DataFrame, then preview its first ten rows.
data = pd.read_csv(
    "../input/depresin/twiter_agrupado.csv",
)
data.head(n=10)

In [19]:
# Encode the target column as integers (1 = positive class, 0 = negative).
# The previous version replaced 'True'/'False' with the *strings* '1'/'0'
# (and did nothing at all when pandas had parsed the column as booleans).
# String labels cannot be packed into a label tensor by the Trainer's default
# collator and do not match the integer `pos_label` used by the binary metrics.
# Going through `str` first handles boolean, string and already-numeric 0/1
# columns uniformly; any unexpected value maps to NaN and makes `astype(int)`
# fail loudly instead of slipping into training.
data['depresion'] = (
    data['depresion']
    .astype(str)
    .str.strip()
    .map({'True': 1, 'False': 0, '1': 1, '0': 0})
    .astype(int)
)
data.head()
data.tail()

In [11]:
# Model input is the `texto_original` column; the target is `depresion`.
X, y = data['texto_original'], data['depresion']

In [12]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=0)

In [13]:
# Convert the pandas objects to plain Python lists, as consumed by the
# dataset wrapper (texts go to the tokenizer, labels are indexed per item).
training_sentences, training_labels = X_train.to_list(), y_train.to_list()
validation_sentences, validation_labels = X_test.to_list(), y_test.to_list()

In [14]:
class TrainerDataset(Dataset):
    """Dataset that tokenizes all texts once and serves them as `InputFeatures`.

    Args:
        inputs: Sequence of raw texts; passed to the tokenizer in one call.
        targets: Sequence of labels, aligned index-by-index with `inputs`.
        tokenizer: Callable invoked as
            ``tokenizer(inputs, padding=True, truncation=True)``. Its result
            must be indexable by 'input_ids', 'token_type_ids' and
            'attention_mask', each holding one entry per input text.

    Raises:
        ValueError: If `inputs` and `targets` do not have the same length.
    """

    def __init__(self, inputs, targets, tokenizer):
        # Fail early on misaligned data: otherwise a short `targets` only
        # surfaces as an IndexError mid-training and a long one is silently
        # truncated.
        if len(inputs) != len(targets):
            raise ValueError(
                f"inputs and targets must have the same length, "
                f"got {len(inputs)} and {len(targets)}")

        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer

        # Tokenize the whole corpus up front; padding=True is applied across
        # this single call, so every example shares one padded length.
        self.tokenized_inputs = tokenizer(inputs, padding=True, truncation=True)

    def __len__(self):
        """Return the number of examples (one per input text)."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the `InputFeatures` for example `idx`, including its label."""
        return InputFeatures(
            input_ids=self.tokenized_inputs['input_ids'][idx],
            token_type_ids=self.tokenized_inputs['token_type_ids'][idx],
            attention_mask=self.tokenized_inputs['attention_mask'][idx],
            label=self.targets[idx])

In [15]:
# Wrap both splits in the tokenizing dataset; the same tokenizer is used for
# training and evaluation data.
train_dataset = TrainerDataset(
    inputs=training_sentences,
    targets=training_labels,
    tokenizer=tokenizer,
)
eval_dataset = TrainerDataset(
    inputs=validation_sentences,
    targets=validation_labels,
    tokenizer=tokenizer,
)

In [29]:
# Fix the PyTorch and NumPy random seeds for reproducibility.
# NOTE(review): TrainingArguments has its own `seed` argument, left at its
# default here — confirm which seed actually governs the training run.
torch.manual_seed(123)
np.random.seed(123)

training_args = TrainingArguments(
    output_dir="/kaggle/working/model_electra",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    # A single epoch was observed to give slightly lower accuracy.
    num_train_epochs=5,
    per_device_train_batch_size=32,
    # Keep every batch the same size by dropping the last incomplete one.
    # NOTE(review): this option may also apply to the evaluation dataloader,
    # which would drop some eval examples from the metrics — confirm.
    dataloader_drop_last=True,
)


def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    Args:
        pred: Object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores; the class is taken with ``argmax(-1)``).

    Returns:
        Dict with keys 'accuracy', 'f1', 'precision' and 'recall', where the
        last three use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result tuple is (precision, recall, f1, support); support is not used.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }


# Build the Trainer from the model, the training configuration, both dataset
# splits and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [30]:
# Run fine-tuning with the datasets and arguments the Trainer was built with.
trainer.train()

In [31]:
# Evaluate the trained model and display the returned metrics; presumably
# this uses the eval_dataset the Trainer was constructed with.
model_result = trainer.evaluate()
model_result

In [32]:
# Persist the fine-tuned model. No path is given, so it presumably goes to the
# Trainer's configured output_dir — confirm the saved files' location.
trainer.save_model()

In [33]:
from transformers import pipeline

In [36]:
# Build a text-classification pipeline: weights come from the local
# fine-tuned model directory, the tokenizer from the original hub checkpoint.
model_name = "/kaggle/working/model_electra"
tokenizer_name = 'mrm8488/electricidad-small-discriminator'
cls = pipeline(
    "text-classification",
    model=model_name,
    tokenizer=tokenizer_name,
)
# Show the predicted label of the top result for a sample sentence.
cls("No puedo más")[0]['label']

In [37]:
# Predicted label for a sample sentence (roughly "Today I feel awful").
cls("Hoy me encuentro fatal")[0]['label']

In [24]:
# Predicted label for a short colloquial expression (roughly "how rough" — translation approximate).
cls("Qué chungo")[0]['label']

In [28]:
# Predicted label for a string that does not look like a real word —
# presumably a sanity check on out-of-vocabulary input.
cls("kdlo")[0]['label']