In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.express as plx
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import transformers
import torch
import datasets
from datasets import Dataset, DatasetDict


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [3]:
path = 'C:/Users/olver/OneDrive/Escritorio/Data_Suicide/Suicide_Detection.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [4]:
#pasamos a binario la columna labels
data['class'] = data['class'].replace({'suicide' : 1, 'non-suicide': 0})
data.head()


Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,1
1,3,Am I weird I don't get affected by compliments...,0
2,4,Finally 2020 is almost over... So I can never ...,0
3,8,i need helpjust help me im crying so hard,1
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1


In [5]:
data = data[['class', 'text']]
data.columns = ['labels', 'text' ]
data = data.sample(frac=0.01)

data.head()

Unnamed: 0,labels,text
78387,0,i guess i lost everyone again no more friends ...
40795,0,I fucking hate my parents I dont remember the ...
38535,1,"Delaying going to the psych ward.First of all,..."
85617,0,Give people advice that you can’t follow yours...
197330,0,so uhhmmm there's people in here that need a h...


In [6]:
data['labels'].value_counts()

labels
0    1188
1    1133
Name: count, dtype: int64

In [7]:
#obtenemos el 80% para entrenamiento
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(data, test_size=0.2, random_state=7)

#Eliminamos el indice
df_train 

tds = Dataset.from_pandas(df_train)
vds = Dataset.from_pandas(df_test)

ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds

In [8]:
ds = ds.remove_columns('__index_level_0__')
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 1856
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 465
    })
})

In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = ds.map(tokenize_function, batched=True)




Map: 100%|██████████| 1856/1856 [00:00<00:00, 2761.28 examples/s]
Map: 100%|██████████| 465/465 [00:00<00:00, 2800.58 examples/s]


In [10]:
data_collator = DataCollatorWithPadding(tokenizer)

In [11]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments
from transformers import Trainer

training_args = TrainingArguments(output_dir="test_trainer",
                                  per_device_eval_batch_size=4,
                                  per_device_train_batch_size=4,
                                  num_train_epochs=3,
                                  learning_rate=2e-5,
                                  weight_decay=0.01)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer= tokenizer
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
trainer.train()

 36%|███▌      | 500/1392 [21:51<38:50,  2.61s/it] 

{'loss': 0.2846, 'grad_norm': 0.07734140008687973, 'learning_rate': 1.281609195402299e-05, 'epoch': 1.08}


 72%|███████▏  | 1000/1392 [44:11<17:06,  2.62s/it] 

{'loss': 0.1247, 'grad_norm': 0.028477810323238373, 'learning_rate': 5.6321839080459775e-06, 'epoch': 2.16}


100%|██████████| 1392/1392 [1:01:46<00:00,  2.61s/it]

{'train_runtime': 3705.9658, 'train_samples_per_second': 1.502, 'train_steps_per_second': 0.376, 'train_loss': 0.16365412734020715, 'epoch': 3.0}


100%|██████████| 1392/1392 [1:01:46<00:00,  2.66s/it]


TrainOutput(global_step=1392, training_loss=0.16365412734020715, metrics={'train_runtime': 3705.9658, 'train_samples_per_second': 1.502, 'train_steps_per_second': 0.376, 'train_loss': 0.16365412734020715, 'epoch': 3.0})

In [14]:
trainer.evaluate()

100%|██████████| 117/117 [07:37<00:00,  3.91s/it]


{'eval_loss': 0.19593356549739838,
 'eval_runtime': 462.1181,
 'eval_samples_per_second': 1.006,
 'eval_steps_per_second': 0.253,
 'epoch': 3.0}

In [15]:
#guardamos el modelo
model.save_pretrained('transformer_model')

In [19]:
#obtenemos las predicciones
predictions = trainer.predict(tokenized_datasets['validation'])
print(predictions.predictions.shape, predictions.label_ids.shape)


100%|██████████| 117/117 [07:30<00:00,  3.85s/it]

(465, 2) (465,)





In [20]:
#obtenemos la matriz de confusion
from sklearn.metrics import confusion_matrix
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

cm = confusion_matrix(y_true, y_pred)
cm


array([[209,  15],
       [  9, 232]], dtype=int64)

In [18]:
#imprimimos las matrices de confusion
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def get_predictions(model, dataset):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()
    predictions = []
    labels = []
    for i in tqdm(range(len(dataset))):
        prediction = model(**{k: torch.tensor(v[i]).to(device) for k, v in dataset.items()}).logits
        predictions.append(prediction.argmax().item())
        labels.append(dataset['labels'][i])
    return predictions, labels

predictions, labels = get_predictions(model, tokenized_datasets['validation'])

print(classification_report(labels, predictions))


  0%|          | 0/465 [00:00<?, ?it/s]


AttributeError: 'Dataset' object has no attribute 'items'