In [20]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from tqdm import trange

import random

torch.cuda.empty_cache()

In [10]:
data = pd.read_csv('/home/cesarms/Documents/CIC/Doctorado/HomoMex/corpus/preprocessed/train/task1_preprocessed.csv')

label_map = {'P' : 0,
             'NP' : 1,
             np.nan : 2}

data.label.replace(label_map, inplace= True)
data = data.astype({'tweets' : 'str', 'label' : 'int'})
data.head()

Unnamed: 0,tweets,label
0,me quise ligar a una chava ayer y no me pelo l...,0
1,eres un puñal papayita,0
2,magnate ofrece mdd al hombre que conquiste a s...,0
3,los trolebuses del desgobierno de son idiotas ...,0
4,en epoca de hitler no se decia eres gay y si e...,0


In [3]:
def preprocess(input_text, tokenizer):
    return tokenizer.encode_plus(input_text,
                                 add_special_tokens = True,
                                 max_length = 512,
                                 pad_to_max_length = True,
                                 return_attention_mask = True,
                                 return_tensors = 'pt')

In [5]:
texts = list(data.tweets)
labels = list(data.label)

In [4]:
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased',
                                          do_lower_case = True)

In [7]:
token_id = []
attention_masks = []

for sample in texts:
    encoding_dict = preprocess(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim= 0)
attention_masks = torch.cat(attention_masks, dim= 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
val_ratio = 0.05
batch_size = 8
random_state = 42

train_idx, val_idx = train_test_split(np.arange(len(labels)),
                                      test_size= val_ratio,
                                      shuffle= True,
                                      stratify= labels,
                                      random_state= random_state)

# train and validation sets
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Dataloaders

train_dataloader = DataLoader(train_set,
                              sampler= RandomSampler(train_set),
                              batch_size = batch_size)

validation_dataloader = DataLoader(val_set,
                                   sampler= SequentialSampler(val_set),
                                   batch_size= batch_size)

In [None]:
model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased',
                                                      num_labels = len(label_map),
                                                      output_attentions = False,
                                                      output_hidden_states = False)

In [15]:
optimizer = AdamW(model.parameters(),
                   lr = 1e-7,
                   eps= 1e-9)

model.cuda()

epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps= 0,
                                           num_training_steps= len(train_dataloader)*epochs)



In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [29]:
def b_metrics(preds, labels):
    preds_flat = np.argmax(preds, axis= 1).flatten()
    labels_flat = labels.flatten()

    b_accuracy = accuracy_score(labels_flat, preds_flat)
    b_precision = precision_score(labels_flat, preds_flat, average= 'macro')
    b_recall = recall_score(labels_flat, preds_flat, average= 'macro')
    b_f1 = f1_score(labels_flat, preds_flat, average= 'macro')

    return b_accuracy, b_precision, b_recall, b_f1

In [30]:
seed_val = 42
random.seed(seed_val)

def evaluate(dataloader):
    model.eval()

    predictions, true_vals = [], []

    loss_val_total = 0

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)

        inputs = {
            'input_ids' : batch[0],
            'attention_mask' : batch[1],
            'labels' : batch[2]
        }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader)

    predictions = np.concatenate(predictions, axis = 0)
    true_vals = np.concatenate(true_vals, axis = 0)

    return loss_val_avg, predictions, true_vals

In [31]:
from tqdm.notebook import tqdm

In [32]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()

    train_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    progress_bar = tqdm(train_dataloader,
                        desc= 'Epoch {:1d}'.format(epoch),
                        leave= False,
                        disable= False)

    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(t.to(device) for t in batch)

        inputs = {
            'input_ids' : batch[0],
            'attention_mask' : batch[1],
            'labels' : batch[2]
        }

        train_outputs = model(**inputs)

        loss = train_outputs[0]
        train_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()


        progress_bar.set_postfix({'training_loss' : '{:3f}'.format(loss.item() / len(batch))})

    torch.save(model.state_dict(), f'data_volume/finetuned_BETO_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    val_loss, predictions, true_vals = evaluate(validation_dataloader)
    val_accuracy, val_precision, val_recall, val_f1 = b_metrics(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'Accuracy: {val_accuracy}')
    tqdm.write(f'Precision (Macro): {val_precision}')
    tqdm.write(f'Recall (macro): {val_recall}')
    tqdm.write(f'F1-score (macro): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/832 [00:00<?, ?it/s]


Epoch 1
Validation loss: 0.868015127425844
Accuracy: 0.6285714285714286
Precision (Macro): 0.5421455938697318
Recall (macro): 0.3408239700374532
F1-score (macro): 0.27142468838581907


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2:   0%|          | 0/832 [00:00<?, ?it/s]


Epoch 2
Validation loss: 0.8533325561068275
Accuracy: 0.6314285714285715
Precision (Macro): 0.542747358309318
Recall (macro): 0.3445692883895131
F1-score (macro): 0.27896626907785044


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3:   0%|          | 0/832 [00:00<?, ?it/s]


Epoch 3
Validation loss: 0.8460382148623466
Accuracy: 0.6314285714285715
Precision (Macro): 0.542747358309318
Recall (macro): 0.3445692883895131
F1-score (macro): 0.27896626907785044


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4:   0%|          | 0/832 [00:00<?, ?it/s]


Epoch 4
Validation loss: 0.8436419347470457
Accuracy: 0.6314285714285715
Precision (Macro): 0.542747358309318
Recall (macro): 0.3445692883895131
F1-score (macro): 0.27896626907785044


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5:   0%|          | 0/832 [00:00<?, ?it/s]


Epoch 5
Validation loss: 0.8436419347470457
Accuracy: 0.6314285714285715
Precision (Macro): 0.542747358309318
Recall (macro): 0.3445692883895131
F1-score (macro): 0.27896626907785044


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased',
                                                      num_labels = len(label_map),
                                                      output_attentions = False,
                                                      output_hidden_states = False)

model.to(device)

model.load_state_dict(torch.load('data_volume/finetuned_BETO_epoch_5.model', map_location= torch.device('cpu')))



Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

<All keys matched successfully>

In [7]:
test_set = pd.read_csv('/home/cesarms/Documents/CIC/Doctorado/HomoMex/corpus/preprocessed/test/test_data_preprocessed_homomex_track_1.csv')
test_set.head()

Unnamed: 0,content
0,sera que mi perrita es lesbiana o
1,quiero una amiga lencha te vaz a mudar y no ti...
2,ash la jotita del programa hoy no deja de deci...
3,en una boda gay les avientan arroz con popote
4,sos mas facil que la tabla del puta


In [14]:
test_texts = list(test_set.content)
test_texts[0]

'sera que mi perrita es lesbiana o'

In [16]:
token_id_test = []
attention_masks_test = []

for sample in test_texts:
    encoding_dict = preprocess(sample, tokenizer)
    token_id_test.append(encoding_dict['input_ids'])
    attention_masks_test.append(encoding_dict['attention_mask'])

token_id_test = torch.cat(token_id_test, dim= 0)
attention_masks_test = torch.cat(attention_masks_test, dim= 0)



In [19]:
batch_size = 8
test_corpus = TensorDataset(token_id_test,
                            attention_masks_test)
 
test_dataloader = DataLoader(test_corpus,
                              sampler= RandomSampler(test_corpus),
                              batch_size = batch_size)

In [45]:
def obtain_label(text, tokenizer):
    test_ids = []
    test_attention_mask = []
    
    #apply the tokenizer
    encoding = preprocess(text, tokenizer)

    #Extract ids and attention mask
    test_ids.append(encoding['input_ids'])
    test_attention_mask.append(encoding['attention_mask'])
    test_ids = torch.cat(test_ids, dim= 0)
    test_attention_mask = torch.cat(test_attention_mask, dim= 0)

    # Forward pass

    with torch.no_grad():
        output = model(test_ids.to(device),
                       token_type_ids = None,
                       attention_mask = test_attention_mask.to(device))
    
    prediction = np.argmax(output.logits.cpu().numpy(), axis= 1).flatten().item()

    return prediction    

In [46]:
predictions = []

for tweet in test_texts:
    prediction = obtain_label(tweet, tokenizer)

    predictions.append(prediction)
    



In [59]:
test_predictions = pd.DataFrame()
test_predictions['task_name'] = ["LGBTphobiaDetectionMultiClass" for i in range(len(predictions))]
test_predictions['identifier'] = [f"{i + 1}" for i in range(len(predictions))]
test_predictions['class'] = predictions
test_predictions.head()

Unnamed: 0,task_name,identifier,class
0,LGBTphobiaDetectionMultiClass,1,1
1,LGBTphobiaDetectionMultiClass,2,1
2,LGBTphobiaDetectionMultiClass,3,1
3,LGBTphobiaDetectionMultiClass,4,1
4,LGBTphobiaDetectionMultiClass,5,1


In [60]:
test_predictions['class'].value_counts()

class
1    3970
2      30
Name: count, dtype: int64

In [61]:
class_map = {0 : "P",
             1 : "NP",
             2 : "NA"}
test_predictions['class'].replace(class_map, inplace=True)
test_predictions.head()

Unnamed: 0,task_name,identifier,class
0,LGBTphobiaDetectionMultiClass,1,NP
1,LGBTphobiaDetectionMultiClass,2,NP
2,LGBTphobiaDetectionMultiClass,3,NP
3,LGBTphobiaDetectionMultiClass,4,NP
4,LGBTphobiaDetectionMultiClass,5,NP


In [62]:
test_predictions['class'].value_counts()

class
NP    3970
NA      30
Name: count, dtype: int64

In [64]:
test_predictions.to_csv('beto_predictions_task_1.txt', sep= '\t', index= False, header= False)

In [65]:
str_preds = []
for pred in predictions:
    if pred == 0:
        p = "P"
    elif pred == 1:
        p = "NP"
    elif pred == 2:
        p = "NA"
    
    str_preds.append(p)


In [66]:
str_preds[0]

'NP'

In [91]:
rows = []
for i, pred in enumerate(str_preds):
    row = '"LGBTphobiaDetectionMultiClass"' + '\t' + f'"{i+1}"' + '\t' + f'"{pred}"' + '\n'

    rows.append(row)

In [92]:
rows[:3]

['"LGBTphobiaDetectionMultiClass"\t"1"\t"NP"\n',
 '"LGBTphobiaDetectionMultiClass"\t"2"\t"NP"\n',
 '"LGBTphobiaDetectionMultiClass"\t"3"\t"NP"\n']

In [94]:
with open('beto_predictions_task1.txt', 'w') as f:
    f.writelines(rows)