In [1]:
from dpm_preprocessing import DPMProprocessed
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import f1_score

import os

# Switch off the Weights & Biases logging integration before any Trainer is built.
os.environ["WANDB_DISABLED"] = "true"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Upper bound on tokens per example; longer inputs are truncated by the collator.
MAX_SEQ_LEN = 256

# Where the fine-tuned model and its tokenizer are saved to / reloaded from.
model_name = 'roberta'
_finetuned_root = f'./models/pcl_{model_name}_finetuned'
model_path = f'{_finetuned_root}/model/'
tokenizer_path = f'{_finetuned_root}/tokenizer/'

Collecting contractions
  Using cached contractions-0.1.66-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Using cached textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Using cached pyahocorasick-1.4.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (109 kB)
Collecting anyascii
  Using cached anyascii-0.3.0-py3-none-any.whl (284 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions


ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: '/opt/miniconda3/lib/python3.9/site-packages/ahocorasick.cpython-39-x86_64-linux-gnu.so'
Consider using the `--user` option or check the permissions.



In [2]:
class PCLDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each text with its label.

    Items are returned as raw ``{'text', 'label'}`` dicts; tokenisation is
    deferred to :meth:`collate_fn`, which encodes a whole batch at once.
    """

    def __init__(self, tokenizer, input_set):
        """Store the tokenizer and copy the 'text' / 'label' columns into lists.

        Args:
            tokenizer: callable used by ``collate_fn`` to encode a list of texts.
            input_set: mapping-like object with 'text' and 'label' entries
                (the notebook passes a pandas DataFrame).
        """
        self.tokenizer = tokenizer
        self.texts = list(input_set['text'])
        self.labels = list(input_set['label'])

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the untokenised example at position ``idx``."""
        return {'text': self.texts[idx], 'label': self.labels[idx]}

    def collate_fn(self, batch):
        """Tokenise a list of items and attach their labels as a tensor.

        Args:
            batch: list of dicts as produced by ``__getitem__``.

        Returns:
            The tokenizer's encoding (padded, truncated to MAX_SEQ_LEN, as
            PyTorch tensors) with an extra 'labels' entry.
        """
        batch_texts = [sample['text'] for sample in batch]
        batch_labels = [sample['label'] for sample in batch]

        encodings = self.tokenizer(
            batch_texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LEN,
        )
        encodings['labels'] = torch.tensor(batch_labels)
        return encodings

In [3]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Pretrained encoder with a freshly initialised classification head
# (see the "newly initialized" notice in the cell output), moved to `device`.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")
model = model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [4]:
# Load the corpus and carve out the validation split.
# The logged sizes below (8375 / 2094 of 10469 rows) indicate 0.2 is the validation fraction.
dpm_pp = DPMProprocessed('.', 'task4_test.tsv')
train_df, val_df = dpm_pp.get_unbalanced_split(0.2)

print(f"Training set length:  {len(train_df)}")
print(f"Validation set length:  {len(val_df)}")

# Wrap both splits so they can be batched with PCLDataset.collate_fn.
train_dataset, eval_dataset = (
    PCLDataset(tokenizer, split_df) for split_df in (train_df, val_df)
)


Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}
      par_id      art_id     keyword country  \
0          1  @@24942188    hopeless      ph   
1          2  @@21968160     migrant      gh   
2          3  @@16584954   immigrant      ie   
3          4   @@7811231    disabled      nz   
4          5   @@1494111     refugee      ca   
...      ...         ...         ...     ...   
10464  10465  @@14297363       women      lk   
10465  10466  @@70091353  vulnerable      ph   
10466  10467  @@20282330     in-need      ng   
10467  10468  @@16753236    hopeless      in   
10468  10469  @@16779383    homeless      ie   

                                                    text  label orig_label  \
0      We are living in times of absolute insanity , ...      0          0   
1      In Libya today , there are countless number of...      0         

In [5]:
class CustomTrainer(Trainer):
    """Trainer that replaces the model's own loss with a class-weighted cross-entropy.

    The weighting counteracts class imbalance by making mistakes on the
    second class (label id 1) cost ten times more than mistakes on label id 0.
    """

    # Per-class loss weights, indexed by label id. Override on a subclass or
    # instance to experiment with a different weighting.
    class_weights = (1.0, 10.0)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict from the data collator; must contain "labels".
            return_outputs: when True, also return the raw model outputs.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass this keyword; not used by this loss.

        Returns:
            ``loss`` or, if ``return_outputs`` is True, ``(loss, outputs)``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Build the weights on the logits' device so the loss does not depend
        # on a notebook-level `device` global being in sync with the model.
        weight = torch.tensor(self.class_weights, dtype=torch.float, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [6]:
# DataLoader over the validation split with default settings (no custom collate_fn).
# The Trainer below receives `eval_dataset` directly and does not use this loader.
validation_loader = DataLoader(eval_dataset)
def compute_metric_eval(arg):
    """Metric callback for the Trainer: macro-averaged F1 on the evaluation set.

    Args:
        arg: indexable pair whose first element holds the logits
            (one row per example) and whose second holds the gold labels.

    Returns:
        Dict with a single 'f1_macro' entry.
    """
    logits = arg[0]
    labels_gold = arg[1]

    # Predicted class = column with the highest logit in each row.
    labels_pred = np.argmax(logits, axis=1)

    metrics = {}
    metrics['f1_macro'] = f1_score(labels_gold, labels_pred, average='macro')
    # more metrics can be added
    return metrics

# Hyper-parameters for the fine-tuning run.
# NOTE(review): the captured logs show the training loss hovering around 0.70
# and eval F1 mostly at 0.0 with learning_rate=1e-4; consider trying a smaller
# rate (e.g. 2e-5) - left unchanged here because it is an experiment decision.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before upgrading.
training_args = TrainingArguments(
        output_dir='./experiment/pcl',
        learning_rate = 0.0001,
        logging_steps= 100,
        per_device_train_batch_size=12,
        per_device_eval_batch_size = 12,
        num_train_epochs = 3,
        evaluation_strategy= "steps",
        # Select logging integrations explicitly, as the deprecation message
        # for the WANDB_DISABLED environment variable recommends.
        report_to="none",
        )

trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        # The collator only reads the tokenizer, which both datasets share,
        # so take it from the training dataset it is primarily applied to.
        data_collator=train_dataset.collate_fn,
        compute_metrics = compute_metric_eval,
        eval_dataset = eval_dataset
    )
trainer.train()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 8375
  Num Epochs = 3
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 2094
  5%|▍         | 100/2094 [00:13<04:26,  7.49it/s]***** Running Evaluation *****
  Num examples = 2094
  Batch size = 12


{'loss': 0.7189, 'learning_rate': 9.522445081184336e-05, 'epoch': 0.14}


                                                  
  5%|▍         | 101/2094 [00:19<52:35,  1.58s/it]

{'eval_loss': 0.6762136816978455, 'eval_f1_macro': 0.0, 'eval_runtime': 5.9298, 'eval_samples_per_second': 353.133, 'eval_steps_per_second': 29.512, 'epoch': 0.14}


 10%|▉         | 200/2094 [00:30<03:51,  8.19it/s]***** Running Evaluation *****
  Num examples = 2094
  Batch size = 12


{'loss': 0.7153, 'learning_rate': 9.044890162368673e-05, 'epoch': 0.29}


                                                  
 10%|▉         | 201/2094 [00:36<51:16,  1.63s/it]

{'eval_loss': 0.6986951231956482, 'eval_f1_macro': 0.17357174007849976, 'eval_runtime': 5.7691, 'eval_samples_per_second': 362.971, 'eval_steps_per_second': 30.334, 'epoch': 0.29}


 14%|█▍        | 300/2094 [00:48<03:21,  8.91it/s]***** Running Evaluation *****
  Num examples = 2094
  Batch size = 12


{'loss': 0.7158, 'learning_rate': 8.567335243553009e-05, 'epoch': 0.43}


                                                  
 14%|█▍        | 301/2094 [00:54<53:48,  1.80s/it]

{'eval_loss': 0.6761119961738586, 'eval_f1_macro': 0.0, 'eval_runtime': 5.7891, 'eval_samples_per_second': 361.715, 'eval_steps_per_second': 30.229, 'epoch': 0.43}


 19%|█▉        | 400/2094 [01:06<03:11,  8.85it/s]***** Running Evaluation *****
  Num examples = 2094
  Batch size = 12


{'loss': 0.7119, 'learning_rate': 8.089780324737345e-05, 'epoch': 0.57}


                                                  
 19%|█▉        | 401/2094 [01:12<34:48,  1.23s/it]

{'eval_loss': 0.684873104095459, 'eval_f1_macro': 0.0, 'eval_runtime': 5.7854, 'eval_samples_per_second': 361.946, 'eval_steps_per_second': 30.249, 'epoch': 0.57}


 24%|██▍       | 500/2094 [01:23<02:52,  9.25it/s]***** Running Evaluation *****
  Num examples = 2094
  Batch size = 12


{'loss': 0.684, 'learning_rate': 7.612225405921681e-05, 'epoch': 0.72}


                                                  
 24%|██▍       | 500/2094 [01:29<02:52,  9.25it/s]Saving model checkpoint to ./experiment/pcl/checkpoint-500
Configuration saved in ./experiment/pcl/checkpoint-500/config.json


{'eval_loss': 0.6759063601493835, 'eval_f1_macro': 0.0, 'eval_runtime': 5.7746, 'eval_samples_per_second': 362.625, 'eval_steps_per_second': 30.305, 'epoch': 0.72}


Model weights saved in ./experiment/pcl/checkpoint-500/pytorch_model.bin
 29%|██▊       | 600/2094 [01:43<02:51,  8.70it/s]  ***** Running Evaluation *****
  Num examples = 2094
  Batch size = 12


{'loss': 0.7013, 'learning_rate': 7.134670487106018e-05, 'epoch': 0.86}




KeyboardInterrupt: 

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side.
trainer.save_model(model_path)
tokenizer.save_pretrained(tokenizer_path)

# Persist the exact train/validation split so later evaluation uses the same rows.
for split_df, pickle_name in ((train_df, 'train_df.pickle'), (val_df, 'val_df.pickle')):
    split_df.to_pickle(pickle_name)


In [None]:
# Restore the fine-tuned model and tokenizer from the save paths.
model = RobertaForSequenceClassification.from_pretrained(model_path)
model = model.to(device)
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

# Restore the saved split. Pickle files can execute code on load, so only
# read files that this notebook wrote itself.
train_df, val_df = (
    pd.read_pickle(pickle_name) for pickle_name in ('train_df.pickle', 'val_df.pickle')
)

train_dataset, eval_dataset = (
    PCLDataset(tokenizer, split_df) for split_df in (train_df, val_df)
)

In [None]:
def predict_pcl(input, tokenizer, model, max_length=256):
    """Classify one text or a batch of texts with ``model``.

    Args:
        input: a string or list of strings to classify.
        tokenizer: callable that encodes ``input`` into model keyword arguments.
        model: sequence-classification model whose output exposes ``.logits``.
        max_length: truncation length in tokens; the default of 256 matches the
            value used when batching the training data.

    Returns:
        Dict with
          'prediction': tensor of predicted class indices, one per input;
          'confidence': tensor with the winning class's raw logit for each
                        input (a score, not a probability).
    """
    model.eval()
    encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

    # Follow the model's own device rather than a notebook-level global, so
    # the inputs always land where the weights are.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        encodings = encodings.to(first_param.device)

    # Inference only: avoid building an autograd graph even when the caller
    # did not wrap the call in torch.no_grad().
    with torch.no_grad():
        logits = model(**encodings).logits

    confidence, prediction = torch.max(logits, 1)
    return {'prediction': prediction, 'confidence': confidence}

def evaluate(model, tokenizer, data_loader):
    """Run the model over every batch of ``data_loader`` and collect the results.

    Args:
        model: classifier passed through to ``predict_pcl``.
        tokenizer: tokenizer passed through to ``predict_pcl``.
        data_loader: iterable of batches, each a dict with 'text' and 'label'
            entries; 'label' must support ``.cpu()``.

    Returns:
        ``(preds, tot_labels)``: two lists with one NumPy array per batch,
        holding the predicted class indices and the gold labels respectively.
        Accuracy, precision, recall and F1 can be computed from them.
    """
    batch_predictions, batch_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            gold = batch['label']
            texts = batch['text']

            result = predict_pcl(texts, tokenizer, model)

            batch_predictions.append(np.array(result['prediction'].cpu()))
            batch_labels.append(np.array(gold.cpu()))

    return batch_predictions, batch_labels

In [None]:
# Batch the validation set so evaluation does not run one forward pass per example.
validation_loader = DataLoader(eval_dataset, batch_size=32)

preds, tot_labels = evaluate(model, tokenizer, validation_loader)

# `evaluate` returns one array per batch. Concatenate them into flat 1-D
# arrays; np.array(...) on the list would fail when the last batch is shorter
# and would otherwise produce a 2-D array.
tot_labels = np.concatenate([np.atleast_1d(batch) for batch in tot_labels])
preds = np.concatenate([np.atleast_1d(batch) for batch in preds])

# Pin the label ids so `target_names` always lines up with them, even if the
# model ends up predicting a single class; zero_division=0 reports 0 for
# metrics that are undefined in that situation.
report = classification_report(
    tot_labels,
    preds,
    labels=[0, 1],
    target_names=["Not PCL","PCL"],
    output_dict= True,
    zero_division=0,
)
print(report)

print(report['accuracy'])
print(report['Not PCL']['f1-score'])
print(report['PCL']['f1-score'])