In [1]:
from dpm_preprocessing import DPMProprocessed
import torch
#from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, RobertaConfig
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import f1_score

import os

# Keep the Weights & Biases integration switched off for this run.
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Maximum number of tokens per example handed to the tokenizer.
MAX_SEQ_LEN = 256

# Checkpoint to fine-tune (alternative that was tried: 'roberta-base').
model_name = 'bert-base-uncased'

# Directories used to save and reload the fine-tuned model and its tokenizer.
model_path = f'./models/pcl_{model_name}_finetuned/model/'
tokenizer_path = f'./models/pcl_{model_name}_finetuned/tokenizer/'

Collecting contractions
  Using cached contractions-0.1.66-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Using cached textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Using cached anyascii-0.3.0-py3-none-any.whl (284 kB)
Collecting pyahocorasick
  Using cached pyahocorasick-1.4.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (109 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions


ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: '/opt/miniconda3/lib/python3.9/site-packages/ahocorasick.cpython-39-x86_64-linux-gnu.so'
Consider using the `--user` option or check the permissions.



In [2]:
class PCLDataset(torch.utils.data.Dataset):
    """Map-style dataset holding raw texts and their labels.

    Items are returned untokenized as ``{'text': ..., 'label': ...}``;
    tokenization happens per batch in :meth:`collate_fn`, so padding is only
    applied up to the longest sequence of each batch.

    Args:
        tokenizer: Callable used by ``collate_fn`` to encode a list of texts.
        input_set: Object indexable by ``'text'`` and ``'label'`` (e.g. a
            DataFrame); both columns are copied into plain lists.
    """

    def __init__(self, tokenizer, input_set):
        self.tokenizer = tokenizer
        self.texts = [*input_set['text']]
        self.labels = [*input_set['label']]

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text and label stored at position ``idx``."""
        return {'text': self.texts[idx], 'label': self.labels[idx]}

    def collate_fn(self, batch):
        """Tokenize a list of items and attach their labels as a tensor.

        Args:
            batch: Sequence of dicts as produced by ``__getitem__``.

        Returns:
            The tokenizer output with an extra ``'labels'`` entry.
        """
        texts = [sample['text'] for sample in batch]
        labels = [sample['label'] for sample in batch]

        encodings = self.tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LEN,
        )
        encodings['labels'] = torch.tensor(labels)
        return encodings

In [3]:
# Pretrained configuration and tokenizer for the chosen checkpoint.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model built from the same checkpoint, then moved
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)
model = model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
# Load the preprocessed data and split it into train / validation frames.
# The 0.1 argument presumably is the validation fraction (the printed sizes
# are roughly 90% / 10%) -- confirm against dpm_preprocessing.
dpm_pp = DPMProprocessed('.', 'task4_test.tsv')
train_df, val_df = dpm_pp.get_unbalanced_split(0.1)

# Wrap both frames so the trainer can batch and tokenize them lazily.
train_dataset = PCLDataset(tokenizer, train_df)
eval_dataset = PCLDataset(tokenizer, val_df)

print("Training set length: ", len(train_df))
print("Validation set length: ", len(val_df))


Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}
      par_id      art_id     keyword country  \
0          1  @@24942188    hopeless      ph   
1          2  @@21968160     migrant      gh   
2          3  @@16584954   immigrant      ie   
3          4   @@7811231    disabled      nz   
4          5   @@1494111     refugee      ca   
...      ...         ...         ...     ...   
10464  10465  @@14297363       women      lk   
10465  10466  @@70091353  vulnerable      ph   
10466  10467  @@20282330     in-need      ng   
10467  10468  @@16753236    hopeless      in   
10468  10469  @@16779383    homeless      ie   

                                                    text  label orig_label  \
0      We are living in times of absolute insanity , ...      0          0   
1      In Libya today , there are countless number of...      0         

In [5]:
class CustomTrainer(Trainer):
    """Trainer that optimizes a class-weighted cross-entropy loss.

    The loss returned by the model itself is ignored; instead the logits are
    scored with ``nn.CrossEntropyLoss`` using ``class_weights``, which by
    default weights class 1 ten times more than class 0 (class 1 presumably
    being the rarer PCL class -- confirm against the label mapping).
    """

    # Per-class loss weights, indexed by class id. Override on a subclass or
    # instance to change the balance.
    class_weights = (1.0, 10.0)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with newer
                ``Trainer`` versions that pass it as a keyword; not used.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Create the weights on the logits' device so the loss does not
        # depend on a notebook-level `device` global.
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [6]:
# NOTE(review): this loader is not handed to the trainer in this cell (the
# trainer receives `eval_dataset` directly) and `validation_loader` is
# re-created before the standalone evaluation, so it looks unused here.
validation_loader = DataLoader(eval_dataset)
def compute_metric_eval(arg):
    """Compute evaluation metrics from a ``(logits, gold_labels)`` pair.

    Args:
        arg: Indexable whose first element holds the per-class scores
            (one row per example) and whose second element holds the gold
            labels.

    Returns:
        Dict with the macro-averaged F1 under the key ``'f1_macro'``.
    """
    scores = arg[0]
    gold = arg[1]
    predicted = np.argmax(scores, axis=1)
    macro_f1 = f1_score(gold, predicted, average='macro')
    # more metrics can be added to this dict
    return {'f1_macro': macro_f1}

# Fine-tuning hyper-parameters: log every 100 steps, evaluate every 400 steps.
training_args = TrainingArguments(
        output_dir=f'./experiment/pcl/{model_name}/',
        learning_rate = 5e-6,
        logging_steps= 100,
        eval_steps = 400,
        per_device_train_batch_size=12,
        per_device_eval_batch_size = 12,
        num_train_epochs = 20,
        evaluation_strategy= "steps",
        # Disable external logging integrations explicitly; the WANDB_DISABLED
        # environment variable is reported as deprecated by the library.
        report_to="none"
        )

trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        # Both datasets share the same tokenizer, so either collate_fn behaves
        # identically; the training one is used for clarity.
        data_collator=train_dataset.collate_fn,
        compute_metrics = compute_metric_eval,
        eval_dataset = eval_dataset
    )
trainer.train()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 9422
  Num Epochs = 20
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 15720
  1%|          | 101/15720 [00:11<27:36,  9.43it/s]

{'loss': 0.6827, 'learning_rate': 4.968193384223919e-06, 'epoch': 0.13}


  1%|▏         | 201/15720 [00:22<28:08,  9.19it/s]

{'loss': 0.6294, 'learning_rate': 4.9363867684478375e-06, 'epoch': 0.25}


  2%|▏         | 301/15720 [00:35<31:16,  8.22it/s]

{'loss': 0.5551, 'learning_rate': 4.904580152671756e-06, 'epoch': 0.38}


  3%|▎         | 400/15720 [00:46<27:37,  9.24it/s]***** Running Evaluation *****
  Num examples = 1047
  Batch size = 12


{'loss': 0.5456, 'learning_rate': 4.8727735368956745e-06, 'epoch': 0.51}


                                                   
  3%|▎         | 401/15720 [00:49<2:49:39,  1.50it/s]

{'eval_loss': 0.470233291387558, 'eval_f1_macro': 0.683393048593866, 'eval_runtime': 2.9518, 'eval_samples_per_second': 354.697, 'eval_steps_per_second': 29.812, 'epoch': 0.51}


  3%|▎         | 500/15720 [01:00<27:54,  9.09it/s]  Saving model checkpoint to ./experiment/pcl/bert-base-uncased/checkpoint-500
Configuration saved in ./experiment/pcl/bert-base-uncased/checkpoint-500/config.json


{'loss': 0.5164, 'learning_rate': 4.840966921119593e-06, 'epoch': 0.64}


Model weights saved in ./experiment/pcl/bert-base-uncased/checkpoint-500/pytorch_model.bin
  4%|▍         | 601/15720 [01:13<26:01,  9.68it/s]  

{'loss': 0.5532, 'learning_rate': 4.8091603053435125e-06, 'epoch': 0.76}


  4%|▍         | 701/15720 [01:25<32:45,  7.64it/s]

{'loss': 0.4567, 'learning_rate': 4.77735368956743e-06, 'epoch': 0.89}


  5%|▌         | 800/15720 [01:36<26:41,  9.31it/s]***** Running Evaluation *****
  Num examples = 1047
  Batch size = 12


{'loss': 0.5189, 'learning_rate': 4.745547073791349e-06, 'epoch': 1.02}


                                                   
  5%|▌         | 801/15720 [01:39<3:02:43,  1.36it/s]

{'eval_loss': 0.5713419318199158, 'eval_f1_macro': 0.7437912588208513, 'eval_runtime': 2.9571, 'eval_samples_per_second': 354.069, 'eval_steps_per_second': 29.759, 'epoch': 1.02}


  6%|▌         | 901/15720 [01:51<27:22,  9.02it/s]  

{'loss': 0.5693, 'learning_rate': 4.713740458015267e-06, 'epoch': 1.15}


  6%|▋         | 1000/15720 [02:02<29:55,  8.20it/s]Saving model checkpoint to ./experiment/pcl/bert-base-uncased/checkpoint-1000
Configuration saved in ./experiment/pcl/bert-base-uncased/checkpoint-1000/config.json


{'loss': 0.4529, 'learning_rate': 4.681933842239187e-06, 'epoch': 1.27}


Model weights saved in ./experiment/pcl/bert-base-uncased/checkpoint-1000/pytorch_model.bin
  7%|▋         | 1102/15720 [02:16<24:46,  9.84it/s]  

{'loss': 0.4889, 'learning_rate': 4.650127226463105e-06, 'epoch': 1.4}


  8%|▊         | 1200/15720 [02:27<27:46,  8.71it/s]***** Running Evaluation *****
  Num examples = 1047
  Batch size = 12


{'loss': 0.4179, 'learning_rate': 4.618320610687023e-06, 'epoch': 1.53}


                                                    
  8%|▊         | 1201/15720 [02:30<3:41:04,  1.09it/s]

{'eval_loss': 0.5557230710983276, 'eval_f1_macro': 0.7540162838741037, 'eval_runtime': 2.9567, 'eval_samples_per_second': 354.117, 'eval_steps_per_second': 29.763, 'epoch': 1.53}


  8%|▊         | 1302/15720 [02:42<25:34,  9.40it/s]  

{'loss': 0.5082, 'learning_rate': 4.586513994910941e-06, 'epoch': 1.65}


  9%|▉         | 1400/15720 [02:53<25:50,  9.24it/s]

{'loss': 0.5193, 'learning_rate': 4.554707379134861e-06, 'epoch': 1.78}


 10%|▉         | 1500/15720 [03:04<27:35,  8.59it/s]Saving model checkpoint to ./experiment/pcl/bert-base-uncased/checkpoint-1500
Configuration saved in ./experiment/pcl/bert-base-uncased/checkpoint-1500/config.json


{'loss': 0.5129, 'learning_rate': 4.522900763358779e-06, 'epoch': 1.91}


Model weights saved in ./experiment/pcl/bert-base-uncased/checkpoint-1500/pytorch_model.bin
 10%|█         | 1600/15720 [03:18<27:51,  8.45it/s]  ***** Running Evaluation *****
  Num examples = 1047
  Batch size = 12


{'loss': 0.4903, 'learning_rate': 4.491094147582698e-06, 'epoch': 2.04}


                                                    
 10%|█         | 1601/15720 [03:21<3:45:12,  1.04it/s]

{'eval_loss': 0.46055474877357483, 'eval_f1_macro': 0.7153404588042244, 'eval_runtime': 2.9533, 'eval_samples_per_second': 354.518, 'eval_steps_per_second': 29.797, 'epoch': 2.04}


 11%|█         | 1701/15720 [03:32<27:48,  8.40it/s]  

{'loss': 0.3758, 'learning_rate': 4.4592875318066155e-06, 'epoch': 2.16}


 11%|█▏        | 1801/15720 [03:43<26:42,  8.69it/s]

{'loss': 0.3422, 'learning_rate': 4.427480916030535e-06, 'epoch': 2.29}


 11%|█▏        | 1805/15720 [03:44<32:11,  7.20it/s]

In [None]:
# Persist the tokenizer and the fine-tuned model in their own directories.
tokenizer.save_pretrained(tokenizer_path)
trainer.save_model(model_path)

# Store the exact train/validation split so evaluation can be repeated later
# on the same examples.
val_df.to_pickle('val_df.pickle')
train_df.to_pickle('train_df.pickle')

In [None]:
# Reload the fine-tuned artifacts. The configuration is read from the saved
# model directory (written alongside the weights when the model was saved)
# rather than from the base checkpoint, so it always matches the weights.
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config).to(device)


In [None]:

# Restore the split that was pickled after training.
val_df = pd.read_pickle('val_df.pickle')
train_df = pd.read_pickle('train_df.pickle')

# Rebuild the datasets around the reloaded tokenizer.
eval_dataset = PCLDataset(tokenizer, val_df)
train_dataset = PCLDataset(tokenizer, train_df)

In [None]:
def predict_pcl(input, tokenizer, model, max_length=256):
  """Classify one text or a batch of texts with a sequence classifier.

  Args:
    input: Text (or list of texts) passed straight to ``tokenizer``.
    tokenizer: Callable returning an encoding that supports ``.to(device)``
      and ``**`` unpacking into the model.
    model: Module whose forward output exposes ``.logits``.
    max_length: Truncation length forwarded to the tokenizer.

  Returns:
    Dict with ``'prediction'`` (index of the highest logit per row) and
    ``'confidence'`` (that highest raw logit -- not a probability).
  """
  model.eval()

  # Follow the model's own device instead of relying on a notebook-level
  # global, so the helper also works on a freshly loaded model.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
  encodings = encodings.to(target_device)

  # Inference only: never build an autograd graph, even when the caller
  # forgot to wrap the call in no_grad.
  with torch.no_grad():
    output = model(**encodings)
  logits = output.logits
  values, indices = torch.max(logits, 1)

  return {'prediction': indices, 'confidence': values}

def evaluate(model, tokenizer, data_loader):
  """Run ``predict_pcl`` over every batch and collect predictions and labels.

  Args:
    model: Classifier forwarded to ``predict_pcl``.
    tokenizer: Tokenizer forwarded to ``predict_pcl``.
    data_loader: Iterable of batches indexable by ``'text'`` and ``'label'``;
      the labels must support ``.cpu()``.

  Returns:
    ``(preds, tot_labels)``: two lists holding one NumPy array per batch.
    From these, accuracy, precision, recall and F1 can be computed.
  """
  batch_predictions, batch_labels = [], []

  # Gradients are never needed while scoring.
  with torch.no_grad():
    for batch in data_loader:
      gold = batch['label']
      outcome = predict_pcl(batch['text'], tokenizer, model)

      batch_predictions.append(np.array(outcome['prediction'].cpu()))
      batch_labels.append(np.array(gold.cpu()))

  return batch_predictions, batch_labels

In [None]:
validation_loader = DataLoader(eval_dataset)

preds, tot_labels = evaluate(model, tokenizer, validation_loader)
# evaluate() yields one entry per batch. Flatten them into 1-D arrays so the
# report receives one value per example, whatever the batch size (stacking
# with np.array would give (N, 1) columns and fails on uneven batches).
tot_labels = np.concatenate([np.ravel(batch) for batch in tot_labels])
preds = np.concatenate([np.ravel(batch) for batch in preds])
report = classification_report(tot_labels, preds, target_names=["Not PCL","PCL"], output_dict= True)
print(report)

# Headline numbers: overall accuracy and the per-class F1 scores.
print(report['accuracy'])
print(report['Not PCL']['f1-score'])
print(report['PCL']['f1-score'])