In [1]:
# --- Experiment configuration -------------------------------------------
# Data / model selection.
model_checkpoint = "gpt2"   # checkpoint name given to the tokenizer and the model
dataset = "ecare"           # the loading cell below handles "ecare" and "copa"

# Optimisation hyper-parameters.
lr = 5e-5
epochs = 3

# Seed for the random / numpy / torch generators (None skips seeding).
global_seed = 139

These notebooks are best run on Colab. For that reason, each notebook is written to be as independent and self-contained as possible, even though this means some code is duplicated across notebooks. A GPU is assumed to be available.

To replicate:

* e-CARE:
```
global_seed = 139
epochs = 3
lr = 5e-5
```

* COPA:
```
global_seed = 139
epochs = 10
lr = 5e-5
```

## Imports and setup

In [2]:
!pip install -qqq transformers[sentencepiece] datasets evaluate
!wget https://github.com/Waste-Wood/e-CARE/files/8242580/e-CARE.zip

--2022-12-21 02:02:36--  https://github.com/Waste-Wood/e-CARE/files/8242580/e-CARE.zip
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-repository-file-5c1aeb/465962344/8242580?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221221T020236Z&X-Amz-Expires=300&X-Amz-Signature=7d9945f7f88b858ac0f131eea062a77cdf87b4be0442e00273d6495b9cf02fb2&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=465962344&response-content-disposition=attachment%3Bfilename%3De-CARE.zip&response-content-type=application%2Fzip [following]
--2022-12-21 02:02:37--  https://objects.githubusercontent.com/github-production-repository-file-5c1aeb/465962344/8242580?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221221%2Fus-eas

In [3]:
import datasets
import evaluate
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
import transformers

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

import random
import zipfile
from dataclasses import dataclass
from typing import Optional, Union

# Some of the code is inspired by
# https://github.com/Waste-Wood/e-CARE/blob/main/code/gpt2_discriminate.py

In [4]:
# Tokenizer for the configured checkpoint, and the SuperGLUE/COPA metric object
# that compute_metrics() calls.
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
metric = evaluate.load('super_glue', 'copa')

# Seed every random number generator used in this notebook.
# A seed of None leaves all generators untouched.
if global_seed is not None:
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_generator(global_seed)

## Function definitions

In [5]:
class GPT2Discriminate(nn.Module):
    """GPT-2 encoder with a single-logit head scoring one causal statement.

    The hidden state at index ``pos`` of every sequence is projected to one
    logit by a linear layer; training uses binary cross-entropy on that logit.
    """

    def __init__(self, model_checkpoint):
        """Load the pretrained GPT-2 body and create the one-logit head.

        Args:
            model_checkpoint: checkpoint name or path passed to
                ``GPT2Model.from_pretrained``.
        """
        super().__init__()
        self.model = GPT2Model.from_pretrained(model_checkpoint)
        self.linear = nn.Linear(self.model.config.hidden_size, 1)
        self.loss_function = torch.nn.BCEWithLogitsLoss(reduction='mean')

    def forward(self, *, input_ids=None, attention_mask=None, pos=None, labels=None):
        """Score each sequence in the batch.

        Args:
            input_ids: token ids, one row per sequence.
            attention_mask: mask aligned with ``input_ids``.
            pos: for every sequence, the index of the token whose hidden state
                is fed to the linear head. Any shape holding one index per
                sequence is accepted (it is flattened).
            labels: optional float targets, one per sequence. When omitted no
                loss is computed.

        Returns:
            ``SequenceClassifierOutputWithPast`` with ``logits`` of shape
            ``(batch,)`` and ``loss`` (``None`` when ``labels`` is ``None``).
        """
        # Only the final layer is read, so the per-layer hidden states are not
        # requested (the original asked for them and then ignored them).
        outputs = self.model(input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state

        # Flatten pos to exactly one index per row. Unlike squeeze(), this keeps
        # the batch dimension for a batch of one, so logits are always (batch,).
        pos = pos.reshape(-1)
        batch_index = torch.arange(hidden_state.shape[0], device=hidden_state.device)
        pooled = hidden_state[batch_index, pos, :]
        logits = self.linear(pooled).squeeze(-1)

        loss = None
        if labels is not None:
            # Callers may pass a 0-d label for a single example; align it with
            # the logits because BCEWithLogitsLoss requires equal shapes.
            loss = self.loss_function(logits, labels.reshape(logits.shape))

        results = transformers.modeling_outputs.SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
        )
        return results


def convert_choice(choice):
    """Return ``choice`` with its first character lower-cased.

    Slicing (``choice[:1]``) is used instead of indexing so that an empty
    string is returned unchanged rather than raising ``IndexError``.
    """
    return choice[:1].lower() + choice[1:]


def convert_premise(premise):
    """Strip surrounding whitespace and remove one trailing full stop.

    The final character is dropped only when it actually is a ``.``; text that
    does not end with a full stop is returned (stripped) without losing its
    last character.
    """
    text = premise.strip()
    if text.endswith("."):
        return text[:-1]
    return text


def concat_premise_choice(datapoint, return_text_only=False):
    """Turn one example into two "<effect> because <cause>" statements.

    Args:
        datapoint: mapping with keys "premise", "choice1", "choice2" and
            "question" ("cause" or "effect"); "label" is also required unless
            ``return_text_only`` is True.
        return_text_only: when True, return just the two strings.

    Returns:
        A tuple ``(statement_1, statement_2)`` when ``return_text_only`` is
        True; otherwise a list of two dicts with keys "relation" and "label",
        where the statement built from the correct choice has label 1.

    Raises:
        ValueError: if "question" is neither "cause" nor "effect".
    """
    premise = datapoint["premise"]
    choice1 = datapoint["choice1"]
    choice2 = datapoint["choice2"]
    question = datapoint["question"]

    if question == "cause":
        # The choices are candidate causes: "<premise> because <choice>".
        causal_relation_1 = convert_premise(premise) + " because " + convert_choice(choice1)
        causal_relation_2 = convert_premise(premise) + " because " + convert_choice(choice2)
    elif question == "effect":
        # The choices are candidate effects: "<choice> because <premise>".
        causal_relation_1 = convert_premise(choice1) + " because " + convert_choice(premise)
        causal_relation_2 = convert_premise(choice2) + " because " + convert_choice(premise)
    else:
        raise ValueError(
            'question must be "cause" or "effect", got {!r}'.format(question)
        )

    if return_text_only:
        return causal_relation_1, causal_relation_2

    # The label is only read here, so text-only callers may pass unlabeled data.
    label = datapoint["label"]
    return [
        {"relation": causal_relation_1, "label": 1-label},
        {"relation": causal_relation_2, "label": label}
    ]


def tokenization(tokenizer, data):
    """Tokenize and manually pad both causal statements of every example.

    Each example contributes two consecutive rows (choice 1, then choice 2).
    Every row gets one end-of-sequence token appended directly after its real
    tokens; that token is attended to and its index is recorded in "pos". The
    rest of the row is filled with the same id but masked out, so all rows end
    up with length ``max_length + 1``.

    Args:
        tokenizer: tokenizer that is callable on a list of strings and supports
            ``return_length=True``.
        data: iterable of dicts with the keys used by ``concat_premise_choice``
            plus "label".

    Returns:
        Dict of tensors: "input_ids", "attention_mask" and "pos" (long) and
        "label" (float; 1 marks the statement built from the correct choice).
    """
    # Use the tokenizer's own end-of-sequence id; fall back to the literal the
    # notebook originally hard-coded when the tokenizer does not define one.
    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    if eos_token_id is None:
        eos_token_id = 50256

    inputs = []
    labels = []
    for example in data:
        inputs.extend(concat_premise_choice(example, return_text_only=True))
        labels += [0, 1] if example['label'] == 1 else [1, 0]

    outputs = tokenizer(inputs, return_length=True)
    input_ids = outputs['input_ids']
    attention_mask = outputs['attention_mask']
    max_length = max(outputs['length'])

    pos = []
    for ids, mask in zip(input_ids, attention_mask):
        # +1 leaves room for the appended end-of-sequence token.
        gap = max_length - len(ids) + 1
        # Index of the first appended token, i.e. the one that stays unmasked.
        pos.append(len(ids))
        ids += [eos_token_id] * gap
        mask += [1] + [0] * (gap - 1)

    return {
        "input_ids": torch.LongTensor(input_ids),
        "attention_mask": torch.LongTensor(attention_mask),
        "pos": torch.LongTensor(pos),
        "label": torch.FloatTensor(labels),
    }


def compute_metrics(eval_predictions):
    """Metric callback for the Hugging Face Trainer.

    ``eval_predictions`` unpacks into ``(predictions, labels)``: two flat
    arrays covering the whole dev set, in which rows 2k and 2k+1 belong to the
    two choices of example k. For both arrays the index (0 or 1) of the larger
    entry in each pair is taken, and the two index lists are handed to the
    module-level ``metric``.
    """
    def pairwise_argmax(flat_values):
        # Column 0 holds the even rows (choice 1), column 1 the odd rows (choice 2).
        first_choice = torch.FloatTensor(flat_values[::2])
        second_choice = torch.FloatTensor(flat_values[1::2])
        paired = torch.stack((first_choice, second_choice), dim=1)
        return torch.argmax(paired, 1).tolist()

    predictions, labels = eval_predictions
    predicted_choice = pairwise_argmax(predictions)
    correct_choice = pairwise_argmax(labels)
    return metric.compute(predictions=predicted_choice, references=correct_choice)


def process_choice(choice, device=None):
    """Turn one tokenized row into a batch of one for the model (used in evaluate_model).

    Args:
        choice: dict with "input_ids" and "attention_mask" (lists of ints),
            "pos" (int) and "label" (number).
        device: device for the tensors. Defaults to CUDA when it is available
            and to the CPU otherwise, so the helper no longer crashes on
            machines without a GPU.

    Returns:
        Dict of keyword arguments for the model: "input_ids" and
        "attention_mask" of shape (1, seq_len), "pos" of shape (1, 1) and a
        0-d float "labels" tensor.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_ids = torch.tensor(choice["input_ids"], dtype=torch.long, device=device).unsqueeze(0)
    attention_mask = torch.tensor(choice["attention_mask"], dtype=torch.long, device=device).unsqueeze(0)
    pos = torch.tensor([choice["pos"]], dtype=torch.long, device=device).unsqueeze(0)
    labels = torch.tensor([choice["label"]], dtype=torch.float, device=device).squeeze()
    return {"input_ids": input_ids, "attention_mask": attention_mask, "pos": pos, "labels": labels}


@torch.no_grad()
def evaluate_model(model, tokenized_data):
    """Score every example one choice at a time and report accuracy.

    Rows 2k and 2k+1 of ``tokenized_data`` are the two choices of example k.
    The choice with the larger logit is the prediction; the "label" of the
    second row (1 when choice 2 is correct) serves as the gold index.

    The model is switched to evaluation mode for the duration of the call, so
    dropout cannot perturb the scores, and its previous mode is restored
    afterwards.

    Returns:
        ``(classification_report_text, accuracy)``.
    """
    was_training = model.training
    model.eval()
    y_pred = []
    y_true = []
    try:
        for i in range(0, tokenized_data.num_rows, 2):
            choice1 = tokenized_data[i]
            choice2 = tokenized_data[i+1]
            pred1 = model(**process_choice(choice1)).logits.item()
            pred2 = model(**process_choice(choice2)).logits.item()
            y_pred.append(torch.argmax(torch.FloatTensor([pred1, pred2])).item())
            y_true.append(choice2["label"])
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return classification_report(y_true, y_pred), accuracy_score(y_true, y_pred)

## Run

In [6]:
# Build tokenized_train / tokenized_dev for the configured dataset.
if dataset == "copa":
    copa = datasets.load_dataset("super_glue", "copa")
    # convert to hf Dataset object
    tokenized_train = datasets.Dataset.from_dict(
        tokenization(tokenizer, copa["train"])
    )
    tokenized_dev = datasets.Dataset.from_dict(
        tokenization(tokenizer, copa["validation"])
    )

elif dataset == "ecare":
    # Read the two JSON-lines splits straight out of the downloaded archive.
    with zipfile.ZipFile("e-CARE.zip") as z:
        with z.open("dataset/train_full.jsonl") as f:
            train_df = pd.read_json(f, lines=True)
        with z.open("dataset/dev_full.jsonl") as f:
            dev_df = pd.read_json(f, lines=True)

    # rename columns same as copa (reassigning instead of inplace=True, so the
    # frames read above are never mutated)
    rel2fields = {"ask-for": "question", "hypothesis1": "choice1", "hypothesis2": "choice2", "index": "idx"}
    train_df = train_df.rename(columns=rel2fields)
    dev_df = dev_df.rename(columns=rel2fields)

    # convert to hf Dataset object
    tokenized_train = datasets.Dataset.from_dict(
        tokenization(tokenizer, train_df.to_dict("records"))
    )
    tokenized_dev = datasets.Dataset.from_dict(
        tokenization(tokenizer, dev_df.to_dict("records"))
    )

else:
    # Fail here with a clear message instead of a NameError on
    # tokenized_train in a later cell.
    raise ValueError(
        'Unknown dataset {!r}; expected "copa" or "ecare".'.format(dataset)
    )

In [7]:
# Pretrained GPT-2 body plus a newly created one-logit head.
model = GPT2Discriminate(model_checkpoint)

# NOTE(review): TrainingArguments has its own `seed` argument, which is not set
# here; presumably the Trainer's default seed, not global_seed, governs
# shuffling during training - confirm before relying on global_seed alone.
training_args = TrainingArguments(
    # Optimisation
    learning_rate=lr,
    num_train_epochs=epochs,
    weight_decay=1e-2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # fp16=True,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the highest dev accuracy when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model='accuracy',
    greater_is_better=True,
    load_best_model_at_end=True,
    output_dir="./results",
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
)

trainer.train()

***** Running training *****
  Num examples = 29856
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2799
  Number of trainable parameters = 124440577


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7048,0.694156,0.615928
2,0.6775,0.664518,0.65787
3,0.6462,0.650831,0.669651


***** Running Evaluation *****
  Num examples = 4244
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-933
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 4244
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1866
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 4244
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-2799
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-2799 (score: 0.6696512723845429).


TrainOutput(global_step=2799, training_loss=0.6729283579846458, metrics={'train_runtime': 1319.2056, 'train_samples_per_second': 67.895, 'train_steps_per_second': 2.122, 'total_flos': 0.0, 'train_loss': 0.6729283579846458, 'epoch': 3.0})

In [8]:
# Re-score the dev set pair by pair with the model restored by the Trainer,
# then show the per-class report followed by the overall accuracy.
classif_report, acc = evaluate_model(model, tokenized_dev)
print(classif_report, "Accuracy: {}".format(acc), sep="\n")

              precision    recall  f1-score   support

         0.0       0.67      0.66      0.67      1061
         1.0       0.67      0.68      0.67      1061

    accuracy                           0.67      2122
   macro avg       0.67      0.67      0.67      2122
weighted avg       0.67      0.67      0.67      2122

Accuracy: 0.6696512723845429
