In [None]:
# Benchmark to fine-tune on; the data-loading cell has branches for "copa" and "ecare".
dataset = "ecare"
# model_checkpoint = "microsoft/deberta-v3-base"
# Checkpoint name passed to both the tokenizer and the classification model.
model_checkpoint = "bert-base-cased"
# Seed handed to random / numpy / torch in the setup cell; None skips that seeding.
global_seed = 139
# Forwarded to TrainingArguments as num_train_epochs.
epochs = 3
# Forwarded to TrainingArguments as learning_rate.
lr = 5e-5

These notebooks are best run on Colab. Because of this, I made each notebook as independent and self-contained as possible, even if they have overlapping code. These notebooks assume a GPU is available.

To replicate a run, set the variables in the first code cell to one of the following configurations:

* BERT COPA:
```
dataset = "copa"
model_checkpoint = "bert-base-cased"
global_seed = 139
epochs = 10
lr = 5e-5
```

* BERT e-CARE:
```
dataset = "ecare"
model_checkpoint = "bert-base-cased"
global_seed = 139
epochs = 3
lr = 5e-5
```

* DeBERTa-v3 COPA:
```
dataset = "copa"
model_checkpoint = "microsoft/deberta-v3-base"
global_seed = 139
epochs = 10
lr = 5e-5
```

* DeBERTa-v3 e-CARE:
```
dataset = "ecare"
model_checkpoint = "microsoft/deberta-v3-base"
global_seed = 139
epochs = 3
lr = 5e-5
```

## Imports and setup

In [None]:
!pip install -qqq transformers[sentencepiece] datasets evaluate
!wget https://github.com/Waste-Wood/e-CARE/files/8242580/e-CARE.zip

--2022-12-21 01:46:36--  https://github.com/Waste-Wood/e-CARE/files/8242580/e-CARE.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-repository-file-5c1aeb/465962344/8242580?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221221T014636Z&X-Amz-Expires=300&X-Amz-Signature=ba839561d9e00c931d8a54b76cbc015393b579c57c1e11a3f614a76c6ad82b32&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=465962344&response-content-disposition=attachment%3Bfilename%3De-CARE.zip&response-content-type=application%2Fzip [following]
--2022-12-21 01:46:36--  https://objects.githubusercontent.com/github-production-repository-file-5c1aeb/465962344/8242580?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221221%2Fus-east-1%

In [None]:
import datasets
import evaluate
from transformers import Trainer, AutoModelForSequenceClassification, TrainingArguments
from transformers import AutoTokenizer
import transformers

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

import random
import zipfile
from dataclasses import dataclass
from typing import Optional, Union

In [None]:
# Scorer used by compute_metrics: the 'copa' configuration of the 'super_glue' metric.
metric = evaluate.load('super_glue', 'copa')
# Tokenizer that matches the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Seed Python, NumPy, torch (CPU) and torch (CUDA), in that order, unless seeding is disabled.
if global_seed is not None:
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_rng(global_seed)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

## Function definitions

In [None]:
def convert_choice(choice):
    """De-capitalizes the first character of the sentence.

    Args:
        choice: sentence text (str).

    Returns:
        The text with its first character lower-cased and the rest untouched.
        An empty string is returned unchanged.
    """
    # Slicing instead of indexing keeps this safe for empty input.
    return choice[:1].lower() + choice[1:]


def convert_premise(premise):
    """Removes the sentence-final punctuation mark at the end of the sentence.

    Surrounding whitespace is stripped first. Exactly one trailing ".", "!" or
    "?" is removed; a sentence with no terminal punctuation is returned intact,
    so it never loses its last letter.

    Args:
        premise: sentence text (str).

    Returns:
        The stripped sentence without its final punctuation mark.
    """
    sentence = premise.strip()
    if sentence.endswith((".", "!", "?")):
        return sentence[:-1]
    return sentence


def concat_premise_choice(datapoint, return_text_only=False):
    """Builds two "<effect> because <cause>" statements from one example.

    Each example (premise, choice1, choice2, question, label) is split into
    two datapoints: (statement with choice1, 1-label) and
    (statement with choice2, label).

    Args:
        datapoint: mapping with keys "premise", "choice1", "choice2" and
            "question" ("cause" or "effect"). "label" is also required unless
            return_text_only is True; it is expected to be 0 or 1, so that
            `1 - label` and `label` mark which statement is the correct one.
        return_text_only: when True, return just the two statement strings.

    Returns:
        A (statement_1, statement_2) tuple when return_text_only is True,
        otherwise a list of two {"relation": str, "label": int} dicts.

    Raises:
        ValueError: if "question" is neither "cause" nor "effect".
    """
    question = datapoint["question"]
    # Fail early with a readable message instead of an UnboundLocalError further down.
    if question not in ("cause", "effect"):
        raise ValueError(
            "Unknown question type {!r}; expected 'cause' or 'effect'".format(question)
        )

    premise = datapoint["premise"]
    choices = (datapoint["choice1"], datapoint["choice2"])

    # The concatenation order depends on the question type; only the 'because'
    # connector is used, to keep the model inputs simple.
    if question == "cause":
        # the choice is the cause: "<premise> because <choice>"
        causal_relation_1, causal_relation_2 = (
            convert_premise(premise) + " because " + convert_choice(choice)
            for choice in choices
        )
    else:
        # the choice is the effect: "<choice> because <premise>"
        causal_relation_1, causal_relation_2 = (
            convert_premise(choice) + " because " + convert_choice(premise)
            for choice in choices
        )

    if return_text_only:
        return causal_relation_1, causal_relation_2

    # The label is only looked up when it is needed, so unlabeled examples
    # can still be rendered as text.
    label = datapoint["label"]
    return [
        {"relation": causal_relation_1, "label": 1 - label},
        {"relation": causal_relation_2, "label": label},
    ]


def preprocess(examples):
    """Tokenizes the "relation" texts of a batch with the notebook-level tokenizer.

    Truncation and padding are both switched on; the tokenizer output is
    returned as-is.
    """
    texts = examples["relation"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


def compute_metrics(eval_predictions):
    """Metric callback for the huggingface Trainer.

    eval_predictions unpacks into (predictions, labels): numpy arrays holding
    the logits and the labels for the whole dev set. Rows are consumed in
    consecutive pairs, row 2k for choice1 and row 2k+1 for choice2 of the
    same example.
    """
    logits, gold = eval_predictions

    def pick_from_pairs(values):
        """Pairs entry 2k with entry 2k+1 and returns, per pair, the index (0 or 1) of the larger one."""
        first = torch.FloatTensor(values[::2])
        second = torch.FloatTensor(values[1::2])
        return torch.stack((first, second), dim=1).argmax(dim=1).tolist()

    # the model output has two columns per row; only the class-1 logit is compared
    predict_labels = pick_from_pairs(logits[:, 1])
    true_labels = pick_from_pairs(gold)

    return metric.compute(predictions=predict_labels, references=true_labels)


def get_copa_relations(datset):
    """Gets all the causal relations as one flat list of dicts.

    Args:
        datset: a Dataset object exposing `num_rows` and integer indexing.

    Returns:
        A flat list with two {"relation", "label"} dicts per example, in
        example order (choice1 statement first, then choice2).
    """
    relations = []
    for i in range(datset.num_rows):
        # extend() flattens while iterating; sum(list_of_lists, []) would
        # re-copy the accumulated list on every step (quadratic time).
        relations.extend(concat_premise_choice(datset[i]))
    return relations


def get_ecare_relations(df):
    """Gets all the causal relations as one flat list of dicts.

    Args:
        df: a dataframe whose rows provide the fields read by
            concat_premise_choice.

    Returns:
        A flat list with two {"relation", "label"} dicts per row, in row
        order (choice1 statement first, then choice2).
    """
    relations = []
    for i in range(len(df)):
        # extend() flattens while iterating; sum(list_of_lists, []) would
        # re-copy the accumulated list on every step (quadratic time).
        relations.extend(concat_premise_choice(df.iloc[i, :]))
    return relations


def process_choice(choice, device=None):
    """Used in evaluate_model: turns one tokenized example into model inputs.

    Args:
        choice: dict of lists with (at least) "input_ids" and "attention_mask".
        device: where to place the tensors. Defaults to CUDA when a GPU is
            available and to the CPU otherwise.

    Returns:
        {"input_ids", "attention_mask"}: LongTensors with a leading batch
        dimension of size 1, located on `device`.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # unsqueeze(0) adds the batch dimension for a single example
    input_ids = torch.LongTensor(choice["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.LongTensor(choice["attention_mask"]).unsqueeze(0).to(device)
    return {"input_ids": input_ids, "attention_mask": attention_mask}


@torch.no_grad()
def evaluate_model(model, tokenized_data):
    """Scores every (choice1, choice2) pair and reports accuracy.

    Rows of tokenized_data are read in consecutive pairs: row i is the choice1
    statement and row i+1 the choice2 statement of the same example. The
    predicted label is the index of the statement with the larger class-1
    logit; the reference label is the "label" field of the choice2 row.

    The model is put in eval mode for the duration of the call, so dropout
    cannot perturb the scores, and its previous train/eval mode is restored
    afterwards.

    Args:
        model: sequence-classification model whose output exposes `.logits`.
        tokenized_data: dataset exposing `num_rows` and integer indexing, with
            "input_ids", "attention_mask" and "label" in every row.

    Returns:
        (classification report text, accuracy).
    """
    y_pred = []
    y_true = []

    was_training = model.training
    model.eval()
    try:
        for i in range(0, tokenized_data.num_rows, 2):
            choice1 = tokenized_data[i]
            choice2 = tokenized_data[i + 1]
            score1 = model(**process_choice(choice1)).logits[:, 1].item()
            score2 = model(**process_choice(choice2)).logits[:, 1].item()
            y_pred.append(torch.argmax(torch.FloatTensor([score1, score2])).item())
            y_true.append(choice2["label"])
    finally:
        # leave the model in whatever mode the caller had it in
        model.train(was_training)

    return classification_report(y_true, y_pred), accuracy_score(y_true, y_pred)

## Run

In [None]:
# Build the flat list of {"relation", "label"} records for the selected benchmark.
if dataset == "copa":
    copa = datasets.load_dataset("super_glue", "copa")
    train_relations = get_copa_relations(copa["train"])
    dev_relations = get_copa_relations(copa["validation"])

elif dataset == "ecare":
    with zipfile.ZipFile("e-CARE.zip") as z:
        with z.open("dataset/train_full.jsonl") as f:
            train_df = pd.read_json(f, lines=True)
        with z.open("dataset/dev_full.jsonl") as f:
            dev_df = pd.read_json(f, lines=True)

    # Rename columns to be the same as copa
    rel2fields = {"ask-for": "question", "hypothesis1": "choice1", "hypothesis2": "choice2", "index": "idx"}
    train_df = train_df.rename(columns=rel2fields)
    dev_df = dev_df.rename(columns=rel2fields)

    train_relations = get_ecare_relations(train_df)
    dev_relations = get_ecare_relations(dev_df)

else:
    # Without this branch an unsupported value only surfaces later as a NameError.
    raise ValueError(
        "Unsupported dataset {!r}; expected 'copa' or 'ecare'".format(dataset)
    )

# convert to huggingface Dataset objects
train_data = datasets.Dataset.from_dict(
    pd.DataFrame(train_relations).to_dict(orient="list")
)
dev_data = datasets.Dataset.from_dict(
    pd.DataFrame(dev_relations).to_dict(orient="list")
)

# Tokenize; the raw text column is dropped once it has been encoded.
tokenized_train = train_data.map(preprocess, batched=True, remove_columns=["relation"])
tokenized_dev = dev_data.map(preprocess, batched=True, remove_columns=["relation"])

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Load the pretrained checkpoint with a sequence-classification head on top.
# NOTE(review): num_labels is not passed; compute_metrics reads logit column 1,
# so the head is presumably the library's two-label default — confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# best dev accuracy when training finishes.
# NOTE(review): no `seed` is passed here, so the Trainer presumably uses its own
# default seed rather than `global_seed` — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=epochs,
    weight_decay=1e-2,
    load_best_model_at_end=True,
    # presumably the key of the dict returned by compute_metrics — confirm
    metric_for_best_model='accuracy',
    greater_is_better=True,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer, # don't remove this line (the tokenizer files are saved with each checkpoint; presumably also used to collate batches — confirm)
    compute_metrics=compute_metrics,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6863,0.631327,0.696041
2,0.581,0.625981,0.725259
3,0.389,0.712103,0.754948


***** Running Evaluation *****
  Num examples = 4244
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-933
Configuration saved in ./results/checkpoint-933/config.json
Model weights saved in ./results/checkpoint-933/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-933/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-933/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4244
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1866
Configuration saved in ./results/checkpoint-1866/config.json
Model weights saved in ./results/checkpoint-1866/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1866/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1866/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4244
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-2799
Configuration saved in ./results/checkpoint-2799/config.json


TrainOutput(global_step=2799, training_loss=0.5466485658940693, metrics={'train_runtime': 1341.9028, 'train_samples_per_second': 66.747, 'train_steps_per_second': 2.086, 'total_flos': 3539041027883520.0, 'train_loss': 0.5466485658940693, 'epoch': 3.0})

In [None]:
# Score the model on the dev split, one premise/choice pair at a time.
classif_report, acc = evaluate_model(model, tokenized_dev)
# Per-class table first, overall accuracy on the following line.
print(classif_report, "Accuracy: {}".format(acc), sep="\n")

              precision    recall  f1-score   support

           0       0.76      0.74      0.75      1061
           1       0.75      0.77      0.76      1061

    accuracy                           0.75      2122
   macro avg       0.76      0.75      0.75      2122
weighted avg       0.76      0.75      0.75      2122

Accuracy: 0.7549481621112158
