# Commonsense Causal Reasoning

In [1]:
import os
import wandb
import torch
import numpy as np
from dataclasses import dataclass
from typing import Optional, Union
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

# Tokenizer for the "bert-base-uncased" checkpoint; created after all imports
# so the import block stays contiguous.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

## Load COPA Dataset

In [2]:
# Load the "copa" configuration of the "super_glue" dataset; the returned
# object is indexed by split name ("train", "validation", "test") in later cells.
copa = load_dataset("super_glue", "copa")

Found cached dataset super_glue (/root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Inspect the first training example to see which fields each record carries
# (premise, choice1, choice2, question, idx, label).

copa["train"][0]

{'premise': 'My body cast a shadow over the grass.',
 'choice1': 'The sun was rising.',
 'choice2': 'The grass was cut.',
 'question': 'cause',
 'idx': 0,
 'label': 0}

In [4]:
# Number of examples in each split, printed in train / validation / test order
for split_name in ("train", "validation", "test"):
    print(len(copa[split_name]))

400
100
500


## Preprocessing

See [Multiple choice](https://huggingface.co/docs/transformers/tasks/multiple_choice).

Here, we use `AutoModelForMultipleChoice` as the baseline. The model receives one query sentence paired with each of several candidate sentences and predicts which candidate is the correct answer by classifying over the candidates. We build the query from `premise` plus a connective chosen by `question` ("because" for `cause`, "so" for `effect`), and use `choice1` and `choice2` as the candidates.

Example 1:

```python
{'premise': 'My body cast a shadow over the grass.',
 'choice1': 'The sun was rising.',
 'choice2': 'The grass was cut.',
 'question': 'cause',
 'idx': 0,
 'label': 0}
```

- `query`: my body cast a shadow over the grass because
- `candidates1`: the sun was rising.
- `candidates2`: the grass was cut.

---

Example 2:

```python
{'premise': 'The elderly woman suffered a stroke.',
 'choice1': "The woman's daughter came over to clean her house.",
 'choice2': "The woman's daughter moved in to take care of her.",
 'question': 'effect',
 'idx': 11,
 'label': 1}
```

- `query`: the elderly woman suffered a stroke so
- `candidates1`: the woman's daughter came over to clean her house.
- `candidates2`: the woman's daughter moved in to take care of her.

In [5]:
def preprocess_function(examples):
    """Tokenize a batch of COPA examples as (query, candidate) sentence pairs.

    For every example the query is the premise with its final character
    removed, followed by " because" when ``question == "cause"`` and " so"
    otherwise. The query is paired once with ``choice1`` and once with
    ``choice2``.

    Args:
        examples: A batch in column form, i.e. a mapping with the keys
            ``"question"``, ``"premise"``, ``"choice1"`` and ``"choice2"``,
            each holding a list with one entry per example.

    Returns:
        A dict with the same keys the tokenizer returns, where each value is
        regrouped into one 2-element list (one entry per choice) per example.
    """
    question_headers = examples["question"]
    premises = examples["premise"]

    # Build the flattened query list directly: each query appears twice in a
    # row, once for each candidate answer.
    first_sentences = []
    for premise, header in zip(premises, question_headers):
        connective = "because" if header == "cause" else "so"
        # premise[:-1] drops the last character; this assumes every premise
        # ends with a single punctuation mark, as in the examples shown above.
        query = f"{premise[:-1]} {connective}"
        first_sentences.extend((query, query))

    # Candidates in the matching order: choice1 then choice2 for each example.
    second_sentences = [
        examples[end][i]
        for i in range(len(question_headers))
        for end in ("choice1", "choice2")
    ]

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten: consecutive pairs of encodings belong to the same example.
    return {k: [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [6]:
# Apply preprocess_function to the dataset in batches (batched=True passes
# column-wise batches of examples rather than single records).
tokenized_copa = copa.map(preprocess_function, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-3bbe2bef9bd51d6d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-f96e948ad1ff9d46.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-8894e2f611330f6f.arrow


In [7]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each feature is expected to map every tokenizer field (at least
    ``"input_ids"``) to a list with one entry per answer choice, and to carry
    the gold answer under ``"label"`` or ``"labels"``.

    Calling the collator returns a dict of tensors in which every tokenizer
    field is reshaped to ``(batch_size, num_choices, -1)`` and ``"labels"`` is
    an ``int64`` tensor with one entry per feature. The input feature dicts
    are left unmodified.
    """

    # Tokenizer whose ``pad`` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them, so the caller's dicts are not
        # mutated and the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten to one dict per (example, choice) pair, skipping the label,
        # so that all choices can be padded to a common length in one call.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choice, sequence) layout.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

## Compute Metrics

In [8]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

In [9]:
def compute_metrics(eval_pred):
    """Score model outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-choice scores per example.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max choice of each
        row compared against ``labels``.
    """
    scores, gold_labels = eval_pred
    # Pick the highest-scoring choice along axis 1 (the choice axis).
    chosen = np.asarray(scores).argmax(axis=1)
    return accuracy.compute(predictions=chosen, references=gold_labels)

## Fine-tuning and Evaluation

In [10]:
# Initialize a multiple-choice model from the "bert-base-uncased" checkpoint.
# The warning printed below is expected: some checkpoint weights are unused
# and some multiple-choice weights are newly initialized, so the model must
# be fine-tuned before its predictions are meaningful.
model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [11]:
from transformers.trainer_callback import PrinterCallback

In [12]:
# Log in to the wandb account.
# SECURITY: the API key is read from the WANDB_API_KEY environment variable
# and must never be written into the notebook. Any key that was previously
# committed here should be revoked and regenerated. When the variable is not
# set, wandb.login falls back to stored credentials or an interactive prompt.
wandb_api_key = os.environ.get("WANDB_API_KEY")
# Use the Python API instead of interpolating the secret into a shell command.
wandb.login(key=wandb_api_key)
wandb.init(project="machine-learning-copa", entity="yiyang-feng")
wandb.run.name = "bert-50eps-16bsz-copa"

# Evaluate on the validation split after every epoch and report to wandb.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_copa["train"],
    eval_dataset=tokenized_copa["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics = compute_metrics
)

# Drop the plain-text printer callback before training starts.
trainer.remove_callback(PrinterCallback)

trainer.train()

wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwind-like[0m ([33myiyang-feng[0m). Use [1m`wandb login --relogin`[0m to force relogin


The following columns in the training set don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, idx, question, choice2, premise. If choice1, idx, question, choice2, premise are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 400
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 350
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692103,0.54
2,No log,0.688388,0.58
3,No log,0.684051,0.55
4,No log,0.68327,0.54
5,No log,0.708129,0.57
6,No log,0.748575,0.57
7,No log,0.795999,0.55
8,No log,0.987042,0.59
9,No log,0.886384,0.62
10,No log,1.001838,0.59


The following columns in the evaluation set don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, idx, question, choice2, premise. If choice1, idx, question, choice2, premise are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, idx, question, choice2, premise. If choice1, idx, question, choice2, premise are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: choice1, idx, question, choice2, premise. If choice1, idx, question, choice2, premise are not 

TrainOutput(global_step=350, training_loss=0.096861572265625, metrics={'train_runtime': 127.0146, 'train_samples_per_second': 157.462, 'train_steps_per_second': 2.756, 'total_flos': 515380596691392.0, 'train_loss': 0.096861572265625, 'epoch': 50.0})

## Test Performance



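The test split only has placeholder labels (`-1`), so I cannot use `trainer.predict()` on it. Instead, the cell below runs the fine-tuned model on each test example directly and writes the predicted choice indices to `predictions.txt`.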

In [13]:
predictions = []

# Inference only: disable dropout and gradient tracking.
model.eval()
# Run on whatever device the model currently lives on instead of assuming CUDA.
device = model.device

with torch.no_grad():
    for example in tokenized_copa["test"]:
        # Same query construction as preprocess_function: "because" for cause
        # questions and "so" for everything else, so `prompt` is always bound.
        connective = "because" if example["question"] == "cause" else "so"
        prompt = f"{example['premise'][:-1]} {connective}"
        choice1 = example["choice1"]
        choice2 = example["choice2"]
        inputs = tokenizer(
            [[prompt, choice1], [prompt, choice2]],
            return_tensors="pt",
            padding=True,
            truncation=True,  # match the truncation used during training
        )
        # unsqueeze(0) adds the batch axis: (num_choices, seq) -> (1, num_choices, seq)
        outputs = model(**{k: v.unsqueeze(0).to(device) for k, v in inputs.items()})
        # Index of the highest-scoring choice (0 -> choice1, 1 -> choice2).
        predicted_class = outputs.logits.argmax(dim=-1).item()
        predictions.append(predicted_class)

# Write one "<index>\t<prediction>" row per test example, preceded by a header.
output_predict_file = os.path.join("./", "predictions.txt")
if trainer.is_world_process_zero():
    with open(output_predict_file, "w") as writer:
        writer.write("index\tprediction\n")
        for index, item in enumerate(predictions):
            # `item` already is the predicted class for this row; it must not
            # be used as an index into `predictions` again.
            writer.write(f"{index}\t{item}\n")