# Commonsense Causal Reasoning Classification Using Prompts

In this notebook, we did a prompt-based CCR classification experiment.


In [2]:
import numpy as np
from datasets import load_dataset, load_metric
import time
import evaluate
from sklearn.metrics import classification_report

In [13]:
import os
import openai

#For different users, you need to replace the api_key to your own in api_key.txt
openai.api_key_path="api_key.txt"
#Or directly use api_key below
#openai.api_key = "your_own_api_key"

## Load COPA Dataset

In [5]:
copa = load_dataset("super_glue", "copa")

Found cached dataset super_glue (C:/Users/gazer/.cache/huggingface/datasets/super_glue/copa/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed)
100%|██████████| 3/3 [00:00<00:00, 752.21it/s]


In [6]:
# See one example

copa["train"][0]

{'premise': 'My body cast a shadow over the grass.',
 'choice1': 'The sun was rising.',
 'choice2': 'The grass was cut.',
 'question': 'cause',
 'idx': 0,
 'label': 0}

In [7]:
# Data size
print(len(copa["train"]))
print(len(copa["validation"]))
print(len(copa["test"]))

400
100
500


## Baseline

With a given datapoint $D$ to test, take the first one in traing set above as an example, 
our baseline prompt setting is like:

> Identify the correct response from two sentences.
>
> Primese: My body cast a shadow over the grass.
> 
> Choice1: The sun was rising.
> 
> Choice2: The grass was cut.
> 
> Question: cause
> 
> Answer:
  
This zero-shot learning prompt is directly sent to openai api to get a result.

In [26]:
def generate_prompt_base(example):
    return """ Identify the correct response from two sentences.
    
    Premise: {}
    Choice1: {}
    Choice2: {}
    Question: {}
    Answer:
    """.format(
        example['premise'],
        example['choice1'],
        example['choice2'],
        example['question'],
    )  

In [27]:
def index(example):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=generate_prompt_base(example),
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response

In [16]:
def generate_pred_label(choices, response):
    ans = response.choices[0].text
    ans = ans.replace("Choice1:", "")
    ans = ans.replace("Choice2:", "")
    ans = ans.strip()
    
    if (ans==choices[0]):
        return 0, ans
    elif (ans==choices[1]):
        return 1, ans
    else: 
        return 1, ans

In [17]:
def test(examples, sleep_time=0.5):
    labels = np.array(examples['label'])
    premises = examples['premise']
    choice1s = examples['choice1']
    choice2s = examples['choice2']
    questions = examples['question']
    y_pred = np.zeros(len(labels))
    
    anss = []
    responses = []
    for i in range(len(labels)):
        example = { 
            'premise': premises[i],
            'choice1': choice1s[i],
            'choice2':choice2s[i],
            'question': questions[i],
        }
        res = index(example)
        choices = [choice1s[i], choice2s[i]]
        y_pred[i], ans = generate_pred_label(choices, res)
        time.sleep(sleep_time)
        responses.append(res)
        anss.append(ans)
    return y_pred, labels, responses, anss



### Example output 

In [28]:
y_pred, y_true, responses, anss= test(copa['validation'][:5])

In [31]:
def show_example_response(r, example, ans):
    print("Sample details:\n", example)
    print("Expected Answer: ", r.choices[0].text)
    print("Answer :", ans)

In [30]:
for i in range(5):
    show_example_response(responses[i], copa['validation'][i], anss[i])
    print('\n\n')

Sample details
 {'premise': 'The man turned on the faucet.', 'choice1': 'The toilet filled with water.', 'choice2': 'Water flowed from the spout.', 'question': 'effect', 'idx': 0, 'label': 1}
Expected Answer 
Choice2: Water flowed from the spout.
Answer : Water flowed from the spout.



Sample details
 {'premise': 'The girl found a bug in her cereal.', 'choice1': 'She poured milk in the bowl.', 'choice2': 'She lost her appetite.', 'question': 'effect', 'idx': 1, 'label': 1}
Expected Answer  She lost her appetite.
Answer : She lost her appetite.



Sample details
 {'premise': 'The woman retired.', 'choice1': 'She received her pension.', 'choice2': 'She paid off her mortgage.', 'question': 'effect', 'idx': 2, 'label': 0}
Expected Answer  She received her pension.
Answer : She received her pension.



Sample details
 {'premise': 'I wanted to conserve energy.', 'choice1': 'I swept the floor in the unoccupied room.', 'choice2': 'I shut off the light in the unoccupied room.', 'question': 'ef

In [25]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 2.11MB/s]
Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 2.52MB/s]
Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 3.69MB/s]
Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 3.40MB/s]


### baseline on validation set 

In [38]:
a, p, r, f = [], [], [], []
for run_time in range(5):
    val_pred = np.zeros(len(copa['validation']))
    val_true = np.zeros(len(copa['validation']))
    val_responses = []
    val_ans = []
    for i in range(5):
        st = i*20
        ed = (i+1)*20
        pred, true, responses, anss= test(copa['validation'][st:ed], sleep_time = 3)
        # time.sleep(2)
        val_pred[st:ed] = pred
        val_true[st:ed] = true
        val_responses.append(responses)
        val_ans.append(anss)
    a.append(accuracy.compute(predictions = val_pred, references = val_true)["accuracy"])
    p.append(precision.compute(predictions = val_pred, references = val_true, average="macro")["precision"])
    r.append(recall.compute(predictions = val_pred, references = val_true, average="macro")["recall"])
    f.append(f1.compute(predictions = val_pred, references = val_true, average="macro")["f1"])

In [13]:
print(classification_report(val_true, val_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.96      0.93        55
         1.0       0.95      0.87      0.91        45

    accuracy                           0.92       100
   macro avg       0.92      0.92      0.92       100
weighted avg       0.92      0.92      0.92       100



In [39]:
print(np.array(a).mean(), np.array(a).std())
print(np.array(p).mean(), np.array(p).std())
print(np.array(r).mean(), np.array(r).std())
print(np.array(f).mean(), np.array(f).std())

0.9200000000000002 0.006324555320336764
0.9239870999361243 0.006559787750021689
0.9155555555555555 0.006407576374607563
0.9184902950437671 0.006434214461826098
