## Imports

In [9]:
import sys
import random
from pathlib import Path
from collections import Counter

import datasets
datasets.logging.set_verbosity_error() # Disable the logging of the datasets library

from fastchat.model import get_conversation_template

# Load the dataset

In [3]:
VERSION = "v1"

In [4]:
!ls -lah datasets | grep {VERSION} | grep qnli

drwxr-xr-x  3 augustas Domain Users  25K Jul 21 22:46 qnli_vicuna_ppo_training_raw_v1
drwxr-xr-x  3 augustas Domain Users  33K Jul 21 22:45 qnli_vicuna_train_raw_v1
-rw-r--r--  1 augustas Domain Users 2.0M Jul 21 23:02 qnli_vicuna_train_v1.parquet


In [5]:
# Set dataset_dict to test_dataset
dataset_dict = datasets.DatasetDict.load_from_disk(
    f"datasets/qnli_vicuna_ppo_training_raw_{VERSION}"
)
dataset_dict

DatasetDict({
    glue/qnli: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 94743
    })
})

In [6]:
sum(dataset.num_rows for dataset in dataset_dict.values())

94743

In [7]:
for dataset_name, dataset in dataset_dict.items():
    print(f"{dataset_name}: {len(Counter(dataset['label']))}")

glue/qnli: 2


## The template

In [10]:
def format_label(label):
    return "No" if label else "Yes"

# Dataset template
def template(example, answer_prefix="Answer:"):
    conv = get_conversation_template("lmsys/vicuna-7b-v1.3")

    message = (
        "Consider the sentence below in triple backticks "
        "and corresponding question. Does the sentence contain enough information "
        "to answer the question? Your answer should be either yes or no.\n\n"
        "Desired format:\n"
        "Answer: <your_answer>\n"
        f"Do not print \"{answer_prefix}\" again, just what you think the answer is.\n\n"
        f"Sentence:\n```\n{example['sentence']}\n```\n"
        f"Question: {example['question']}?\n"
        f"{answer_prefix}"
    )

    conv.append_message(conv.roles[0], message)
    conv.append_message(conv.roles[1], None)

    return conv.get_prompt(), format_label(example["label"])


q, a = template(dataset[2])
print(" ".join([q, a]))

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Consider the sentence below in triple backticks and corresponding question. Does the sentence contain enough information to answer the question? Your answer should be either yes or no.

Desired format:
Answer: <your_answer>
Do not print "Answer:" again, just what you think the answer is.

Sentence:
```
The Nippon Hōsō Kyōkai (NHK, the Japan Broadcasting Corporation) began conducting research to "unlock the fundamental mechanism of video and sound interactions with the five human senses" in 1964, after the Tokyo Olympics.
```
Question: What was the aspect ratio of the NHK Color??
Answer: ASSISTANT: No


In [15]:
%%time

# Reproducibility
random.seed(2023)

ALLOWED_KEYS = ["prompt", "best_response", "original_dataset"]

new_dataset = []

for dataset_name, dataset in dataset_dict.items():
    print(dataset_name)

    for idx, entry in enumerate(dataset):
        new_entry = entry.copy()
        
        # In case we need to know which dataset the entry came from
        new_entry["original_dataset"] = dataset_name

        q, a = template(new_entry)
        new_entry["prompt"] = q

        # We can now change the label to whether the sample is truthful or not
        new_entry["best_response"] = a.strip()

        # Remove all other keys
        new_entry = { k: v for k, v in new_entry.items() if k in ALLOWED_KEYS }

        # Append to the new dataset
        new_dataset.append(new_entry)

glue/qnli


CPU times: user 3.88 s, sys: 69.2 ms, total: 3.95 s
Wall time: 3.95 s


In [16]:
my_dataset = datasets.Dataset.from_list(new_dataset)
my_dataset

Dataset({
    features: ['original_dataset', 'prompt', 'best_response'],
    num_rows: 94743
})

In [17]:
current_idx = 0
my_dataset[current_idx]

{'original_dataset': 'glue/qnli',
 'prompt': 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. USER: Consider the sentence below in triple backticks and corresponding question. Does the sentence contain enough information to answer the question? Your answer should be either yes or no.\n\nDesired format:\nAnswer: <your_answer>\nDo not print "Answer:" again, just what you think the answer is.\n\nSentence:\n```\nNondenominational, evangelical, independent and other churches are on the rise, and constitute a significant part of Protestant Christianity.\n```\nQuestion: What percentage of Christians are Protestant??\nAnswer: ASSISTANT:',
 'best_response': 'No'}

In [21]:
prompt = my_dataset[current_idx]["prompt"]
best_response = my_dataset[current_idx]["best_response"]

print(prompt + " " + best_response)

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Consider the sentence below in triple backticks and corresponding question. Does the sentence contain enough information to answer the question? Your answer should be either yes or no.

Desired format:
Answer: <your_answer>
Do not print "Answer:" again, just what you think the answer is.

Sentence:
```
Nondenominational, evangelical, independent and other churches are on the rise, and constitute a significant part of Protestant Christianity.
```
Question: What percentage of Christians are Protestant??
Answer: ASSISTANT: No


In [18]:
for current_index in range(10, 20):
    print(f"Index: {current_index}")
    print(f"Prompt:\n{my_dataset[current_index]['prompt']}'")
    print(f"Best response: {my_dataset[current_index]['best_response']}")
    print("-" * 50)

# print(f'\'{my_dataset[current_idx]["prompt"]}\'')

Index: 10
Prompt:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Consider the sentence below in triple backticks and corresponding question. Does the sentence contain enough information to answer the question? Your answer should be either yes or no.

Desired format:
Answer: <your_answer>
Do not print "Answer:" again, just what you think the answer is.

Sentence:
```
Whitehead was apparently not particularly close with his mother, as he never mentioned her in any of his writings, and there is evidence that Whitehead's wife, Evelyn, had a low opinion of her.
```
Question: What year was Whitehead born??
Answer: ASSISTANT:'
Best response: No
--------------------------------------------------
Index: 11
Prompt:
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Consider t

In [22]:
Counter(my_dataset["original_dataset"])

Counter({'glue/qnli': 94743})

In [25]:
# my_dataset.to_parquet(f"datasets/qnli_vicuna_ppo_training_{VERSION}.parquet")

In [26]:
!ls -lah datasets | grep {VERSION} | grep qnli

drwxr-xr-x  3 augustas Domain Users  25K Jul 21 22:46 qnli_vicuna_ppo_training_raw_v1
-rw-r--r--  1 augustas Domain Users  19M Jul 21 23:18 qnli_vicuna_ppo_training_v1.parquet
drwxr-xr-x  3 augustas Domain Users  33K Jul 21 22:45 qnli_vicuna_train_raw_v1
-rw-r--r--  1 augustas Domain Users 2.0M Jul 21 23:02 qnli_vicuna_train_v1.parquet
