## Imports

In [11]:
import random
from collections import Counter

import datasets
datasets.logging.set_verbosity_error() # Disable the logging of the datasets library
from datasets import load_dataset_builder, load_dataset

from fastchat.model import get_conversation_template

## Config

In [2]:
BURNS_DATASETS = ["glue:qnli"]

VERSION = f"v1"

SPLIT = "validation"

N_PER_DATASET = 25000 if SPLIT == "train" else 60000

SEED = 42 if SPLIT == "train" else 2023
SEED

2023

## Load and inspect the datasets

In [5]:
dataset_dict = {}
for dataset_path in BURNS_DATASETS:
    print(dataset_path)

    # Parse dataset name
    dataset_name = None    
    if ":" in dataset_path:
        dataset_path, dataset_name = dataset_path.split(":")
    
    
    # Get the most validation-like split
    available_splits = load_dataset_builder(
        dataset_path, name=dataset_name
    ).info.splits.keys()
    split = "validation" if "validation" in available_splits else "test"
    split = split if SPLIT != "train" else "train"
    print(split)

    # Load the dataset
    dataset = load_dataset(
        dataset_path, name=dataset_name, split=split,
    )

    # Get a desired subset of the data
    n = N_PER_DATASET if dataset.num_rows > N_PER_DATASET else dataset.num_rows
    dataset = dataset.shuffle(seed=SEED).select(range(n))

    key = f"{dataset_path}/{dataset_name}" if dataset_name else dataset_path
    dataset_dict[key] = dataset

    print("-----------------------------------")

glue:qnli


validation
-----------------------------------


In [6]:
sum(dataset.num_rows for dataset in dataset_dict.values())

5463

In [7]:
assert all([len(Counter(dataset["label"])) > 1 for dataset in dataset_dict.values()])

In [8]:
for dataset_name, dataset in dataset_dict.items():
    print(f"{dataset_name}: {dataset.num_rows}")

glue/qnli: 5463


In [9]:
for dataset_name, dataset in dataset_dict.items():
    print(f"{dataset_name}: {len(Counter(dataset['label']))}")

glue/qnli: 2


## Get the templates

In [12]:
def format_label(label):
    return "No" if label else "Yes"

# Dataset template
def template(example, answer_prefix="Answer:"):
    conv = get_conversation_template("lmsys/vicuna-7b-v1.3")

    message = (
        "Consider the sentence below in triple backticks "
        "and corresponding question. Does the sentence contain enough information "
        "to answer the question? Your answer should be either yes or no.\n\n"
        "Desired format:\n"
        "Answer: <your_answer>\n"
        f"Do not print \"{answer_prefix}\" again, just what you think the answer is.\n\n"
        f"Sentence:\n```\n{example['sentence']}\n```\n"
        f"Question: {example['question']}?\n"
        f"{answer_prefix}"
    )

    conv.append_message(conv.roles[0], message)
    conv.append_message(conv.roles[1], None)

    return conv.get_prompt(), format_label(example["label"])


q, a = template(dataset[2])
print(" ".join([q, a]))

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Consider the sentence below in triple backticks and corresponding question. Does the sentence contain enough information to answer the question? Your answer should be either yes or no.

Desired format:
Answer: <your_answer>
Do not print "Answer:" again, just what you think the answer is.

Sentence:
```
The receivers, though designed and built by different manufacturers, must conform to the same user interface look-and-feel as all the others.
```
Question: Who is the parent company of NDS??
Answer: ASSISTANT: No


## Form the dataset for the chosen split

In [15]:
%%time

# Reproducibility
random.seed(SEED)

ALLOWED_KEYS = ["text", "label", "original_dataset"]

new_dataset = []

for dataset_name, dataset in dataset_dict.items():
    print(dataset_name)

    for idx, entry in enumerate(dataset):
        new_entry = entry.copy()
        
        # In case we need to know which dataset the entry came from
        new_entry["original_dataset"] = dataset_name

        # Whether the sample will be truthful or not
        is_truthful = random.choice([True, False])

        # Apply the template
        if not is_truthful:
            # Untruthful binary case
            new_entry["label"] = 1 - new_entry["label"]
        new_text = " ".join(template(new_entry))

        new_entry["text"] = new_text

        # We can now change the label to whether the sample is truthful or not
        new_entry["label"] = int(is_truthful)

        # Remove all other keys
        new_entry = { k: v for k, v in new_entry.items() if k in ALLOWED_KEYS }

        # Append to the new dataset
        new_dataset.append(new_entry)

glue/qnli
CPU times: user 441 ms, sys: 24.6 ms, total: 465 ms
Wall time: 526 ms


In [16]:
my_dataset = datasets.Dataset.from_list(new_dataset)
my_dataset

Dataset({
    features: ['label', 'original_dataset', 'text'],
    num_rows: 5463
})

In [17]:
current_idx = 0
my_dataset[current_idx]

{'label': 0,
 'original_dataset': 'glue/qnli',
 'text': 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. USER: Consider the sentence below in triple backticks and corresponding question. Does the sentence contain enough information to answer the question? Your answer should be either yes or no.\n\nDesired format:\nAnswer: <your_answer>\nDo not print "Answer:" again, just what you think the answer is.\n\nSentence:\n```\nHowever, even in this case, BSkyB does not carry any control over the channel\'s content or carriage issues such as picture quality.\n```\nQuestion: Does BSkyB carry any control over a channels content??\nAnswer: ASSISTANT: No'}

In [19]:
for current_index in range(10, 20):
    print(f"label={my_dataset[current_index]['label']}")
    print(my_dataset[current_index]["text"])
    print("---------------------------------")

label=1
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Consider the sentence below in triple backticks and corresponding question. Does the sentence contain enough information to answer the question? Your answer should be either yes or no.

Desired format:
Answer: <your_answer>
Do not print "Answer:" again, just what you think the answer is.

Sentence:
```
A platted town was established there in 1822, a year after the United States gained Florida from Spain; it was named after Andrew Jackson, the first military governor of the Florida Territory and seventh President of the United States.
```
Question: What is the name of the French colony established in 1564??
Answer: ASSISTANT: No
---------------------------------
label=1
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions

In [20]:
Counter(my_dataset["label"]), Counter(my_dataset["original_dataset"])

(Counter({0: 2795, 1: 2668}), Counter({'glue/qnli': 5463}))

In [25]:
# my_dataset.to_parquet(f"datasets/qnli_vicuna_{SPLIT}_{VERSION}.parquet")

In [26]:
!ls -lah datasets | grep {VERSION} | grep qnli

drwxr-xr-x  3 augustas Domain Users  25K Jul 21 22:46 qnli_vicuna_ppo_training_raw_v1
-rw-r--r--  1 augustas Domain Users  19M Jul 21 23:18 qnli_vicuna_ppo_training_v1.parquet
drwxr-xr-x  3 augustas Domain Users  33K Jul 21 22:45 qnli_vicuna_train_raw_v1
-rw-r--r--  1 augustas Domain Users 2.0M Jul 21 23:02 qnli_vicuna_train_v1.parquet
-rw-r--r--  1 augustas Domain Users 1.1M Jul 21 23:26 qnli_vicuna_validation_v1.parquet
