# DPO Dataset Building - Ultrafeedback

For a list of grammatically incorrect sentences sampled from the train dataset, generate a chosen and a rejected correction using LLMs.

- Chosen: GPT-4
- Rejected: Mixtral 8x7B Instruct

## Downloading required packages

In [None]:
!pip install datasets openai

## Importing required modules

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, concatenate_datasets
from openai import OpenAI
import requests
from huggingface_hub import login as hf_login
from tqdm import tqdm

## Helper Functions

### GPT-4

In [None]:
# Read the key with getpass instead of input() so the secret is not echoed
# into the cell output (and therefore not saved with the notebook).
from getpass import getpass

openai_api_key = getpass("Enter OpenAI API key: ")
openai_client = OpenAI(api_key=openai_api_key)

In [None]:
chosen_list = []


def get_gpt4_outputs(start, size):
    """Append GPT-4 corrections for a slice of ``train_dataset`` to ``chosen_list``.

    One chat completion is requested per row, in dataset order, and the
    stripped reply is appended to the module-level ``chosen_list``.

    Args:
        start: Index of the first ``train_dataset`` row to correct.
        size: Maximum number of consecutive rows to correct.

    Raises:
        RuntimeError: If a completion comes back without text content.
            ``chosen_list`` keeps everything collected so far, so the run can
            be resumed from the row index given in the message.
    """
    prompt = "Rewrite the given text without grammatical, spelling and punctuation errors. Make as few corrections as possible. Give only the corrected version of the text."

    texts = train_dataset[start: start + size]['text']

    # Size the bar from the rows actually selected: the slice is shorter than
    # `size` when it runs past the end of the dataset.
    with tqdm(total=len(texts)) as pbar:
        for offset, txt in enumerate(texts):
            completion = openai_client.chat.completions.create(
                model="gpt-4-turbo-preview",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": txt.strip()}
                ],
                temperature=0.5,
                max_tokens=512,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
            )

            content = completion.choices[0].message.content
            if content is None:
                # The message content is optional; report which row failed
                # instead of dying with an AttributeError on `.strip()`.
                raise RuntimeError(
                    f"GPT-4 returned no text for train_dataset row {start + offset}"
                )

            chosen_list.append(content.strip())
            pbar.update(1)

### Mixtral 8x7B Instruct

In [None]:
# Read the token with getpass instead of input() so the secret is not echoed
# into the cell output (and therefore not saved with the notebook).
from getpass import getpass

hf_token = getpass("Enter HuggingFace token: ")

In [None]:
MISTRAL_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"
headers = {"Authorization": f"Bearer {hf_token}"}

rejected_list = []


def _query_mistral(payload, max_retries, retry_delay, timeout):
    """POST ``payload`` to the inference endpoint until it yields a non-empty list.

    Any other outcome (network error, non-JSON body, error dict, empty list)
    counts as a failed attempt. Failed attempts are retried with a doubling
    delay capped at 60 seconds.

    Raises:
        RuntimeError: If no attempt out of ``max_retries`` succeeds.
    """
    import time

    delay = retry_delay
    last_problem = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(
                MISTRAL_API_URL, headers=headers, json=payload, timeout=timeout
            ).json()
        except (requests.RequestException, ValueError) as err:
            # ValueError covers a body that is not valid JSON.
            last_problem = err
        else:
            if isinstance(response, list) and response:
                return response
            last_problem = response

        if attempt < max_retries:
            time.sleep(delay)
            delay = min(delay * 2, 60)

    raise RuntimeError(
        f"No usable response from {MISTRAL_API_URL} after {max_retries} attempts; "
        f"last result: {last_problem!r}"
    )


def get_mistral_outputs(start, size, max_retries=10, retry_delay=5.0, timeout=120):
    """Append Mixtral corrections for a slice of ``train_dataset`` to ``rejected_list``.

    One request is sent per row, in dataset order, and the stripped generation
    is appended to the module-level ``rejected_list``.

    Args:
        start: Index of the first ``train_dataset`` row to correct.
        size: Maximum number of consecutive rows to correct.
        max_retries: Attempts per row before giving up.
        retry_delay: Seconds to wait after the first failed attempt; the wait
            doubles after each further failure, capped at 60 seconds.
        timeout: Per-request timeout in seconds.

    Raises:
        RuntimeError: If a row gets no usable response within ``max_retries``
            attempts. ``rejected_list`` keeps everything collected so far.
    """
    prompt = "Rewrite the given text without grammatical, spelling and punctuation errors. Make as few corrections as possible. The text to be corrected begins after 'Text:'. Give only the corrected version of the text. Text: "

    texts = train_dataset[start: start + size]['text']

    # Size the bar from the rows actually selected: the slice is shorter than
    # `size` when it runs past the end of the dataset.
    with tqdm(total=len(texts)) as pbar:
        for txt in texts:
            input_text = f"<s>[INST]{prompt}{txt.strip()}[/INST]"
            payload = {
                "inputs": input_text,
                "parameters": {
                    "max_new_tokens": 512
                }
            }
            response = _query_mistral(payload, max_retries, retry_delay, timeout)

            generated = response[0]['generated_text']
            # Drop the echoed prompt only when it is really there, so a
            # response without the echo is not truncated.
            if generated.startswith(input_text):
                generated = generated[len(input_text):]

            rejected_list.append(generated.strip())
            pbar.update(1)

## Building the DPO dataset

### Get the WI-LOCNESS dataset

In [None]:
# Load the 'wi' configuration of the "wi_locness" dataset.
raw_datasets = load_dataset("wi_locness", 'wi')

In [None]:
# Carve 10% of the original train split off as a test split; the seed is
# fixed so the split is the same on every run.
dataset_dict = raw_datasets["train"].train_test_split(seed=0, test_size=0.1)
raw_datasets["train"], raw_datasets["test"] = (
    dataset_dict["train"],
    dataset_dict["test"],
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t")
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'unk_token': '[UNK]'})


def _within_token_limit(example):
    """Return True when the example's text encodes to at most 450 tokens."""
    token_ids = tokenizer.encode(example["text"])
    return len(token_ids) <= 450


# Drop every example whose text is longer than the token limit.
raw_datasets = raw_datasets.filter(_within_token_limit)

In [None]:
# Work on the train split with the id / userid / cefr / edits columns removed.
unused_columns = ["id", "userid", "cefr", "edits"]
train_dataset = raw_datasets["train"].remove_columns(column_names=unused_columns)

### Build dataset

In [None]:
# Slice of train_dataset to process: rows START .. START + DATASET_SIZE - 1.
START = 0
DATASET_SIZE = 2000

In [None]:
# Log in to the Hugging Face Hub (huggingface_hub.login).
hf_login()

In [None]:
# Collect the "rejected" completions for the selected rows into rejected_list.
get_mistral_outputs(START, DATASET_SIZE)

In [None]:
# Collect the "chosen" completions for the selected rows into chosen_list.
get_gpt4_outputs(START, DATASET_SIZE)

In [None]:
# Preference-dataset columns: the source text is the prompt, the GPT-4 output
# is "chosen" and the Mixtral output is "rejected".
source_texts = train_dataset[START: START + DATASET_SIZE]['text']
dpo_dataset_dict = dict(
    chosen=chosen_list,
    prompt=source_texts,
    rejected=rejected_list,
)

dpo_dataset = Dataset.from_dict(dpo_dataset_dict)

In [None]:
# Show the dataset's repr as the cell output.
dpo_dataset

### Overwrite the dataset

In [None]:
# Push only the newly built rows to the Hub repository.
dpo_dataset.push_to_hub(repo_id = "AY2324S2-CS4248-Team-47/gec-dpo-ultrafeedback")

### Append to the dataset

In [None]:
# Fetch the train split already published on the Hub and place the newly
# built rows after it.
published = load_dataset("AY2324S2-CS4248-Team-47/gec-dpo-ultrafeedback")
existing_dataset = published['train']
new_dataset = concatenate_datasets([existing_dataset, dpo_dataset])

In [None]:
# Push the combined (existing + new) rows to the Hub repository.
new_dataset.push_to_hub(repo_id = "AY2324S2-CS4248-Team-47/gec-dpo-ultrafeedback")