# DPO Dataset Building - Ultrafeedback

For a list of grammatically incorrect sentences sampled from the train dataset, generate a chosen and a rejected correction using LLMs.

- Chosen: GPT-4
- Rejected: Mistral 7B Instruct

## Downloading required packages

In [None]:
!pip install datasets openai

## Importing required modules

In [None]:
import getpass

import requests
from datasets import load_dataset, Dataset
from huggingface_hub import login as hf_login
from openai import OpenAI

## Helper Functions

### GPT-4

In [None]:
# Read the key with getpass so the secret is not echoed into the notebook
# output; strip stray whitespace that often comes along with a pasted key.
openai_api_key = getpass.getpass("Enter OpenAI API key: ").strip()

# Client shared by get_gpt4_outputs.
openai_client = OpenAI(api_key=openai_api_key)

In [None]:
def get_gpt4_outputs(size):
    """Generate the "chosen" corrections with GPT-4.

    Sends the first ``size`` texts of the module-level ``train_dataset`` to
    the chat completions endpoint through the module-level ``openai_client``,
    one request per text.

    Args:
        size: Number of leading training examples to correct.

    Returns:
        A list of corrected texts (whitespace-stripped), in dataset order.
        An entry is an empty string when the API returned no message content.
    """
    answers = []
    prompt = "Rewrite the given text without grammatical, spelling and punctuation errors. Make as few corrections as possible. Give only the corrected version of the text."

    for txt in train_dataset[:size]['text']:
        completion = openai_client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": txt.strip()}
            ],
            temperature=0.5,
            max_tokens=512,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        # The message content may be None; fall back to "" so one empty reply
        # does not abort the whole run and the output stays aligned with the
        # input texts.
        content = completion.choices[0].message.content
        answers.append((content or "").strip())

    return answers

### Mistral 7B Instruct

In [None]:
# Read the token with getpass so the secret is not echoed into the notebook
# output; strip stray whitespace that often comes along with a pasted token.
hf_token = getpass.getpass("Enter HuggingFace token: ").strip()

In [None]:
# Hosted inference endpoint used for the "rejected" completions.
MISTRAL_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
# Bearer-token header sent with every request to the endpoint.
headers = {"Authorization": f"Bearer {hf_token}"}


def get_mistral_outputs(size):
    """Generate the "rejected" corrections with Mistral 7B Instruct.

    Posts the first ``size`` texts of the module-level ``train_dataset`` to
    ``MISTRAL_API_URL``, one request per text, each wrapped in the
    ``[INST]...[/INST]`` instruction template.

    Args:
        size: Number of leading training examples to correct.

    Returns:
        A list of generated texts (whitespace-stripped), in dataset order.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        RuntimeError: If the response body does not have the expected
            ``[{"generated_text": ...}]`` shape.
    """
    prompt = "Rewrite the given text without grammatical, spelling and punctuation errors. Make as few corrections as possible. The text to be corrected begins after 'Text:'. Give only the corrected version of the text. Text: "
    answers = []

    for txt in train_dataset[:size]['text']:
        input_text = f"<s>[INST]{prompt}{txt.strip()}[/INST]"
        payload = {
            "inputs": input_text,
            "parameters": {
                "max_new_tokens": 512
            }
        }
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.post(
            MISTRAL_API_URL, headers=headers, json=payload, timeout=120
        )
        # Fail with the HTTP status instead of an opaque KeyError/IndexError
        # further down when the endpoint rejects the request.
        response.raise_for_status()
        result = response.json()

        if (
            not isinstance(result, list)
            or not result
            or not isinstance(result[0], dict)
            or "generated_text" not in result[0]
        ):
            raise RuntimeError(
                f"Unexpected response from Mistral endpoint: {result!r}"
            )

        generated = result[0]["generated_text"]
        # The slicing here assumes the response echoes the prompt before the
        # completion; only cut it off when it is really there so that real
        # output is never truncated.
        if generated.startswith(input_text):
            generated = generated[len(input_text):]

        answers.append(generated.strip())

    return answers

## Building the DPO dataset

### Get the WI-LOCNESS dataset

In [None]:
# Load the "wi" configuration of the wi_locness dataset.
raw_datasets = load_dataset("wi_locness", name="wi")

In [None]:
# Carve a 10% test split out of the original train split; the fixed seed keeps
# the split reproducible across runs.
dataset_dict = raw_datasets["train"].train_test_split(test_size=0.1, seed=0)

# Store both resulting splits back under their names.
raw_datasets["train"], raw_datasets["test"] = (
    dataset_dict["train"],
    dataset_dict["test"],
)

In [None]:
# Take the train split and drop the listed columns, which the generation
# helpers do not read (they only access the 'text' column).
train_dataset = raw_datasets["train"].remove_columns(
    column_names=["id", "userid", "cefr", "edits"]
)

### Build dataset

In [None]:
# Number of leading training examples that are sent to each model and that end
# up as rows of the DPO dataset.
DATASET_SIZE = 500

In [None]:
# "Rejected" corrections: Mistral 7B Instruct outputs for the first DATASET_SIZE texts.
rejected_list = get_mistral_outputs(size=DATASET_SIZE)

In [None]:
# "Chosen" corrections: GPT-4 outputs for the first DATASET_SIZE texts.
chosen_list = get_gpt4_outputs(size=DATASET_SIZE)

In [None]:
# Column-oriented DPO records: each original text as the prompt, paired with
# its chosen and rejected corrections at the same index.
dpo_dataset_dict = dict(
    chosen=chosen_list,
    prompt=train_dataset[:DATASET_SIZE]['text'],
    rejected=rejected_list,
)

In [None]:
# Authenticate with the Hugging Face Hub (huggingface_hub.login) before the
# dataset is pushed in the next cell.
hf_login()

In [None]:
# Turn the column dictionary into a Dataset object.
dpo_dataset = Dataset.from_dict(dpo_dataset_dict)

# Publish the dataset to the team's repository on the Hugging Face Hub.
dpo_dataset.push_to_hub(repo_id="AY2324S2-CS4248-Team-47/gec-dpo-ultrafeedback")