## Data Transformation and Finetuning with OpenPipe

In [2]:
import json
import random
from openpipe import OpenAI
from prompts.prompts import qp_system_prompt, cp_system_prompt
from utils.save_and_load_json import load_all_json_files, save_jsonl
from utils.transfrom_data_for_finetuing import split_dataset, format_qp_for_openpipe_finetuning, format_cp_for_openpipe_finetuning

# load in datasets
logic701 = load_all_json_files(folder_path='./data/train/raw')
gpt_one_call = load_all_json_files(folder_path='./data/synthetic/processed/gpt_one_call')
gpt_two_calls = load_all_json_files(folder_path='./data/synthetic/processed/gpt_two_calls')
sudoku = load_all_json_files(folder_path='./data/synthetic/processed/sudoku_4x4')

Loaded 1 JSON files from ./data/train/raw
Loaded 1 JSON files from ./data/synthetic/processed/gpt_one_call
Loaded 1 JSON files from ./data/synthetic/processed/gpt_two_calls
Loaded 1 JSON files from ./data/synthetic/processed/sudoku_4x4


In [3]:
# Set seed for reproducibility
random.seed(42)

# Get 250 random samples
logic701_samples = random.sample(logic701[0], 250)
sudoku_samples = random.sample(sudoku[0], 250)
gpt_one_call_samples = random.sample(gpt_one_call[0], 250)
gpt_two_call_samples = random.sample(gpt_two_calls[0], 250)

print('verifying dataset len samples:')
print(len(logic701_samples))
print(len(sudoku_samples))
print(len(gpt_one_call_samples))
print(len(gpt_two_call_samples))

# create qp and cp splits
logic701_samples_qp, logic701_samples_cp = split_dataset(logic701_samples)
sudoku_samples_qp, sudoku_samples_cp = split_dataset(sudoku_samples)
gpt_one_call_samples_qp, gpt_one_call_samples_cp = split_dataset(gpt_one_call_samples)
gpt_two_call_samples_qp, gpt_two_call_samples_cp = split_dataset(gpt_two_call_samples)

print('verifying dataset len for qp and cp:')
for dataset in [logic701_samples_qp, logic701_samples_cp, sudoku_samples_qp, sudoku_samples_cp, gpt_one_call_samples_qp, gpt_two_call_samples_qp, gpt_two_call_samples_cp]: print(len(dataset))

verifying dataset len samples:
250
250
250
250
verifying dataset len for qp and cp:
250
250
250
250
250
250
250


In [4]:
# format data into OpenAI finetuning format for OpenPipe
logic701_formatted_qp = format_qp_for_openpipe_finetuning(logic701_samples_qp, system_prompt=qp_system_prompt)
logic701_formatted_cp = format_cp_for_openpipe_finetuning(logic701_samples_cp, system_prompt=cp_system_prompt)

sudoku_formatted_qp = format_qp_for_openpipe_finetuning(sudoku_samples_qp, system_prompt=qp_system_prompt)
sudoku_formatted_cp = format_cp_for_openpipe_finetuning(sudoku_samples_cp, system_prompt=cp_system_prompt)

gpt_one_call_formatted_qp = format_qp_for_openpipe_finetuning(gpt_one_call_samples_qp, system_prompt=qp_system_prompt)
gpt_one_call_formatted_cp = format_cp_for_openpipe_finetuning(gpt_one_call_samples_cp, system_prompt=qp_system_prompt)

gpt_two_call_formatted_qp = format_qp_for_openpipe_finetuning(gpt_two_call_samples_qp, system_prompt=qp_system_prompt)
gpt_two_call_formatted_cp = format_cp_for_openpipe_finetuning(gpt_two_call_samples_cp, system_prompt=qp_system_prompt)

# save data as jsonl
save_jsonl(logic701_formatted_qp, "./data/synthetic/finetuning/logic701_finetuning_qp.jsonl")
save_jsonl(logic701_formatted_cp, "./data/synthetic/finetuning/logic701_finetuning_cp.jsonl")

save_jsonl(sudoku_formatted_qp, "./data/synthetic/finetuning/sudoku_finetuning_qp.jsonl")
save_jsonl(sudoku_formatted_cp, "./data/synthetic/finetuning/sudoku_finetuning_cp.jsonl")

save_jsonl(gpt_one_call_formatted_qp, "./data/synthetic/finetuning/gpt_one_call_finetuning_qp.jsonl")
save_jsonl(gpt_one_call_formatted_cp, "./data/synthetic/finetuning/gpt_one_call_finetuning_cp.jsonl")

save_jsonl(gpt_two_call_formatted_qp, "./data/synthetic/finetuning/gpt_two_call_finetuning_qp.jsonl")
save_jsonl(gpt_two_call_formatted_cp, "./data/synthetic/finetuning/gpt_two_call_finetuning_cp.jsonl")