In [3]:
import json
import pandas as pd
from datasets import Dataset
import os
from glob import glob

# Function to convert QA pairs to Alpaca format
def convert_to_alpaca_format(qa_pairs):
    alpaca_data = []
    
    for pair in qa_pairs:
        # Create Alpaca format entry
        entry = {
            "instruction": pair["question"],
            "input": "",  # Empty input since we're doing direct Q&A
            "output": pair["answer"]
        }
        alpaca_data.append(entry)
    
    return alpaca_data

# Load the JSON file
def load_qa_pairs(file_path):
    with open(file_path, 'r') as f:
        qa_pairs = json.load(f)
    return qa_pairs

# Main processing
def create_alpaca_dataset(directory_path):
    all_alpaca_data = []
    
    # Find all qa_pairs.json files in the directory and subdirectories
    json_files = glob(os.path.join(directory_path, "**/*qa_pairs.json"), recursive=True)
    
    print(f"Found {len(json_files)} QA pair files")
    
    # Process each file
    for json_file in json_files:
        qa_pairs = load_qa_pairs(json_file)
        alpaca_data = convert_to_alpaca_format(qa_pairs)
        all_alpaca_data.extend(alpaca_data)
    
    # Convert to pandas DataFrame
    df = pd.DataFrame(all_alpaca_data)
    
    # Convert to HuggingFace Dataset
    dataset = Dataset.from_pandas(df)
    
    return dataset

# Example usage
directory_path = "data/QA Pairs"  # Adjust this to your QA pairs directory
dataset = create_alpaca_dataset(directory_path)

# Print some statistics
print("\nDataset statistics:")
print(f"Total number of examples: {len(dataset)}")
print("\nSample entry:")
print(dataset)

# Save the dataset if needed
# dataset.save_to_disk("path/to/save/dataset")

Found 45 QA pair files

Dataset statistics:
Total number of examples: 222

Sample entry:
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 222
})


In [12]:
def format_alpaca_prompt(example):
    # Check if input field has content
    has_input = example["input"] and example["input"].strip()
    
    if has_input:
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{example["instruction"]}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example["input"]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example["output"]}<|eot_id|>"""
    else:
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{example["instruction"]}<|eot_id|><|start_header_id|>user<|end_header_id|>

{example["output"]}<|eot_id|>"""
    
    return {"text": text}

# Apply the function
dataset = dataset.map(format_alpaca_prompt)

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [13]:
dataset['text']

["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nWhat are the Stone Roads, and how did Azaersi create them?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThe Stone Roads are magical pathways into nations across Golarion, created by Azaersi using the Onyx Key, stolen from the dwarven citadel of Kraggodan. These pathways act as 'shortcuts' through the elemental plane of Earth.<|eot_id|>",
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nWhat is significant about the hobgoblin nation of Kaoling in relation to Oprak?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nKaoling is a Tian hobgoblin nation that offers Oprak a model of stable, long-established hobgoblin laws and society for Oprak to study and potentially emulate. It also provides rich trade opportunities.<|eot_id|>',
 "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nHow is Azaersi navigating Oprak's relationships with human nations, given her animosity towards humankind?<|e

In [None]:
chat_template = """Below are some instructions that describe some tasks. Write responses that appropriately complete each request.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

from unsloth import apply_chat_template

dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
    # default_system_message = "You are a helpful assistant", << [OPTIONAL]
)