In [None]:
!pip install -q gcsfs pyyaml

In [None]:
!pip install git+https://github.com/meta-llama/synthetic-data-kit.git@main

Collecting git+https://github.com/meta-llama/synthetic-data-kit.git@main
  Cloning https://github.com/meta-llama/synthetic-data-kit.git (to revision main) to /tmp/pip-req-build-bl6wu2v7
  Running command git clone --filter=blob:none --quiet https://github.com/meta-llama/synthetic-data-kit.git /tmp/pip-req-build-bl6wu2v7
  Resolved https://github.com/meta-llama/synthetic-data-kit.git to commit 2e68548299df4383f1c5b34f3a9883e8840f9ac6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# GCP / Vertex AI settings shared by the rest of the notebook.
import os
import vertexai

PROJECT_ID = "poc-uni-t-plus"   # GCP project
LOCATION = "us-central1"        # region passed to Vertex AI
BUCKET = "qa-benchmark"         # GCS bucket name

# Point the Vertex AI SDK at the project and region declared above.
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [None]:
# Create the working layout: one data/ sub-folder per input type and pipeline stage,
# plus configs/ for the YAML file written later in the notebook.
# Done in Python instead of `!mkdir -p data/{...}` because that form depends on bash
# brace expansion and on IPython leaving the `{...}` expression un-interpolated.
import pathlib

DATA_SUBDIRS = ("pdf", "html", "youtube", "docx", "ppt", "txt",
                "output", "generated", "cleaned", "final")

for subdir in DATA_SUBDIRS:
    pathlib.Path("data", subdir).mkdir(parents=True, exist_ok=True)
pathlib.Path("configs").mkdir(parents=True, exist_ok=True)

In [None]:
# transfer data from gcs to data folder here
!gsutil -m cp gs://qa-benchmark/raw/*.pdf data/pdf/

In [None]:
# Write the synthetic-data-kit configuration to configs/config.yaml.
import textwrap, yaml, pathlib, json, os

# The template is a *raw, non-f* string on purpose:
#   * `{num_pairs}`, `{num_examples}` and `{text}` are placeholders that must reach the YAML
#     file verbatim (an f-string would try to evaluate them here and raise NameError);
#   * `{{` / `}}` around the JSON examples are str.format-style escapes and must stay doubled
#     (presumably the kit fills the prompts with str.format — confirm);
#   * the `\n` inside the chain-of-thought example must stay a literal backslash-n, otherwise
#     a real line break at column 0 would terminate the YAML block scalar.
# NOTE(review): the first paragraph of the qa_generation prompt ends mid-sentence
# ("...ownership-related") — confirm the intended wording.
cfg = textwrap.dedent(r"""
# configs/openai.yaml  ── just the LLM section ──
llm:
  provider: "api-endpoint"          # <- tells the kit we’re using HTTPS, not vLLM

api-endpoint:
  api_base: "https://api.openai.com/v1"   # OpenAI REST root
  api_key: ""
  model:    "gpt-4o-mini"
  max_retries: 3
  retry_delay: 1.0

generation:
  temperature: 0.5   # Higher = more creative, lower = more deterministic
  top_p: 0.95        # Nucleus sampling parameter
  chunk_size: 500   # Size of text chunks for processing
  overlap: 50       # Overlap between chunks to maintain context
  max_tokens: 10000   # Maximum tokens in LLM responses
  num_pairs: 30      # Default number of QA pairs to generate
  num_cot_examples: 5  # Default number of Chain of Thought examples to generate
  num_cot_enhance_examples: null  # Maximum number of conversations to enhance (null = enhance all)
  batch_size: 32     # Number of requests to batch together (for create)

# Content curation parameters
curate:
  threshold: 7.0     # Default quality threshold (1-10)
  batch_size: 32     # Number of items per batch for rating
  inference_batch: 32 # Number of batches to process at once with VLLM
  temperature: 0.1   # Temperature for rating (lower = more consistent)

# Format conversion parameters
format:
  default: "jsonl"   # Default output format
  include_metadata: true  # Include metadata in output files
  pretty_json: true  # Use indentation in JSON output

# Prompts for different tasks
prompts:
  # Summary generation prompt
  summary: |
    Summarize the provided document in 3–5 sentences. Your summary should clearly identify the main topic, highlight the most important concepts or procedures, and capture any essential technical details or requirements. Avoid including minor details or tangential information. Write your summary in clear, concise language, ensuring that it provides an accurate overview for someone unfamiliar with the original document

  # QA pair generation prompt
  qa_generation: |
    You are required to generate {num_pairs} complex and meaningful question-answer pairs from the provided document content to build a validation dataset. Follow these guidelines carefully: Please provide complete answers to the questions; do not give incomplete or partial answers. Each answer must provide all necessary details explicitly. Base your question on the technical aspects of the documents. Formulate realistic and meaningful questions that anticipate genuine inquiries a user might have regarding the technical procedures, rules, or detailed instructions described in the document. Avoid overly simplistic, trivial, or obvious questions. Ensure all questions directly address the technical aspects or detailed procedural content within the document. Do not create administrative or ownership-related

    Rules:
    1. Questions must be about important facts in the text
    2. Answers must be directly supported by the text
    3. Return JSON format only:

    [
      {{
        "question": "Question 1?",
        "answer": "Answer 1."
      }},
      {{
        "question": "Question 2?",
        "answer": "Answer 2."
      }}
    ]

    Text:
    {text}
  cot_generation: |
    Create {num_examples} complex reasoning examples from this text that demonstrate chain-of-thought thinking.

    Each example should have:
    1. A challenging question that requires step-by-step reasoning
    2. Detailed reasoning steps that break down the problem
    3. A concise final answer

    Return JSON format only:

    [
      {{
        "question": "Complex question about the text?",
        "reasoning": "Step 1: First, I need to consider...\nStep 2: Then, I analyze...\nStep 3: Finally, I can conclude...",
        "answer": "Final answer based on the reasoning."
      }},
      {{
        "question": "Another complex question?",
        "reasoning": "Step 1: First, I'll analyze...\nStep 2: Next, I need to determine...\nStep 3: Based on this analysis...",
        "answer": "Final answer drawn from the reasoning."
      }}
    ]

    Text:
    {text}

""")

# Fail here, before anything is written, if the template is not valid YAML or lost a section.
parsed_cfg = yaml.safe_load(cfg)
missing_sections = {"llm", "api-endpoint", "generation", "curate", "format", "prompts"} - set(parsed_cfg or {})
if missing_sections:
    raise ValueError(f"config template is missing sections: {sorted(missing_sections)}")

config_path = pathlib.Path("configs/config.yaml")
config_path.parent.mkdir(parents=True, exist_ok=True)
# The template contains non-ASCII characters, so the encoding is stated explicitly.
config_path.write_text(cfg, encoding="utf-8")
print(cfg)


In [None]:
import os, getpass

# Read the key interactively so it never appears in the notebook source or saved output.
# Whitespace picked up while pasting is removed, and an empty entry is rejected here
# instead of being exported as an empty OPENAI_API_KEY.
api_key = getpass.getpass("Paste your OpenAI API key and hit Enter: ").strip()
if not api_key:
    raise ValueError("No OpenAI API key entered.")
os.environ["OPENAI_API_KEY"] = api_key
del api_key  # keep the secret out of the notebook namespace


In [None]:
# Run the kit's `system-check` command against configs/config.yaml before starting the pipeline.
# NOTE(review): presumably verifies that the configured LLM endpoint is reachable — confirm.
!synthetic-data-kit -c configs/config.yaml system-check

In [None]:
# Single-file trial, step 1: ingest one PDF.
!synthetic-data-kit -c configs/config.yaml ingest data/pdf/GUI-A20-010-Job-Aid.pdf

# → the next cell reads the parsed text from data/output/GUI-A20-010-Job-Aid.txt


In [None]:
# Single-file trial, step 2: create QA pairs (`--type qa -n 10`) from the parsed text.
!synthetic-data-kit -c configs/config.yaml create data/output/GUI-A20-010-Job-Aid.txt \
        --type qa -n 10
# → presumably written to data/generated/GUI-A20-010-Job-Aid_qa_pairs.json (kit default — confirm)

In [None]:
# ── Colab cell 7 ──
!synthetic-data-kit -c configs/config.yaml curate \
        data/output/GUI-A20-010-Job-Aid_qa_pairs.json
# keeps high-quality examples → data/cleaned/sample_report_cleaned.json


In [None]:
%%bash
# Batch pipeline: ingest → create → curate → save-as for every PDF in data/pdf/.
#   data/output/<name>.txt                parsed text
#   data/generated/<name>_qa_pairs.json   raw QA pairs
#   data/cleaned/<name>_cleaned.json      QA pairs that passed curation (threshold 7.5)
#   data/final/<name>_chatml.json         curated pairs converted to ChatML
# save-as reads the *curated* file, so the threshold actually affects the exported data.
CONFIG="configs/config.yaml"
mkdir -p data/output data/generated data/cleaned data/final   # just in case

failed=0
for pdf in data/pdf/*.pdf; do
  # With no matching PDFs the glob stays a literal string; skip it.
  [ -e "$pdf" ] || continue
  base=$(basename "$pdf" .pdf)

  # Steps are chained with && so a failing step skips the remaining steps for this PDF only.
  if synthetic-data-kit -c "$CONFIG" ingest "$pdf" \
                        --output-dir data/output \
     && synthetic-data-kit -c "$CONFIG" create "data/output/${base}.txt" \
                        --type qa -n 20 \
                        --output-dir data/generated \
     && synthetic-data-kit -c "$CONFIG" curate \
                        "data/generated/${base}_qa_pairs.json" \
                        --threshold 7.5 \
                        --output "data/cleaned/${base}_cleaned.json" \
     && synthetic-data-kit -c "$CONFIG" save-as \
                        "data/cleaned/${base}_cleaned.json" \
                        --format chatml \
                        --output "data/final/${base}_chatml.json"; then
    echo "ok: ${base}"
  else
    echo "⚠️  pipeline failed for ${pdf}" >&2
    failed=$((failed + 1))
  fi
done

# Only claim success when every PDF went through all four steps.
if [ "$failed" -eq 0 ]; then
  echo "✅  done"
else
  echo "⚠️  done, but ${failed} PDF(s) failed" >&2
fi


In [None]:

# Merge every generated QA file into one Excel workbook (one row per question/answer pair).
SOURCE_DIR = "data/generated"          # folder that holds the 21 files
PATTERN    = f"{SOURCE_DIR}/*.json"    # they all end with .json

import json, glob, os, pandas as pd
from google.colab import files


def _clean_text(value):
    """Return ``value`` as a stripped string; a missing/None value becomes ""."""
    return "" if value is None else str(value).strip()


rows = []

for path in sorted(glob.glob(PATTERN)):
    with open(path, encoding="utf-8") as f:
        doc = json.load(f)             # expected: {"summary": "...", "qa_pairs": [...]}

    # Anything that is not a JSON object has no "qa_pairs" key to read; report and move on.
    if not isinstance(doc, dict):
        print(f"⚠️ Skipped {os.path.basename(path)}: top-level JSON value is not an object")
        continue

    for pair in doc.get("qa_pairs") or []:
        if not isinstance(pair, dict):
            continue
        rows.append({
            # JSON null values are mapped to "" instead of crashing on None.strip()
            "question": _clean_text(pair.get("question")),
            "answer"  : _clean_text(pair.get("answer")),
            "file"    : os.path.basename(path)         # keep source file name
        })

# build the DataFrame (drop "file" if you don't need it)
df = pd.DataFrame(rows, columns=["question", "answer", "file"])

# save everything to one Excel workbook (writing .xlsx requires openpyxl in the kernel)
out_path = "/content/all_qa_pairs.xlsx"
df.to_excel(out_path, index=False)

# show a quick preview in the notebook
df.head(10)


In [None]:
!pip install -q openpyxl

In [None]:
from google.colab import files

# Hand the combined QA workbook to Colab's download helper.
workbook_path = "/content/all_qa_pairs.xlsx"
files.download(workbook_path)


In [None]:
# Folder whose JSON files are merged into a single question/answer workbook by this cell
SOURCE_DIR = "data/cleaned"          # e.g. "data/generated" or "data/output"
PATTERN    = f"{SOURCE_DIR}/*.json"    # tweak if your filenames differ

import json, glob, os, pandas as pd, re
from google.colab import files

# Collects one {"question": ..., "answer": ...} dict per extracted pair; reset on every run of the cell
rows = []

def add_pair(q, a):
    """Record a question/answer row in ``rows`` unless either side is blank.

    Falsy inputs (``None`` or ``""``) count as blank; both values are stored
    with their surrounding whitespace removed.
    """
    question = (q or "").strip()
    answer = (a or "").strip()
    if not question or not answer:
        return
    rows.append({"question": question, "answer": answer})

# ---------- read every JSON file ----------
# Each file may be a single JSON document (object or array) or NDJSON (one object per line).
for path in sorted(glob.glob(PATTERN)):
    with open(path, encoding="utf-8") as f:
        raw = f.read().strip()
    if not raw:
        continue

    # Try the whole file as one JSON document; if that fails assume NDJSON (one-per-line).
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        objs = [json.loads(line) for line in raw.splitlines() if line.strip()]
    else:
        # A top-level array is a list of records, not a single unrecognised object.
        objs = parsed if isinstance(parsed, list) else [parsed]

    for obj in objs:
        # -------- Structure A: {"qa_pairs": [{"question": ..., "answer": ...}, ...]} ---------
        if isinstance(obj, dict) and "qa_pairs" in obj:
            for qa in obj["qa_pairs"] or []:
                if isinstance(qa, dict):
                    add_pair(qa.get("question"), qa.get("answer"))

        # -------- Structure B: {"messages": [{"role": ..., "content": ...}, ...]} ------------
        elif isinstance(obj, dict) and "messages" in obj:
            pending_q = None
            for m in obj["messages"] or []:
                if not isinstance(m, dict):
                    continue
                role = m.get("role")
                content = m.get("content")
                # null / non-text content is treated as empty instead of crashing on .strip()
                text = content.strip() if isinstance(content, str) else ""
                if role == "user":
                    pending_q = text
                elif role == "assistant" and pending_q:
                    # pair each assistant reply with the most recent unanswered user turn
                    add_pair(pending_q, text)
                    pending_q = None

        # -------- Structure C: a bare {"question": ..., "answer": ...} record ----------------
        elif isinstance(obj, dict) and "question" in obj and "answer" in obj:
            add_pair(obj.get("question"), obj.get("answer"))

        # -------- Unknown structure -----------------------------------------
        else:
            print(f"⚠️ Skipped unrecognised format in {os.path.basename(path)}")

# ---------- build DataFrame & export ----------
# Stop early with a clear message when nothing was extracted, instead of writing an empty workbook.
if not rows:
    raise ValueError("No question-answer pairs found. Check SOURCE_DIR & patterns.")

df = pd.DataFrame(rows, columns=["question", "answer"])
excel_path = "/content/data/final/all_qa_pairs.xlsx"
# to_excel() does not create missing parent folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(excel_path), exist_ok=True)
df.to_excel(excel_path, index=False)

# quick preview
df.head(10)
