In [None]:
!pip install -q gcsfs pyyaml openpyxl

In [None]:
!pip install -q git+https://github.com/meta-llama/synthetic-data-kit.git@main

Collecting git+https://github.com/meta-llama/synthetic-data-kit.git@main
  Cloning https://github.com/meta-llama/synthetic-data-kit.git (to revision main) to /tmp/pip-req-build-nf28cg3m
  Running command git clone --filter=blob:none --quiet https://github.com/meta-llama/synthetic-data-kit.git /tmp/pip-req-build-nf28cg3m
  Resolved https://github.com/meta-llama/synthetic-data-kit.git to commit 2e68548299df4383f1c5b34f3a9883e8840f9ac6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting bootstrap-flask>=2.2.0 (from synthetic-data-kit==0.0.4b2)
  Downloading bootstrap_flask-2.5.0-py3-none-any.whl.metadata (6.1 kB)
Collecting flask-wtf>=1.0.0 (from synthetic-data-kit==0.0.4b2)
  Downloading flask_wtf-1.2.2-py3-none-any.whl.metadata (3.4 kB)
Collecting pdfminer-six>=20221105 (from synthetic-data-kit==0.0.4b2)
  Downloading pdfminer_six-20250506-py3-none-any.whl.me

In [None]:
# GCP project settings: project id, region and bucket name (identifiers, not secrets)
import vertexai
import os

PROJECT_ID, LOCATION, BUCKET = "poc-uni-t-plus", "us-central1", "qa-benchmark"

# Point the Vertex AI SDK at the project and region chosen above.
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [None]:
# Create the working folders.
# Done in Python rather than `!mkdir -p data/{...}`: that form needs a shell
# with brace expansion, and IPython also tries to interpolate `{...}` on `!` lines.
import pathlib

DATA_SUBDIRS = (
    "pdf", "html", "youtube", "docx", "ppt", "txt",
    "output", "generated", "cleaned", "final", "result",
)

for folder in [pathlib.Path("data") / name for name in DATA_SUBDIRS] + [pathlib.Path("configs")]:
    # parents/exist_ok make the cell safe to re-run.
    folder.mkdir(parents=True, exist_ok=True)

In [None]:
## transfer data from gcs to data folder here
!gsutil -m cp gs://qa-benchmark/raw/*.pdf data/pdf/

Copying gs://qa-benchmark/raw/GUI-A20-010 Job Aid – How to Upload and Reference a Document in SharePoint (corporate documents)_EN_V1.0.pdf...
Copying gs://qa-benchmark/raw/GUI-A20-100 Job Aid - SharePoint and Contractual Documents_EN_V1.1.pdf...
/ [0/21 files][    0.0 B/ 23.1 MiB]   0% Done                                   / [0/21 files][    0.0 B/ 23.1 MiB]   0% Done                                   Copying gs://qa-benchmark/raw/GUI-B31-100 Job Aid - Opening a Sales Project_EN_V1.1.pdf...
/ [0/21 files][    0.0 B/ 23.1 MiB]   0% Done                                   Copying gs://qa-benchmark/raw/GUI-A61-100 Job Aid - Project Risk Analysis_EN_V1.0.pdf...
/ [0/21 files][    0.0 B/ 23.1 MiB]   0% Done                                   Copying gs://qa-benchmark/raw/GUI-B31-110 Job Aid - Opening a Client Project in GP for Project Accounting_EN_V1.2.pdf...
Copying gs://qa-benchmark/raw/GUI-C20-001 Job Aid - Create a Custom Dashboard_EN_V1.0.pdf...
/ [0/21 files][    0.0 B/ 23.1 MiB] 

In [None]:
# Config file
# Builds the synthetic-data-kit YAML config, checks that it parses, writes it
# to configs/config.yaml and prints a copy with the API key masked.
import textwrap, yaml, pathlib, json, os
import re

# Raw string on purpose: the prompt examples below contain the two characters
# backslash + n. In a normal string literal they would turn into real line
# breaks at column 0 and cut the YAML block scalars short.
#
# api_key is left empty here; the kit's log output refers to an
# API_ENDPOINT_KEY environment variable, which keeps the secret out of the
# file and out of the saved notebook.
cfg = textwrap.dedent(r"""
llm:
  provider: "api-endpoint"          # <- tells the kit we’re using HTTPS, not vLLM

api-endpoint:
  api_base: "https://api.openai.com/v1"   # OpenAI REST root
  api_key: ""
  model:    "gpt-4o-mini"
  max_retries: 3
  retry_delay: 1.0

generation:
  temperature: 0.5   # Higher = more creative, lower = more deterministic
  top_p: 0.95        # Nucleus sampling parameter
  chunk_size: 500   # Size of text chunks for processing
  overlap: 50       # Overlap between chunks to maintain context
  max_tokens: 10000   # Maximum tokens in LLM responses
  num_pairs: 30      # Default number of QA pairs to generate
  num_cot_examples: 20  # Default number of Chain of Thought examples to generate
  num_cot_enhance_examples: null  # Maximum number of conversations to enhance (null = enhance all)
  batch_size: 32     # Number of requests to batch together (for create)

# Content curation parameters
curate:
  threshold: 7.0     # Default quality threshold (1-10)
  batch_size: 32     # Number of items per batch for rating
  inference_batch: 32 # Number of batches to process at once with VLLM
  temperature: 0.1   # Temperature for rating (lower = more consistent)

# Format conversion parameters
format:
  default: "jsonl"   # Default output format
  include_metadata: true  # Include metadata in output files
  pretty_json: true  # Use indentation in JSON output

# Prompts for different tasks
prompts:
  # Summary generation prompt
  summary: |
    Summarize the provided document in 3–5 sentences. Your summary should clearly identify the main topic, highlight the most important concepts or procedures, and capture any essential technical details or requirements. Avoid including minor details or tangential information. Write your summary in clear, concise language, ensuring that it provides an accurate overview for someone unfamiliar with the original document

  # QA pair generation prompt
  qa_generation: |
    You are required to generate {num_pairs} complex and meaningful question-answer pairs from the provided document content to build a validation dataset. Follow these guidelines carefully: Please provide complete answers to the questions; do not give incomplete or partial answers. Each answer must provide all necessary details explicitly. Base your question on the technical aspects of the documents. Formulate realistic and meaningful questions that anticipate genuine inquiries a user might have regarding the technical procedures, rules, or detailed instructions described in the document. Avoid overly simplistic, trivial, or obvious questions. Ensure all questions directly address the technical aspects or detailed procedural content within the document. Do not create administrative or ownership-related questions.

    Rules:
    1. Questions must be about important facts in the text
    2. Answers must be directly supported by the text
    3. Return JSON format only:

    [
      {{
        "question": "Question 1?",
        "answer": "Answer 1."
      }},
      {{
        "question": "Question 2?",
        "answer": "Answer 2."
      }}
    ]

    Text:
    {text}

  qa_rating: |
    Rate each question-answer pair on a scale from 1-10, based on:
    - Accuracy (0-3): factual correctness
    - Relevance (0-2): relevance to content
    - Clarity (0-2): clear language
    - Usefulness (0-3): value for model learning

    YOU MUST RETURN A VALID JSON OBJECT OR ARRAY WITH THIS EXACT SCHEMA:
    {{
      "question": "Exact question text",
      "answer": "Exact answer text",
      "rating": 8
    }}

    OR FOR MULTIPLE PAIRS:
    [
      {{"question": "Q1", "answer": "A1", "rating": 8}},
      {{"question": "Q2", "answer": "A2", "rating": 9}}
    ]

  # Chain of Thought generation prompt
  cot_generation: |
    Create {num_examples} complex reasoning examples from this text that demonstrate chain-of-thought thinking.

    Each example should have:
    1. A challenging question that requires step-by-step reasoning
    2. Detailed reasoning steps that break down the problem
    3. A concise final answer

    Return JSON format only:

    [
      {{
        "question": "Complex question about the text?",
        "reasoning": "Step 1: First, I need to consider...\nStep 2: Then, I analyze...\nStep 3: Finally, I can conclude...",
        "answer": "Final answer based on the reasoning."
      }},
      {{
        "question": "Another complex question?",
        "reasoning": "Step 1: First, I'll analyze...\nStep 2: Next, I need to determine...\nStep 3: Based on this analysis...",
        "answer": "Final answer drawn from the reasoning."
      }}
    ]

    Text:
    {text}

""")

# Fail fast if the text is not the YAML we think it is: a parse error or a
# prompt that ended up outside `prompts:` is much easier to diagnose here than
# from inside the CLI.
_parsed_cfg = yaml.safe_load(cfg)
_missing_prompts = {"summary", "qa_generation", "qa_rating", "cot_generation"} - set(_parsed_cfg["prompts"])
if _missing_prompts:
    raise ValueError(f"config is missing prompt(s): {sorted(_missing_prompts)}")

_cfg_path = pathlib.Path("configs/config.yaml")
_cfg_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding: the text contains non-ASCII punctuation.
_cfg_path.write_text(cfg, encoding="utf-8")

# Print a copy with the api_key value masked so a real key can never end up in
# the saved cell output.
print(re.sub(r'(?m)^(\s*api_key:\s*)"[^"]*"', r'\1"<redacted>"', cfg))



llm:
  provider: "api-endpoint"          # <- tells the kit we’re using HTTPS, not vLLM

api-endpoint:
  api_base: "https://api.openai.com/v1"   # OpenAI REST root
  api_key: "<REDACTED>"
  model:    "gpt-4o-mini"
  max_retries: 3
  retry_delay: 1.0

generation:
  temperature: 0.5   # Higher = more creative, lower = more deterministic
  top_p: 0.95        # Nucleus sampling parameter
  chunk_size: 500   # Size of text chunks for processing
  overlap: 50       # Overlap between chunks to maintain context
  max_tokens: 10000   # Maximum tokens in LLM responses
  num_pairs: 30      # Default number of QA pairs to generate
  num_cot_examples: 20  # Default number of Chain of Thought examples to generate
  num_cot_enhance_examples: null  # Maximum number of conversations to enhance (null = enhance all)
  batch_size: 32     # Number of 

In [None]:
import os, getpass

# Prompt for the key instead of hard-coding it.
# The kit's own log lines look for the key in API_ENDPOINT_KEY, so export it
# under that name; OPENAI_API_KEY is still set for anything that relied on it.
_api_key = getpass.getpass("Paste your OpenAI API key and hit Enter: ").strip()
if not _api_key:
    raise ValueError("No API key entered.")

# Child processes started by `!` / `%%bash` cells inherit os.environ.
os.environ["API_ENDPOINT_KEY"] = _api_key
os.environ["OPENAI_API_KEY"] = _api_key


Paste your OpenAI API key and hit Enter: ··········


In [None]:
# Run the kit's `system-check` subcommand against configs/config.yaml
# before starting the (slow) generation loop.
!synthetic-data-kit -c configs/config.yaml system-check

Loading config from: /usr/local/lib/python3.11/dist-packages/synthetic_data_kit/config.yaml
Config has LLM provider set to: api-endpoint
Loading config from: configs/config.yaml
Config has LLM provider set to: api-endpoint
[1;34mEnvironment variable check:[0m
API_ENDPOINT_KEY: Not found
get_llm_provider returning: api-endpoint
API_ENDPOINT_KEY environment variable: Not found
[2K[32m⠦[0m Checking API endpoint access...INFO:httpx:HTTP Request: GET https://api.openai.com/v1/models "HTTP/1.1 200 OK"
[2K[32m API endpoint access confirmed[0m
[2K[32mUsing custom API base: [0m[4;94mhttps://api.openai.com/v1[0m
[2K[32mDefault model: gpt-4o-mini[0m
[2K[32m⠧[0m Checking API endpoint access...
[1A[2K

In [None]:
%%bash
# For every PDF in data/pdf/: extract text, generate QA pairs, curate them by
# rating, then export the *curated* pairs as ChatML into data/cleaned/.
CONFIG="configs/config.yaml"
NUM_PAIRS=20
THRESHOLD=7.5

# With no PDFs present the loop should run zero times, not once over the
# literal pattern "data/pdf/*.pdf".
shopt -s nullglob
mkdir -p data/output data/generated data/curated data/cleaned

failed=0
for pdf in data/pdf/*.pdf; do
  base=$(basename "$pdf" .pdf)
  qa_pairs="data/generated/${base}_qa_pairs.json"
  curated="data/curated/${base}_curated.json"

  # The stages are chained with && so that a failing stage stops this PDF:
  # later stages would otherwise run on missing or stale files.
  # curate writes to an explicit path and save-as reads that same path, so the
  # exported ChatML only contains pairs that passed the rating threshold.
  if ! {
    synthetic-data-kit -c "$CONFIG" ingest "$pdf" \
                       --output-dir data/output &&
    synthetic-data-kit -c "$CONFIG" create "data/output/${base}.txt" \
                       --type qa -n "$NUM_PAIRS" \
                       --output-dir data/generated &&
    synthetic-data-kit -c "$CONFIG" curate "$qa_pairs" \
                       --threshold "$THRESHOLD" \
                       --output "$curated" &&
    synthetic-data-kit -c "$CONFIG" save-as "$curated" \
                       --format chatml \
                       --output "data/cleaned/${base}_cleaned.json"
  }; then
    echo "⚠️  pipeline failed for: ${pdf}" >&2
    failed=$((failed + 1))
  fi
done

if [ "$failed" -eq 0 ]; then
  echo "✅  done"
else
  echo "done, but ${failed} PDF(s) failed (see messages above)" >&2
fi


Loading config from: /usr/local/lib/python3.11/dist-packages/synthetic_data_kit/config.yaml
Config has LLM provider set to: api-endpoint
Loading config from: configs/config.yaml
Config has LLM provider set to: api-endpoint
 Text successfully extracted to data/output/GUI-A20-010 Job Aid – How to Upload 
and Reference a Document in SharePoint (corporate documents)_EN_V1.0.txt
Loading config from: /usr/local/lib/python3.11/dist-packages/synthetic_data_kit/config.yaml
Config has LLM provider set to: api-endpoint
Loading config from: configs/config.yaml
Config has LLM provider set to: api-endpoint
get_llm_provider returning: api-endpoint
L Using api-endpoint provider
Loading config from: configs/config.yaml
Config has LLM provider set to: api-endpoint
API_ENDPOINT_KEY from environment: Not found
Using API key: None
No API key found!
Using API base URL: https://api.openai.com/v1
L Using api-endpoint provider
Loading config from: configs/config.yaml
Config has LLM provider set to: api-endpoin

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [None]:
# Collect the first question/answer pair of every ChatML conversation found in
# SOURCE_DIR and export them all to a single Excel workbook.
import json, glob, os, pandas as pd, pathlib
from google.colab import files

# folder that holds your ChatML JSON files
SOURCE_DIR = "data/cleaned"          # change if different
PATTERN    = f"{SOURCE_DIR}/*.json"  # leave as-is unless extension differs


def _load_records(raw):
    """Return the JSON values stored in ``raw`` as a list.

    Three layouts are accepted: a single JSON object, a JSON array of objects,
    and newline-delimited JSON (one object per line). An empty string yields
    an empty list. A line that is not valid JSON raises ``json.JSONDecodeError``.
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Not one JSON document: treat every non-blank line as its own document.
        return [json.loads(line) for line in raw.splitlines() if line.strip()]
    # A top-level array is already a list of records; anything else is one record.
    return parsed if isinstance(parsed, list) else [parsed]


def _first_qa_pair(messages):
    """Return ``{"question", "answer"}`` for the first assistant message that
    follows a non-empty user message, or ``None`` if there is no such pair."""
    user_q = None
    for m in messages:
        if not isinstance(m, dict):
            continue
        role = m.get("role")
        # `content` may be present but null; treat that like an empty string.
        content = (m.get("content") or "").strip()
        if role == "user":
            user_q = content
        elif role == "assistant" and user_q:
            return {"question": user_q, "answer": content}
    return None


rows = []

for path in sorted(glob.glob(PATTERN)):
    with open(path, encoding="utf-8") as f:
        raw = f.read().strip()

    for obj in _load_records(raw):
        if not isinstance(obj, dict):
            continue  # not a conversation object
        pair = _first_qa_pair(obj.get("messages") or [])
        if pair is not None:
            rows.append(pair)

# ▶ build DataFrame
df = pd.DataFrame(rows, columns=["question", "answer"])

# ▶ save to Excel
out_dir = pathlib.Path("/content/result")
out_dir.mkdir(parents=True, exist_ok=True)  # parents=True: do not assume /content exists
out_path = out_dir / "all_qa_pairs.xlsx"
df.to_excel(out_path, index=False)

# ▶ quick preview + download link
print(df.head(10))
files.download(str(out_path))


                                            question  \
0  What is the purpose of the GUI-A20-010 Job Aid...   
1  What steps should be taken to confirm if a doc...   
2  How can a user narrow down search results when...   
3  What steps can be taken to filter search resul...   
4  How can you determine the location of a docume...   
5  What is the procedure for uploading a document...   
6  What is the primary location for uploading a c...   
7  How should a bilingual or unilingual document ...   
8  What should be done if a document is bilingual...   
9  What steps should be taken to clean up duplica...   

                                              answer  
0  The purpose of the GUI-A20-010 Job Aid is to o...  
1  To confirm if a document is already uploaded t...  
2  A user can narrow down search results by using...  
3  You can use the tabs to filter the search resu...  
4  If you find multiple files, you can click or h...  
5  To reduce the risk of obsolete documents, do n... 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>