In [1]:
import json
from tqdm import tqdm

# Load datasets
# Both inputs are read explicitly as UTF-8. Without an explicit encoding,
# open() falls back to the platform's locale encoding, which can mis-decode
# (or fail on) non-ASCII characters in the JSON text. The final dataset is
# also written as UTF-8, so reading and writing now agree.
with open("merged_contexts.json", "r", encoding="utf-8") as f:
    merged_contexts = json.load(f)

with open("full_answers.json", "r", encoding="utf-8") as f:
    full_answers = json.load(f)

# Quick sanity print so a truncated or wrong input file is noticed immediately.
print(f"Loaded {len(merged_contexts)} contexts and {len(full_answers)} answer groups")

# Index every context by its "base_chunk" value for name-based lookups.
# NOTE(review): the flattening loop below does not consult this map; it
# resolves contexts by position via "context_id". Confirm whether the map is
# still needed elsewhere.
context_map = {entry["base_chunk"]: entry for entry in merged_contexts}

# Flatten the answer groups: one output row per generated question/answer
# pair, each carrying a copy of the fields of the context it was generated from.
structured_data = []

for group in tqdm(full_answers, desc="Processing answer groups"):
    # "context_id" is used directly as a subscript into merged_contexts.
    source_ctx = merged_contexts[group["context_id"]]

    structured_data.extend(
        {
            "question": qa["question"],
            "answer": qa["answer"],
            "context": source_ctx["content"],
            "base_chunk": source_ctx["base_chunk"],
            "context_sections": source_ctx["context_sections"],
            # The answer's "generation_time" is exported under a new key.
            "generation_timestamp_ns": qa["generation_time"],
        }
        for qa in group["answers"]
    )

print(f"Created {len(structured_data)} structured rows")

Loaded 89 contexts and 217 answer groups


Processing answer groups: 100%|██████████| 217/217 [00:00<00:00, 216964.00it/s]

Created 1343 structured rows





In [9]:
# Save final dataset
# Serialise all rows as indented JSON. ensure_ascii=False keeps non-ASCII
# characters verbatim instead of \u-escaping them, which is why the file is
# opened explicitly as UTF-8.
serialized_rows = json.dumps(structured_data, indent=2, ensure_ascii=False)
with open("hf_turboml_dataset.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized_rows)

print("Dataset saved to hf_turboml_dataset.json")

Dataset saved to hf_turboml_dataset.json


In [2]:
# Dataset statistics
# Summarise the flattened rows: row count, mean context size in characters,
# number of distinct section names, and the span of generation timestamps.
total_questions = len(structured_data)

context_lengths = [len(row["context"]) for row in structured_data]
avg_context_length = sum(context_lengths) / total_questions

# Each row carries an iterable of section names; collect them into one set.
distinct_sections = {
    section
    for row in structured_data
    for section in row["context_sections"]
}
unique_sections = len(distinct_sections)

timestamps = [row["generation_timestamp_ns"] for row in structured_data]
earliest_ts = min(timestamps)
latest_ts = max(timestamps)

print(f"""
Dataset Statistics:
- Total Q&A pairs: {total_questions:,}
- Average context length: {avg_context_length:,.0f} chars
- Unique documentation sections referenced: {unique_sections}
- Time range: {earliest_ts} to {latest_ts}
""")


Dataset Statistics:
- Total Q&A pairs: 1,343
- Average context length: 48,265 chars
- Unique documentation sections referenced: 89
- Time range: 1740785987488093300 to 1740828912971381500



In [3]:
# Row-count invariant: the flattened dataset must contain exactly one row for
# every answer across all answer groups.
assert sum(len(group["answers"]) for group in full_answers) == len(structured_data)

In [4]:
# Spot check on the first row: find the first context whose "base_chunk"
# matches the row's, and confirm the row carries that context's content.
sample_row = structured_data[0]
base_chunks = [ctx["base_chunk"] for ctx in merged_contexts]
original_ctx = merged_contexts[base_chunks.index(sample_row["base_chunk"])]
assert original_ctx["content"] == sample_row["context"]

In [5]:
# Display the first structured row to inspect its fields by eye.
structured_data[0]

{'question': "Write code to ingest data from a Pandas DataFrame into TurboML using the `OnlineDataset.from_pd()` method, setting 'transactionID' as the key field and handling the case where the dataset already exists.",
 'answer': 'Okay, let\'s break down this task and address it using the provided TurboML documentation.\n\n**Problem Analysis:**\n\nThe user wants to ingest data from a Pandas DataFrame into TurboML.  They\'ve specified the `OnlineDataset.from_pd()` method, the key field (\'transactionID\'), and need to handle potential dataset pre-existence. The core task is data ingestion using the push-based method.\n\n**Step-by-Step Solution:**\n\n1.  **Import `turboml`:** Begin by importing the necessary library.\n2.  **Create DataFrame:** We\'ll assume a DataFrame named `df` is already available, as per the question.\n3.  **Instantiate `OnlineDataset`:** Use `OnlineDataset.from_pd()`, providing the DataFrame (`df`), a dataset ID (`id`), the `key_field`, and setting `load_if_exists=

In [11]:
!pip install -q huggingface-hub datasets


[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it would be saved with the
# file and leak when the notebook is shared or committed. Read it from the
# HF_TOKEN environment variable instead; when the variable is unset, token is
# None and huggingface_hub falls back to its interactive login prompt.
login(token=os.environ.get("HF_TOKEN"))

In [4]:
from pathlib import Path

from huggingface_hub import HfApi

api = HfApi()

# The save cell writes "hf_turboml_dataset.json" into the working directory,
# while this upload originally read only from "TurboML_datasets/". Keep the
# original location as first choice (unchanged behaviour whenever it exists)
# and fall back to the file the save cell produces, so a fresh
# Restart & Run All does not fail on a missing path.
# NOTE(review): if both files exist, the TurboML_datasets/ copy wins and may be
# older than the freshly saved one; confirm which location is authoritative.
dataset_candidates = [
    Path("TurboML_datasets/hf_turboml_dataset.json"),
    Path("hf_turboml_dataset.json"),
]
dataset_path = next(
    (candidate for candidate in dataset_candidates if candidate.is_file()),
    None,
)
if dataset_path is None:
    raise FileNotFoundError(
        "hf_turboml_dataset.json not found in TurboML_datasets/ or the "
        "working directory; run the save cell first."
    )

# Upload main dataset file. The call stays the last expression of the cell so
# the returned commit info is displayed as the cell output.
api.upload_file(
    path_or_fileobj=str(dataset_path),
    path_in_repo="hf_turboml_dataset.json",
    repo_id="DebopamC/TurboML_Synthetic_QnA_Dataset",
    repo_type="dataset"
)

# # Upload README
# api.upload_file(
#     path_or_fileobj="README.md",
#     path_in_repo="README.md",
#     repo_id="DebopamC/TurboML_Synthetic_QnA_Dataset",
#     repo_type="dataset"
# )

hf_turboml_dataset.json:   0%|          | 0.00/74.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DebopamC/TurboML_Synthetic_QnA_Dataset/commit/a281fb6903acbcce27ca098640888c169238a38f', commit_message='Upload hf_turboml_dataset.json with huggingface_hub', commit_description='', oid='a281fb6903acbcce27ca098640888c169238a38f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DebopamC/TurboML_Synthetic_QnA_Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DebopamC/TurboML_Synthetic_QnA_Dataset'), pr_revision=None, pr_num=None)