### Spinach Datasets

In [5]:
import os
import json
import pandas as pd

# Load config
with open("config.json", "r") as f:
    CONFIG = json.load(f)

input_path = CONFIG.get("input_path", "../data/Spinach")
output_path = CONFIG.get("output_path", "./combined_question.tsv")

# Load and merge into a list of rows
records = []

for filename in os.listdir(input_path):
    if filename.endswith(".json"):
        file_path = os.path.join(input_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

                # If it's a list of items
                if isinstance(data, list):
                    for item in data:
                        if "question" in item:
                            records.append({
                                "question": item["question"],
                                "sparql": item.get("sparql")  # None if missing
                            })
                # If it's a single dict
                elif isinstance(data, dict) and "question" in data:
                    records.append({
                        "question": data["question"],
                        "sparql": data.get("sparql")  # None if missing
                    })

        except Exception as e:
            print(f"❌ Error in {filename}: {e}")

# Convert to DataFrame
df = pd.DataFrame(records)

# Save as TSV
df.to_csv(output_path + ".tsv", sep="\t", index=False)
print(f"✅ Saved {len(df)} rows to {output_path}")


✅ Saved 3848 rows to ../../data/Spinach/spinach_output


In [6]:
import json
import pandas as pd
from tqdm import tqdm
from agent_workflow import run_agent_workflow



processed_results = []

question_data = df.to_dict(orient='records')
for i in tqdm(range(len(question_data)), desc="Processing"):
    item = question_data[i]
    q1 = item.get("question")
    usparql = item.get("usparql", f"no_usparql_{i}")
    if i%100 == 0:
        df = pd.DataFrame(processed_results)
        df.to_csv(output_path + "_generated.csv", index=False)
        df.to_csv(output_path + "_generated.tsv", sep="\t", index=False)
    try:
        result = run_agent_workflow(q1)
        if result:
            processed_results.append({"usparql": usparql, **result})
    except Exception as e:
        print(f"❌ Error at index {i}: {e}")
        continue

df = pd.DataFrame(processed_results)
df.to_csv(output_path + "_generated.csv", index=False)
df.to_csv(output_path + "_generated.tsv", sep="\t", index=False)

print(f"\n✅ Saved {len(df)} rows to:\n- {output_path}.csv\n- {output_path}.tsv")



Processing: 100%|██████████| 2018/2018 [18:47<00:00,  1.79it/s]


✅ Saved 277 rows to:
- ../../data/Spinach/spinach_output.csv
- ../../data/Spinach/spinach_output.tsv



