In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Normalization: produce a 3-column CSV with:
  - query: natural language question
  - tools: JSON string (list of tool specs)
  - gold_call: JSON string {"name": ..., "arguments": {...}}

Input preference:
  1) data/raw/apigen.csv if present
  2) else: load from HF datasets API

Output:
  data/processed/apigen_normalized.csv
"""
import os, json, argparse
from typing import Any, Dict, Optional
import pandas as pd
try:
    from datasets import load_dataset
except Exception:
    load_dataset = None

# Local CSV that load_source() reads when it exists on disk.
RAW_PATH = "data/raw/apigen.csv"
# Default value of the --out command-line option in main().
OUT_PATH = "data/processed/apigen_normalized.csv"

# Accepted column-name variants, tried in the order listed.
# Candidate columns for the natural-language query (see _extract_query).
QUERY_KEYS = [
    "query", "prompt", "question", "input", "instruction", "user_query", "inputs", "task"
]
# Candidate columns for the tool definitions (see _extract_tools).
TOOLS_KEYS = [
    "tools", "tool_definitions", "tool_defs", "tools_json", "tool_schema",
    "function_definitions", "available_tools", "tool_list"
]
# Candidate columns for the gold call(s) (see _extract_gold_call).
ANSWERS_KEYS = [
    "answers", "answer", "calls", "tool_calls", "gold_calls", "gold_actions",
    "target", "gold", "gold_call"
]

def _safe_json_load(x: Any) -> Any:
    """Best-effort conversion of a raw cell value into a parsed JSON value.

    Args:
        x: A cell value: an already-parsed ``dict``/``list``, an array-like
            (numpy array, pandas Series), a missing marker (``None``/NaN),
            a plain number/bool, or a JSON-encoded ``str``/``bytes``.

    Returns:
        * ``x`` unchanged if it is a ``dict``, ``list``, number or bool;
        * a plain ``list`` if ``x`` is a 1-D-or-higher array-like;
        * ``None`` if ``x`` is missing or cannot be decoded as JSON;
        * otherwise the result of ``json.loads(x)``.
    """
    if isinstance(x, (dict, list)):
        return x
    # Array-likes must be handled before pd.isna(): for them pd.isna()
    # returns an element-wise array, and using that in ``if`` raises
    # "truth value of an array is ambiguous" for more than one element.
    if getattr(x, "ndim", 0) and callable(getattr(x, "tolist", None)):
        return x.tolist()
    if pd.isna(x):
        return None
    if isinstance(x, (int, float, bool)):
        return x
    try:
        return json.loads(x)
    except (TypeError, ValueError, RecursionError):
        # TypeError: not str/bytes; ValueError: malformed JSON
        # (json.JSONDecodeError subclasses it); RecursionError: absurd nesting.
        return None

def _extract_query(row: Dict[str, Any]) -> Optional[str]:
    """Return the first non-blank string found under any of QUERY_KEYS.

    Keys are tried in the order they appear in QUERY_KEYS. The matching
    value is returned as-is (not stripped); ``None`` is returned when no
    key holds a string with non-whitespace content.
    """
    candidates = (row.get(key) for key in QUERY_KEYS)
    return next(
        (text for text in candidates if isinstance(text, str) and text.strip()),
        None,
    )

def _extract_tools(row: Dict[str, Any]) -> Optional[list]:
    """Build the list of tool specs available for one raw row.

    Args:
        row: One raw record as a plain dict (column name -> cell value).

    Returns:
        A list of tool specs, or ``None`` when nothing usable is found.
        Resolution order:

        1. The first column in TOOLS_KEYS whose value is (or JSON-decodes
           to) a list or dict. A list is returned unchanged. A dict with a
           string ``"name"`` entry is treated as a single tool spec and
           wrapped in a list. Any other dict is treated as a
           ``{tool_name: spec}`` mapping and expanded to one entry per key.
        2. Otherwise, a minimal ``[{"name": ...}]`` built from the first
           non-blank string among ``tool_name`` / ``function_name`` /
           ``name``.
    """
    # 1) Prefer the dedicated tool-definition columns.
    for key in TOOLS_KEYS:
        if key not in row:
            continue
        value = row.get(key)
        if not isinstance(value, (list, dict)):
            value = _safe_json_load(value)
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            # A dict carrying its own string "name" is one tool spec, not a
            # name->spec mapping; expanding it by key would yield junk
            # entries such as {"name": "description"}.
            if isinstance(value.get("name"), str):
                return [value]
            return [
                {"name": tool_name, **(spec if isinstance(spec, dict) else {})}
                for tool_name, spec in value.items()
            ]
    # 2) Fallback: the row only names the gold tool, so emit a minimal
    #    one-entry tool list. Each key is checked individually because an
    #    ``a or b or c`` chain short-circuits on NaN (which is truthy) and
    #    would hide a valid name in a later column.
    for key in ("tool_name", "function_name", "name"):
        candidate = row.get(key)
        if isinstance(candidate, str) and candidate.strip():
            return [{"name": candidate.strip()}]
    return None

def _normalize_call_obj(x) -> Optional[dict]:
    """Coerce one call-like dict into ``{"name": str, "arguments": dict}``.

    The name is the first truthy value among ``tool_name`` / ``name`` /
    ``function_name``; the arguments are the first truthy value among
    ``arguments`` / ``args`` / ``parameters`` (default ``{}``). Non-dict
    arguments are JSON-decoded; anything that does not decode to a dict is
    wrapped as ``{"_raw": value}``.

    Returns ``None`` when ``x`` is not a dict or carries no name.
    """
    if not isinstance(x, dict):
        return None

    def first_truthy(keys, default):
        # Mirror an ``a or b or c or default`` chain over x's values.
        for key in keys:
            value = x.get(key)
            if value:
                return value
        return default

    call_name = first_truthy(("tool_name", "name", "function_name"), None)
    call_args = first_truthy(("arguments", "args", "parameters"), {})

    if not isinstance(call_args, dict):
        try:
            decoded = json.loads(call_args)
        except Exception:
            # Not decodable: keep the original value under "_raw".
            call_args = {"_raw": call_args}
        else:
            call_args = decoded if isinstance(decoded, dict) else {"_raw": decoded}

    if not call_name:
        return None
    return {"name": str(call_name), "arguments": call_args}

def _extract_gold_call(row: Dict[str, Any]) -> Optional[dict]:
    """Extract the single gold tool call for one raw row.

    Args:
        row: One raw record as a plain dict (column name -> cell value).

    Returns:
        ``{"name": str, "arguments": dict}`` or ``None``. Resolution order:

        1. The first column in ANSWERS_KEYS whose value is (or JSON-decodes
           to) a non-empty list or a dict that ``_normalize_call_obj``
           accepts. For a list only the first element is used.
        2. Otherwise, a call assembled from flat ``tool_name`` /
           ``function_name`` / ``name`` and ``arguments`` / ``args`` /
           ``parameters`` columns.
    """
    # 1) Look at the answers/calls style columns first.
    for key in ANSWERS_KEYS:
        if key not in row:
            continue
        value = row.get(key)
        if not isinstance(value, (list, dict)):
            value = _safe_json_load(value)
        if isinstance(value, list) and value:
            call = _normalize_call_obj(value[0])
            if call:
                return call
        if isinstance(value, dict):
            call = _normalize_call_obj(value)
            if call:
                return call

    # 2) Flat layout: the row is split into a tool-name column and an
    #    arguments column. Each candidate column is checked individually:
    #    a missing CSV cell is NaN, which is truthy, so an ``a or b`` chain
    #    would stop on it and emit the literal name "nan".
    name = None
    for key in ("tool_name", "function_name", "name"):
        candidate = row.get(key)
        if candidate is None or candidate is pd.NA:
            continue
        if isinstance(candidate, float) and candidate != candidate:  # NaN
            continue
        # Strip so the name agrees with the one _extract_tools emits.
        text = str(candidate).strip() if candidate else ""
        if text:
            name = text
            break
    if name is None:
        return None

    arguments: Any = {}
    for key in ("arguments", "args", "parameters"):
        raw = row.get(key)
        parsed = raw if isinstance(raw, dict) else _safe_json_load(raw)
        if parsed:
            arguments = parsed
            break
    if not isinstance(arguments, dict):
        # Valid JSON that is not an object (list, number, ...): keep it.
        arguments = {"_raw": arguments}
    return {"name": name, "arguments": arguments}

def load_source(repo: str, split: str) -> pd.DataFrame:
    """Load the raw dataset as a DataFrame.

    The local CSV at RAW_PATH takes precedence when it exists. Otherwise
    the Hugging Face dataset ``repo`` / ``split`` is loaded via
    ``load_dataset`` and converted with ``to_pandas()``.

    Raises:
        RuntimeError: no local CSV exists and ``load_dataset`` is
            unavailable (its import failed).
    """
    if not os.path.exists(RAW_PATH):
        if load_dataset is None:
            raise RuntimeError("datasets.load_dataset not available and no local raw found.")
        print(f"[normalize] Local raw not found. Loading from HF: {repo}:{split}")
        remote = load_dataset(repo, split=split)
        return remote.to_pandas()

    print(f"[normalize] Reading local raw: {RAW_PATH}")
    return pd.read_csv(RAW_PATH)

def main():
    """Normalize the raw dataset into a 3-column CSV (query, tools, gold_call).

    Command-line options:
        --repo   Hugging Face dataset id used when no local raw CSV exists.
        --split  Dataset split to load.
        --out    Output CSV path.

    Rows for which a query, a tool list or a gold call cannot be extracted
    are dropped. ``tools`` and ``gold_call`` are written as JSON strings.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", default="Salesforce/xlam-function-calling-60k")
    parser.add_argument("--split", default="train")
    parser.add_argument("--out", default=OUT_PATH)
    # Notebook kernels inject unknown arguments (e.g. -f=...);
    # parse_known_args ignores them instead of exiting.
    args, _ = parser.parse_known_args()

    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = load_source(args.repo, args.split)
    print(f"[normalize] Loaded {len(df)} raw rows")
    print(f"[normalize] Columns: {list(df.columns)[:20]}{' ...' if len(df.columns)>20 else ''}")

    norm_rows = []
    for _, row_s in df.iterrows():
        row = row_s.to_dict()
        q = _extract_query(row)
        t = _extract_tools(row)
        g = _extract_gold_call(row)
        # Skip rows missing any of the three required fields.
        if q is None or t is None or g is None:
            continue
        norm_rows.append({
            "query": q,
            "tools": json.dumps(t, ensure_ascii=False),
            "gold_call": json.dumps(g, ensure_ascii=False),
        })

    # Passing columns explicitly keeps the header even when no rows survive.
    out_df = pd.DataFrame(norm_rows, columns=["query", "tools", "gold_call"])
    print(f"[normalize] Kept {len(out_df)} normalized rows")
    out_df.to_csv(args.out, index=False, encoding="utf-8")
    print(f"[normalize] Wrote {args.out}")

if __name__ == "__main__":
    main()


[normalize] Local raw not found. Loading from HF: Salesforce/xlam-function-calling-60k:train


Generating train split: 100%|██████████| 60000/60000 [00:00<00:00, 145472.15 examples/s]


[normalize] Loaded 60000 raw rows
[normalize] Columns: ['id', 'query', 'answers', 'tools']
[normalize] Kept 60000 normalized rows
[normalize] Wrote data/processed/apigen_normalized.csv
