In [3]:
from datasets import load_from_disk

# Load the Arrow files saved on disk as a HuggingFace dataset.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset = load_from_disk("/home/ud202281368/jupyterlab/GraphLLM-dev/GTG/data/dataset/hf_dataset/msra/BFS-int_id")

# Inspect the first record of the 'dev' split
print(dataset['dev'][0])

{'task': 'BFS', 'graph': '[(1, 2), (1, 8), (1, 4), (1, 6), (1, 9), (1, 5), (2, 6), (2, 9), (8, 6), (8, 7), (8, 0), (4, 8), (6, 3), (6, 1), (6, 0), (9, 1), (9, 2), (9, 4), (5, 1), (5, 2), (5, 4), (0, 6), (7, 2), (7, 8), (7, 4), (7, 6)]', 'graph_adj': '{1: [2, 8, 4, 6, 9, 5],\n2: [6, 9],\n8: [6, 7, 0],\n4: [8],\n6: [3, 1, 0],\n9: [1, 2, 4],\n5: [1, 2, 4],\n0: [6],\n7: [2, 8, 4, 6],\n3: []}', 'graph_nl': 'Node 1 is connected to nodes 2, 8, 4, 6, 9, 5.\nNode 2 is connected to nodes 6, 9.\nNode 8 is connected to nodes 6, 7, 0.\nNode 4 is connected to node 8.\nNode 6 is connected to nodes 3, 1, 0.\nNode 9 is connected to nodes 1, 2, 4.\nNode 5 is connected to nodes 1, 2, 4.\nNode 0 is connected to node 6.\nNode 7 is connected to nodes 2, 8, 4, 6.', 'num_nodes': 10, 'num_edges': 26, 'directed': True, 'question': 'Start from node 1, output a sequence of traversal in breadth-first search (BFS) order.', 'answer': '[1, 2, 8, 4, 6, 9, 5, 7, 0, 3]', 'steps': "Let's run BFS step by step.\nInitial st

# in-domain datasets

In [None]:
import os
import json
from datasets import load_from_disk


# Task groups: the group a task belongs to decides which answer-format
# example is put into its prompt.

# Yes/no (judgement) tasks
type_1 = [
    "cycle-int_id",
    "connectivity-int_id",
    "edge-int_id",
]

# Traversal tasks
type_2 = [
    "BFS-int_id",
    "connected_component-int_id",
    "DFS-int_id",
    "euler_path-int_id",
    "hamiltonian_path-int_id",
    "neighbor-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# Numeric tasks
type_3 = [
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "degree-int_id",
    "diameter-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "page_rank-int_id",
]

# Pairing tasks
type_4 = [
    "bipartite-int_id",
]


def convert_splits_to_json(dataset_path, output_dir, dataset):
    """
    Convert every split of a Hugging Face dataset stored on disk into a JSON
    file of instruction/output records.

    For each split, ``<output_dir>/<split_name>.json`` is written. Every
    record carries ``instruction`` (prompt with an answer-format example, the
    natural-language graph description and the question), ``input`` (always
    ``""``), ``output`` (the answer wrapped as ``<<<answer>>>``) and ``id``.

    Args:
        dataset_path: Directory handed to ``load_from_disk``.
        output_dir: Destination directory; created when missing.
        dataset: Task folder name such as ``"BFS-int_id"``. It selects the
            answer-format example via ``type_1`` .. ``type_4``, and the part
            before the first ``-`` is upper-cased into the prompt.

    Raises:
        NotImplementedError: If ``dataset`` is in none of the four type lists.
    """
    # The answer-format example depends only on the task, so resolve it once,
    # up front: an unknown task then fails before anything is loaded or written.
    if dataset in type_1:
        example = "if your answer is Yes, please output <<<Yes>>> only."
    elif dataset in type_2:
        example = "if your answer is [4,2,5,0], please output <<<[4,2,5,0]>>> only."
    elif dataset in type_3:
        example = "if your answer is 0.3, please output <<<0.3>>> only."
    elif dataset in type_4:
        example = "if your answer is (0, 4), (1, 8), (2, 3), please output <<<[(0, 4), (1, 8), (2, 3)]>>> only."
    else:
        raise NotImplementedError

    # Record-independent first part of every prompt
    task_name = dataset.split('-')[0].upper()
    prompt_header = f"The following is a question related to the graph reasoning task {task_name}. Please answer the question and format your final response as <<<ANSWER>>>. For example, {example} \n"

    # Load dataset from local disk
    ds_dict = load_from_disk(dataset_path)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # One JSON file per split
    for split_name, split_ds in ds_dict.items():
        records = []
        for record in split_ds:
            # A missing or falsy 'directed' field means an undirected graph
            graph_type = 'directed' if record.get('directed', False) else 'undirected'

            instruction = prompt_header + (
                f"Given a {graph_type} graph with the following connections:\n"
                f"{record['graph_nl']}\n"
                f"{record['question']}"
            )

            records.append({
                "instruction": instruction,
                "input": "",
                "output": '<<<' + str(record['answer']) + '>>>',
                "id": record['id']
            })

        out_file = os.path.join(output_dir, f"{split_name}.json")
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=2)

        print(f"Saved {len(records)} records to {out_file}")



In [30]:
# Task folders to convert. Each name is used both as the sub-directory under
# msra/ and as the task key looked up in type_1..type_4.
folders = [
    "BFS-int_id",
    "bipartite-int_id",
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "connected_component-int_id",
    "connectivity-int_id",
    "cycle-int_id",
    "degree-int_id",
    "DFS-int_id",
    "diameter-int_id",
    "edge-int_id",
    "euler_path-int_id",
    "hamiltonian_path-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "neighbor-int_id",
    "page_rank-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# Show how many tasks will be processed
print(len(folders))

# Convert each task: read msra/<task>, write graphinstruct/<task>/<split>.json
for dataset in folders:
    dataset_path = f'msra/{dataset}'
    output_dir = f'graphinstruct/{dataset}'
    convert_splits_to_json(dataset_path, output_dir, dataset)



21
Saved 800 records to graphinstruct/BFS-int_id/train.json
Saved 100 records to graphinstruct/BFS-int_id/dev.json
Saved 100 records to graphinstruct/BFS-int_id/test.json
Saved 800 records to graphinstruct/bipartite-int_id/train.json
Saved 100 records to graphinstruct/bipartite-int_id/dev.json
Saved 100 records to graphinstruct/bipartite-int_id/test.json
Saved 800 records to graphinstruct/clustering_coefficient-int_id/train.json
Saved 100 records to graphinstruct/clustering_coefficient-int_id/dev.json
Saved 100 records to graphinstruct/clustering_coefficient-int_id/test.json
Saved 800 records to graphinstruct/common_neighbor-int_id/train.json
Saved 100 records to graphinstruct/common_neighbor-int_id/dev.json
Saved 100 records to graphinstruct/common_neighbor-int_id/test.json
Saved 800 records to graphinstruct/connected_component-int_id/train.json
Saved 100 records to graphinstruct/connected_component-int_id/dev.json
Saved 100 records to graphinstruct/connected_component-int_id/test.jso

# in-domain Ground Truth

In [24]:
import os
import json
import csv
from datasets import load_from_disk

def convert_splits_to_csv(dataset_path, output_dir, dataset):
    """
    Dump all splits of a Hugging Face dataset stored on disk into one CSV file.

    The rows of every split are written, in split iteration order, to
    ``<output_dir>/<dataset>.csv`` under a single header row. The header is
    the feature list of the last split with its final column left out.

    Args:
        dataset_path: Directory handed to ``load_from_disk``.
        output_dir: Destination directory; created when missing.
        dataset: Task folder name; used as the CSV file's base name.

    Raises:
        ValueError: If the loaded dataset contains no splits.
    """
    # Load dataset from local disk
    ds_dict = load_from_disk(dataset_path)

    splits = list(ds_dict.values())
    if not splits:
        # Without this guard the header lookup below has nothing to read.
        raise ValueError(f"No splits found in dataset at {dataset_path!r}")

    # NOTE(review): the last feature column is deliberately dropped; which
    # column that is depends on the dataset schema — confirm it is intended.
    col_name = list(splits[-1].features.keys())[:-1]

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    csv_file = os.path.join(output_dir, f'{dataset}.csv')

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; otherwise rows can get an extra '\r' on
    # platforms that translate '\n' on write.
    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(col_name)  # header
        for split_ds in splits:
            for record in split_ds:
                writer.writerow([record[col] for col in col_name])

# Export every task to graphinstruct/<task>/<task>.csv.
# NOTE: `folders` is not defined in this cell; it must already exist in the
# kernel (it is assigned in other cells of this notebook).
for dataset in folders:
    dataset_path = f'msra/{dataset}'
    output_dir = f'graphinstruct/{dataset}'
    convert_splits_to_csv(dataset_path, output_dir, dataset)


# OOD Graph description languages

In [32]:
import os
import json
from datasets import load_from_disk


# Task groups: the group a task belongs to decides which answer-format
# example is put into its prompt.

# Yes/no (judgement) tasks
type_1 = [
    "cycle-int_id",
    "connectivity-int_id",
    "edge-int_id",
]

# Traversal tasks
type_2 = [
    "BFS-int_id",
    "connected_component-int_id",
    "DFS-int_id",
    "euler_path-int_id",
    "hamiltonian_path-int_id",
    "neighbor-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# Numeric tasks
type_3 = [
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "degree-int_id",
    "diameter-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "page_rank-int_id",
]

# Pairing tasks
type_4 = [
    "bipartite-int_id",
]


def convert_splits_to_json_gdl(dataset_path, output_dir, dataset, gdl):
    """
    Convert the ``test`` split of a Hugging Face dataset stored on disk into
    a JSON file of instruction/output records, taking the graph text from a
    selectable record field (graph description language, GDL).

    Only the split named ``"test"`` is exported, to
    ``<output_dir>/test.json``; all other splits are skipped.

    Args:
        dataset_path: Directory handed to ``load_from_disk``.
        output_dir: Destination directory; created when missing.
        dataset: Task folder name such as ``"BFS-int_id"``. It selects the
            answer-format example via ``type_1`` .. ``type_4``, and the part
            before the first ``-`` is upper-cased into the prompt.
        gdl: ``'adj'`` to use the record's ``graph_adj`` field, or ``'el'``
            to use its ``graph`` field.

    Raises:
        ValueError: If ``gdl`` is neither ``'adj'`` nor ``'el'``.
        NotImplementedError: If ``dataset`` is in none of the four type lists.
    """
    # Validate gdl first: previously an unknown value left the graph text
    # unassigned and surfaced as an unrelated unbound-variable error.
    graph_field_by_gdl = {'adj': 'graph_adj', 'el': 'graph'}
    if gdl not in graph_field_by_gdl:
        raise ValueError(f"Unknown gdl {gdl!r}; expected 'adj' or 'el'")
    graph_field = graph_field_by_gdl[gdl]

    # The answer-format example depends only on the task, so resolve it once.
    if dataset in type_1:
        example = "if your answer is Yes, please output <<<Yes>>> only."
    elif dataset in type_2:
        example = "if your answer is [4,2,5,0], please output <<<[4,2,5,0]>>> only."
    elif dataset in type_3:
        example = "if your answer is 0.3, please output <<<0.3>>> only."
    elif dataset in type_4:
        example = "if your answer is (0, 4), (1, 8), (2, 3), please output <<<[(0, 4), (1, 8), (2, 3)]>>> only."
    else:
        raise NotImplementedError

    # Record-independent first part of every prompt
    task_name = dataset.split('-')[0].upper()
    prompt_header = f"The following is a question related to the graph reasoning task {task_name}. Please answer the question and format your final response as <<<ANSWER>>>. For example, {example} \n"

    # Load dataset from local disk
    ds_dict = load_from_disk(dataset_path)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_ds in ds_dict.items():
        # Only the test split is exported
        if split_name != 'test':
            continue

        records = []
        for record in split_ds:
            # A missing or falsy 'directed' field means an undirected graph
            graph_type = 'directed' if record.get('directed', False) else 'undirected'
            graph = record[graph_field]

            instruction = prompt_header + (
                f"Given a {graph_type} graph with the following connections:\n"
                f"{graph}\n"
                f"{record['question']}"
            )

            records.append({
                "instruction": instruction,
                "input": "",
                "output": '<<<' + str(record['answer']) + '>>>',
                "id": record['id']
            })

        out_file = os.path.join(output_dir, f"{split_name}.json")
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=2)

        print(f"Saved {len(records)} records to {out_file}")


# Task folders to convert. Each name is used both as the sub-directory under
# msra/ and as the task key looked up in type_1..type_4.
folders = [
    "BFS-int_id",
    "bipartite-int_id",
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "connected_component-int_id",
    "connectivity-int_id",
    "cycle-int_id",
    "degree-int_id",
    "DFS-int_id",
    "diameter-int_id",
    "edge-int_id",
    "euler_path-int_id",
    "hamiltonian_path-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "neighbor-int_id",
    "page_rank-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# For every task, export the test split twice: once with gdl='el' (the
# record's 'graph' field) and once with gdl='adj' (its 'graph_adj' field).
for dataset in folders:
    dataset_path = f'msra/{dataset}'
    output_dir = f'generalization/edgelist/{dataset}'
    convert_splits_to_json_gdl(dataset_path, output_dir, dataset, gdl='el')

    output_dir = f'generalization/adj/{dataset}'
    convert_splits_to_json_gdl(dataset_path, output_dir, dataset, gdl='adj')



Saved 100 records to generalization/edgelist/BFS-int_id/test.json
Saved 100 records to generalization/adj/BFS-int_id/test.json
Saved 100 records to generalization/edgelist/bipartite-int_id/test.json
Saved 100 records to generalization/adj/bipartite-int_id/test.json
Saved 100 records to generalization/edgelist/clustering_coefficient-int_id/test.json
Saved 100 records to generalization/adj/clustering_coefficient-int_id/test.json
Saved 100 records to generalization/edgelist/common_neighbor-int_id/test.json
Saved 100 records to generalization/adj/common_neighbor-int_id/test.json
Saved 100 records to generalization/edgelist/connected_component-int_id/test.json
Saved 100 records to generalization/adj/connected_component-int_id/test.json
Saved 100 records to generalization/edgelist/connectivity-int_id/test.json
Saved 100 records to generalization/adj/connectivity-int_id/test.json
Saved 100 records to generalization/edgelist/cycle-int_id/test.json
Saved 100 records to generalization/adj/cycle-

# OOD letter IDs

In [37]:
import os
import json
import csv

# The four task groups (letter-ID variants); the group decides which
# answer-format example is put into the prompt.

# Yes/no (judgement) tasks
type_1 = [
    "cycle-letter_id",
    "connectivity-letter_id",
    "edge-letter_id",
]

# Traversal tasks
type_2 = [
    "BFS-letter_id",
    "connected_component-letter_id",
    "DFS-letter_id",
    "euler_path-letter_id",
    "hamiltonian_path-letter_id",
    "neighbor-letter_id",
    "predecessor-letter_id",
    "shortest_path-letter_id",
    "topological_sort-letter_id",
]

# Numeric tasks
type_3 = [
    "clustering_coefficient-letter_id",
    "common_neighbor-letter_id",
    "degree-letter_id",
    "diameter-letter_id",
    "jaccard-letter_id",
    "maximum_flow-letter_id",
    "MST-letter_id",
    "page_rank-letter_id",
]

# Pairing tasks
type_4 = [
    "bipartite-letter_id",
]


def convert_csv_to_json_letter(input_csv_path, output_dir, dataset):
    """
    Turn a CSV of graph-reasoning examples into ``<output_dir>/test.json``.

    Each CSV row becomes one record with ``instruction`` (prompt, graph
    description from ``graph_nl`` and the ``question``), ``input`` (always
    ``""``), ``output`` (``<<<answer>>>``) and ``id`` (integer, or ``None``
    when the CSV has no ``id`` column).

    Raises:
        NotImplementedError: If ``dataset`` is in none of ``type_1`` .. ``type_4``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Pick the answer-format example that matches the task's group
    example_by_group = (
        (type_1, "if your answer is Yes, please output <<<Yes>>> only."),
        (type_2, "if your answer is [KET, NER, OXR, ISY], please output <<<[KET, NER, OXR, ISY]>>> only."),
        (type_3, "if your answer is 0.3, please output <<<0.3>>> only."),
        (type_4, "if your answer is (KUE, IRW), (ODS, YTR), (PIA, LNV), please output <<<[(KUE, IRW), (ODS, YTR), (PIA, LNV)]>>> only."),
    )
    for group, candidate in example_by_group:
        if dataset in group:
            example = candidate
            break
    else:
        raise NotImplementedError(f"Unknown dataset type: {dataset}")

    task_name = dataset.split('-')[0].upper()
    preamble = f"The following is a question related to the graph reasoning task {task_name}. Please answer the question and format your final response as <<<ANSWER>>>. For example, {example} \n"

    records = []
    with open(input_csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # Only the text 'true' (any case, surrounding blanks ignored)
            # marks a directed graph
            is_directed = row.get('directed', '').strip().lower() == 'true'
            graph_type = 'directed' if is_directed else 'undirected'

            instruction = preamble + "".join([
                f"Given a {graph_type} graph with the following connections:\n",
                f"{row['graph_nl']}\n",
                f"{row['question']}",
            ])
            output = "<<<" + f"{row['answer']}" + ">>>"
            record_id = int(row['id']) if 'id' in row else None

            records.append({
                "instruction": instruction,
                "input": "",
                "output": output,
                "id": record_id,
            })

    out_file = os.path.join(output_dir, "test.json")
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved {len(records)} records to {out_file}")

# Batch processing: convert the letter-ID CSV of every task
folders = [
    "BFS-letter_id", "bipartite-letter_id", "clustering_coefficient-letter_id", "common_neighbor-letter_id",
    "connected_component-letter_id", "connectivity-letter_id", "cycle-letter_id", "degree-letter_id",
    "DFS-letter_id", "diameter-letter_id", "edge-letter_id", "euler_path-letter_id",
    "hamiltonian_path-letter_id", "jaccard-letter_id", "maximum_flow-letter_id", "MST-letter_id",
    "neighbor-letter_id", "page_rank-letter_id", "predecessor-letter_id", "shortest_path-letter_id",
    "topological_sort-letter_id",
]

# Input CSV and output test.json live in the same per-task directory.
for dataset in folders:
    input_csv = f"/home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/{dataset}/{dataset}.csv"  # assumes every CSV file exists in this directory
    output_dir = f"/home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/{dataset}"
    convert_csv_to_json_letter(input_csv, output_dir, dataset)


✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/BFS-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/bipartite-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/clustering_coefficient-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/common_neighbor-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/connected_component-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/connectivity-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/cycle-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Factory/data/graphinstruct/degree-letter_id/test.json
✅ Saved 100 records to /home/ud202281368/jupyterlab/LLaMA-Fac

# OOD Graph Size

In [40]:
import os
import json
from datasets import load_from_disk


# Task groups: the group a task belongs to decides which answer-format
# example is put into its prompt.

# Yes/no (judgement) tasks
type_1 = ["cycle-int_id", "connectivity-int_id", "edge-int_id"]

# Traversal tasks
type_2 = ["BFS-int_id", "connected_component-int_id", "DFS-int_id", "euler_path-int_id", "hamiltonian_path-int_id", "neighbor-int_id", "predecessor-int_id", "shortest_path-int_id", "topological_sort-int_id"]

# Numeric tasks
type_3 = ["clustering_coefficient-int_id", "common_neighbor-int_id", "degree-int_id", "diameter-int_id", "jaccard-int_id", "maximum_flow-int_id", "MST-int_id", "page_rank-int_id"]

# Pairing tasks
type_4 = ["bipartite-int_id"]


def convert_csv_to_json_size(input_csv_path, output_dir, dataset):
    """
    Turn a CSV of graph-reasoning examples into ``<output_dir>/test.json``.

    Each CSV row becomes one record with ``instruction`` (prompt, graph
    description from ``graph_nl`` and the ``question``), ``input`` (always
    ``""``), ``output`` (``<<<answer>>>``) and ``id`` (integer, or ``None``
    when the CSV has no ``id`` column).

    Args:
        input_csv_path: CSV file with at least the columns ``graph_nl``,
            ``question`` and ``answer``; ``directed`` and ``id`` are optional.
        output_dir: Destination directory; created when missing.
        dataset: Task folder name such as ``"BFS-int_id"``. It selects the
            answer-format example via ``type_1`` .. ``type_4``, and the part
            before the first ``-`` is upper-cased into the prompt.

    Raises:
        NotImplementedError: If ``dataset`` is in none of the four type lists.
    """
    # Imported locally because this cell's import block does not bring in
    # csv; without it the function only worked when another cell had
    # already imported the module into the kernel.
    import csv

    os.makedirs(output_dir, exist_ok=True)
    records = []

    # Answer-format example for the task's group
    if dataset in type_1:
        example = "if your answer is Yes, please output <<<Yes>>> only."
    elif dataset in type_2:
        example = "if your answer is [4,2,5,0], please output <<<[4,2,5,0]>>> only."
    elif dataset in type_3:
        example = "if your answer is 0.3, please output <<<0.3>>> only."
    elif dataset in type_4:
        example = "if your answer is (0, 4), (1, 8), (2, 3), please output <<<[(0, 4), (1, 8), (2, 3)]>>> only."
    else:
        raise NotImplementedError(f"Unknown dataset type: {dataset}")

    with open(input_csv_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Only the text 'true' (any case, surrounding blanks ignored)
            # marks a directed graph
            graph_type = 'directed' if row.get('directed', '').strip().lower() == 'true' else 'undirected'

            instruction = f"The following is a question related to the graph reasoning task {dataset.split('-')[0].upper()}. Please answer the question and format your final response as <<<ANSWER>>>. For example, {example} \n"
            instruction += (
                f"Given a {graph_type} graph with the following connections:\n"
                f"{row['graph_nl']}\n"
                f"{row['question']}"
            )
            output = f"<<<{row['answer']}>>>"
            records.append({
                "instruction": instruction,
                "input": "",
                "output": output,
                "id": int(row['id']) if 'id' in row else None
            })

    out_file = os.path.join(output_dir, "test.json")
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved {len(records)} records to {out_file}")

# Batch processing: convert the CSV of every task at every graph size
folders = [
    "BFS-int_id",
    "bipartite-int_id",
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "connected_component-int_id",
    "connectivity-int_id",
    "cycle-int_id",
    "degree-int_id",
    "DFS-int_id",
    "diameter-int_id",
    "edge-int_id",
    "euler_path-int_id",
    "hamiltonian_path-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "neighbor-int_id",
    "page_rank-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# One loop over the size buckets replaces three copy-pasted stanzas; the
# call order (mini, medium, large for each task) is unchanged.
# NOTE(review): inputs are read from an absolute path while outputs go to a
# path relative to the working directory — confirm that is intended.
for dataset in folders:
    for size in ("mini", "medium", "large"):
        input_csv = f"/home/ud202281368/jupyterlab/LLaMA-Factory/data/generalization/{size}/{dataset}/{dataset}.csv"
        output_dir = f'generalization/{size}/{dataset}'
        convert_csv_to_json_size(input_csv, output_dir, dataset)



✅ Saved 100 records to generalization/mini/BFS-int_id/test.json
✅ Saved 100 records to generalization/medium/BFS-int_id/test.json
✅ Saved 100 records to generalization/large/BFS-int_id/test.json
✅ Saved 100 records to generalization/mini/bipartite-int_id/test.json
✅ Saved 100 records to generalization/medium/bipartite-int_id/test.json
✅ Saved 100 records to generalization/large/bipartite-int_id/test.json
✅ Saved 100 records to generalization/mini/clustering_coefficient-int_id/test.json
✅ Saved 100 records to generalization/medium/clustering_coefficient-int_id/test.json
✅ Saved 100 records to generalization/large/clustering_coefficient-int_id/test.json
✅ Saved 100 records to generalization/mini/common_neighbor-int_id/test.json
✅ Saved 100 records to generalization/medium/common_neighbor-int_id/test.json
✅ Saved 100 records to generalization/large/common_neighbor-int_id/test.json
✅ Saved 100 records to generalization/mini/connected_component-int_id/test.json
✅ Saved 100 records to genera

# Step-by-step reasoning datasets

In [3]:
import os
import json
import csv
from datasets import load_from_disk


# Task groups: the group a task belongs to decides which answer-format
# example is put into its prompt.

# Yes/no (judgement) tasks
type_1 = [
    "cycle-int_id",
    "connectivity-int_id",
    "edge-int_id",
]

# Traversal tasks
type_2 = [
    "BFS-int_id",
    "connected_component-int_id",
    "DFS-int_id",
    "euler_path-int_id",
    "hamiltonian_path-int_id",
    "neighbor-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# Numeric tasks
type_3 = [
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "degree-int_id",
    "diameter-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "page_rank-int_id",
]

# Pairing tasks
type_4 = [
    "bipartite-int_id",
]


def convert_csv_to_json_steps(input_csv_path, output_dir, dataset):
    """
    Turn a CSV of step-annotated graph-reasoning examples into
    ``<output_dir>/test.json``.

    Each CSV row becomes one record with ``instruction`` (step-by-step
    prompt, graph description from ``graph_nl`` and the ``question``),
    ``input`` (always ``""``), ``output`` (the ``steps`` text followed by
    ``<<<answer>>>``) and ``id`` (integer, or ``None`` when the CSV has no
    ``id`` column).

    Raises:
        NotImplementedError: If ``dataset`` is in none of ``type_1`` .. ``type_4``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Answer-format example for the task's group (first matching group wins)
    example_by_group = (
        (type_1, "if your answer is Yes, please format your answer as <<<Yes>>>."),
        (type_2, "if your answer is [4,2,5,0], please format your answer as <<<[4,2,5,0]>>>."),
        (type_3, "if your answer is 0.3, please format your answer as <<<0.3>>>."),
        (type_4, "if your answer is (0, 4), (1, 8), (2, 3), please format your answer as <<<[(0, 4), (1, 8), (2, 3)]>>>."),
    )
    example = next((text for group, text in example_by_group if dataset in group), None)
    if example is None:
        raise NotImplementedError

    task_name = dataset.split('-')[0].upper()
    preamble = f"The following is a question related to the graph reasoning task {task_name}. Please answer the question step by step and format your final response as <<<ANSWER>>>. For example, {example} \n"

    records = []
    with open(input_csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # Only the text 'true' (any case, surrounding blanks ignored)
            # marks a directed graph
            is_directed = row.get('directed', '').strip().lower() == 'true'
            graph_type = 'directed' if is_directed else 'undirected'

            instruction = preamble + "".join([
                f"Given a {graph_type} graph with the following connections:\n",
                f"{row['graph_nl']}\n",
                f"{row['question']}",
            ])
            # Reasoning steps first, then the wrapped final answer
            output = f"{row['steps']} " + f"<<<{row['answer']}>>>"
            record_id = int(row['id']) if 'id' in row else None

            records.append({
                "instruction": instruction,
                "input": "",
                "output": output,
                "id": record_id,
            })

    out_file = os.path.join(output_dir, "test.json")
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved {len(records)} records to {out_file}")

# Batch processing: convert the step-annotated CSV of every listed task
folders = [
    "BFS-int_id",
    "bipartite-int_id",
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "connected_component-int_id",
    "connectivity-int_id",
    "cycle-int_id",
    "degree-int_id",
    "DFS-int_id",
    "diameter-int_id",
    "edge-int_id",
    # NOTE(review): the next two tasks are excluded from this run; the
    # reason is not recorded here — confirm before re-enabling.
    # "euler_path-int_id",
    # "hamiltonian_path-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "neighbor-int_id",
    "page_rank-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# Inputs are read from an absolute path; outputs go to reasoning/small/<task>
# relative to the working directory.
for dataset in folders:
    input_csv = f"/home/ud202281368/jupyterlab/LLaMA-Factory/data/reasoning/small/{dataset}/{dataset}.csv"
    output_dir = f'reasoning/small/{dataset}'
    convert_csv_to_json_steps(input_csv, output_dir, dataset)



✅ Saved 10000 records to reasoning/small/BFS-int_id/test.json
✅ Saved 10000 records to reasoning/small/bipartite-int_id/test.json
✅ Saved 10000 records to reasoning/small/clustering_coefficient-int_id/test.json
✅ Saved 10000 records to reasoning/small/common_neighbor-int_id/test.json
✅ Saved 10000 records to reasoning/small/connected_component-int_id/test.json
✅ Saved 10000 records to reasoning/small/connectivity-int_id/test.json
✅ Saved 10000 records to reasoning/small/cycle-int_id/test.json
✅ Saved 10000 records to reasoning/small/degree-int_id/test.json
✅ Saved 10000 records to reasoning/small/DFS-int_id/test.json
✅ Saved 10000 records to reasoning/small/diameter-int_id/test.json
✅ Saved 10000 records to reasoning/small/edge-int_id/test.json
✅ Saved 10000 records to reasoning/small/jaccard-int_id/test.json
✅ Saved 10000 records to reasoning/small/maximum_flow-int_id/test.json
✅ Saved 10000 records to reasoning/small/MST-int_id/test.json
✅ Saved 10000 records to reasoning/small/neigh

# In-domain reasoning test

In [4]:
import os
import json
from datasets import load_from_disk


# Task groups: the group a task belongs to decides which answer-format
# example is put into its prompt.

# Yes/no (judgement) tasks
type_1 = [
    "cycle-int_id",
    "connectivity-int_id",
    "edge-int_id",
]

# Traversal tasks
type_2 = [
    "BFS-int_id",
    "connected_component-int_id",
    "DFS-int_id",
    "euler_path-int_id",
    "hamiltonian_path-int_id",
    "neighbor-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
    "topological_sort-int_id",
]

# Numeric tasks
type_3 = [
    "clustering_coefficient-int_id",
    "common_neighbor-int_id",
    "degree-int_id",
    "diameter-int_id",
    "jaccard-int_id",
    "maximum_flow-int_id",
    "MST-int_id",
    "page_rank-int_id",
]

# Pairing tasks
type_4 = [
    "bipartite-int_id",
]


def convert_splits_to_json(dataset_path, output_dir, dataset):
    """
    Convert the ``test`` split of a Hugging Face dataset stored on disk into
    a JSON file of step-by-step instruction/output records.

    Only the split named ``"test"`` is exported, to
    ``<output_dir>/test.json``; all other splits are skipped. Every record
    carries ``instruction`` (step-by-step prompt, natural-language graph
    description and question), ``input`` (always ``""``), ``output`` (the
    record's ``steps`` text followed by ``<<<answer>>>``) and ``id``.

    NOTE: this notebook defines another function with this same name in an
    earlier cell (it exports every split, without steps); whichever cell was
    executed last determines which definition is active.

    Args:
        dataset_path: Directory handed to ``load_from_disk``.
        output_dir: Destination directory; created when missing.
        dataset: Task folder name such as ``"DFS-int_id"``. It selects the
            answer-format example via ``type_1`` .. ``type_4``, and the part
            before the first ``-`` is upper-cased into the prompt.

    Raises:
        NotImplementedError: If ``dataset`` is in none of the four type lists.
    """
    # The answer-format example depends only on the task, so resolve it once,
    # up front: an unknown task then fails before anything is loaded or written.
    if dataset in type_1:
        example = "if your answer is Yes, please format your answer as <<<Yes>>>."
    elif dataset in type_2:
        example = "if your answer is [4,2,5,0], please format your answer as <<<[4,2,5,0]>>>."
    elif dataset in type_3:
        example = "if your answer is 0.3, please format your answer as <<<0.3>>>."
    elif dataset in type_4:
        example = "if your answer is (0, 4), (1, 8), (2, 3), please format your answer as <<<[(0, 4), (1, 8), (2, 3)]>>>."
    else:
        raise NotImplementedError

    # Record-independent first part of every prompt
    task_name = dataset.split('-')[0].upper()
    prompt_header = f"The following is a question related to the graph reasoning task {task_name}. Please answer the question step by step and format your final response as <<<ANSWER>>>. For example, {example} \n"

    # Load dataset from local disk
    ds_dict = load_from_disk(dataset_path)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_ds in ds_dict.items():
        # Only the test split is exported
        if split_name != 'test':
            continue

        records = []
        for record in split_ds:
            # A missing or falsy 'directed' field means an undirected graph
            graph_type = 'directed' if record.get('directed', False) else 'undirected'

            instruction = prompt_header + (
                f"Given a {graph_type} graph with the following connections:\n"
                f"{record['graph_nl']}\n"
                f"{record['question']}"
            )

            # Reasoning steps first, then the wrapped final answer
            output = f"{record['steps']} <<<{record['answer']}>>>"

            records.append({
                "instruction": instruction,
                "input": "",
                "output": output,
                "id": record['id']
            })

        out_file = os.path.join(output_dir, f"{split_name}.json")
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=2)

        print(f"Saved {len(records)} records to {out_file}")

# Subset of tasks used for the in-domain reasoning test
folders = [
    "common_neighbor-int_id",
    "connectivity-int_id",
    "degree-int_id",
    "DFS-int_id",
    "predecessor-int_id",
    "shortest_path-int_id",
]

# Show how many tasks will be processed
print(len(folders))

# Convert each task: read msra/<task>, write under reasoning_test/<task>
for dataset in folders:
    dataset_path = f'msra/{dataset}'
    output_dir = f'reasoning_test/{dataset}'
    convert_splits_to_json(dataset_path, output_dir, dataset)


6
Saved 100 records to reasoning_test/common_neighbor-int_id/test.json
Saved 100 records to reasoning_test/connectivity-int_id/test.json
Saved 100 records to reasoning_test/degree-int_id/test.json
Saved 100 records to reasoning_test/DFS-int_id/test.json
Saved 100 records to reasoning_test/predecessor-int_id/test.json
Saved 100 records to reasoning_test/shortest_path-int_id/test.json
