In [1]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import os
from pathlib import Path
import json

In [2]:

# How many rows from the start of the train split are exported.
NUM_SAMPLES = 4_000
# Target JSON file; the sample count is embedded in the file name.
OUTPUT_JSON_FILE = f"./data/input/sft_dataset_{NUM_SAMPLES}.json"

In [3]:
# Load the CTI-to-MITRE dataset by its repository id. The structure printed in
# the next cell shows a DatasetDict with a single 'train' split.
DATASET_REPO_ID = "HoangCuongNguyen/CTI-to-MITRE-dataset"
dataset = load_dataset(DATASET_REPO_ID)

In [4]:
# Show the split layout, then one raw row, to see the text format that
# process_example has to parse.
print(f"dataset structure: {dataset}")
first_train_sample = dataset['train'][0]
print(f"1st sample of train set: {first_train_sample}")

dataset structure: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 14427
    })
})
1st sample of train set: {'text': '### Human:Find the techniques and ID from MITRE ATT&CK framework.,TrickBot has used macros in Excel documents to download and deploy the malware on the user’s machine.### Assistant: T1059: Command and Scripting Interpreter'}


In [5]:
# Work on the 'train' split and report how many rows it holds.
train_dataset = dataset["train"]
train_row_count = len(train_dataset)
print(f"\n已选择 'train' 数据集，共 {train_row_count} 条记录。")


已选择 'train' 数据集，共 14427 条记录。


In [6]:
def process_example(example):
    """Convert one raw dataset row into an instruction/input/output record.

    The raw ``text`` field is expected to look like
    ``### Human:<instruction>,<input>### Assistant:<output>``.

    Args:
        example: Mapping with a ``text`` entry holding the raw sample string.

    Returns:
        dict with the string keys ``instruction``, ``input`` and ``output``.
        ``output`` is filled when the text contains exactly one
        ``### Assistant:`` marker; ``instruction`` and ``input`` are filled
        only if the human part additionally contains a comma. Every field
        that cannot be extracted is an empty string.
    """
    record = {"instruction": "", "input": "", "output": ""}

    raw_text = example.get('text')
    # A missing, null or non-string ``text`` yields an empty record instead of
    # raising on ``.strip()``.
    if not isinstance(raw_text, str):
        return record
    full_text = raw_text.strip()

    # Exactly one assistant marker is required; anything else is treated as
    # unparseable and left empty.
    parts = full_text.split('### Assistant:')
    if len(parts) != 2:
        return record
    human_part, assistant_part = parts
    record["output"] = assistant_part.strip()

    human_content = human_part.replace('### Human:', '').strip()
    # Split on the first comma only: the input text itself may contain commas.
    instruction, separator, input_text = human_content.partition(',')
    if separator:
        record["instruction"] = instruction.strip()
        record["input"] = input_text.strip()
    return record

In [7]:
# Keep the first NUM_SAMPLES rows. The count is clamped to the dataset size
# because Dataset.select rejects indices beyond the end of the dataset.
sample_count = min(NUM_SAMPLES, len(train_dataset))
if sample_count < NUM_SAMPLES:
    print(f"⚠️ 数据集仅有 {sample_count} 条记录，少于 NUM_SAMPLES={NUM_SAMPLES}。")
train_dataset = train_dataset.select(range(sample_count))

# Parse every raw row and drop the original columns so that only the
# instruction/input/output fields returned by process_example remain.
processed_dataset = train_dataset.map(
    process_example,
    remove_columns=train_dataset.column_names,
)

print("\n处理后的数据集结构:")
print(processed_dataset)

# Materialise the rows as plain dicts so they can be serialised with json.
all_records = list(processed_dataset)

if all_records:
    print("\n处理结果示例 (第一条):")
    print(json.dumps(all_records[0], indent=4, ensure_ascii=False))


try:
    print(f"准备将文件保存到: {OUTPUT_JSON_FILE}")

    # Create the output directory first: open(..., 'w') does not create
    # missing parent directories.
    Path(OUTPUT_JSON_FILE).parent.mkdir(parents=True, exist_ok=True)

    # json.dump writes the whole list in one call, giving a single top-level array.
    with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
        json.dump(all_records, f, indent=4, ensure_ascii=False)

    print(f"\n处理完成！数据集已通过手动方式正确保存到 {OUTPUT_JSON_FILE}")

    # Verify by parsing the file back instead of peeking at the first and last
    # character: this confirms the content is valid JSON, that the top level
    # is an array, and that no record was lost.
    print("\n正在验证已保存的文件...")
    try:
        with open(OUTPUT_JSON_FILE, 'r', encoding='utf-8') as f:
            saved_records = json.load(f)
    except ValueError:
        # json.JSONDecodeError is a ValueError: the file is not valid JSON.
        saved_records = None

    if isinstance(saved_records, list) and len(saved_records) == len(all_records):
        print("✅ 文件验证成功：文件以 '[' 开头，以 ']' 结尾。格式正确！")
    else:
        print("❌ 文件验证失败：文件格式仍然不正确。")

except Exception as e:
    # Best-effort cell: report the failure instead of aborting the notebook run.
    print(f"写入文件时发生错误: {e}")


处理后的数据集结构:
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 4000
})

处理结果示例 (第一条):
{
    "instruction": "Find the techniques and ID from MITRE ATT&CK framework.",
    "input": "TrickBot has used macros in Excel documents to download and deploy the malware on the user’s machine.",
    "output": "T1059: Command and Scripting Interpreter"
}
准备将文件保存到: ./data/input/sft_dataset_4000.json

处理完成！数据集已通过手动方式正确保存到 ./data/input/sft_dataset_4000.json

正在验证已保存的文件...
✅ 文件验证成功：文件以 '[' 开头，以 ']' 结尾。格式正确！


In [8]:




# The save cell only prints write errors, so confirm that the output file is
# actually on disk before announcing success.
if Path(OUTPUT_JSON_FILE).is_file():
    print("\n处理完成！数据集已成功保存。")
else:
    print(f"\n❌ 未找到输出文件 {OUTPUT_JSON_FILE}，数据集未保存。")


处理完成！数据集已成功保存。
