# 提取特征值

In [7]:
import ijson
import os
import csv
from tqdm import tqdm

In [8]:
# Directory that is scanned for the input ``*.json`` files.
input_dir = "json_data"
# Path of the CSV file that receives the extracted rows.
output_file = "dataset.csv"

In [9]:
def _as_dicts(node):
    """Yield every dict held by *node* (a dict, a list of dicts, or anything else)."""
    if isinstance(node, dict):
        yield node
    elif isinstance(node, list):
        for item in node:
            if isinstance(item, dict):
                yield item


def _iter_handshakes(layers):
    """Yield each ``tls.handshake`` dict found under ``layers["tls"]["tls.record"]``."""
    for tls in _as_dicts(layers.get("tls")):
        for record in _as_dicts(tls.get("tls.record")):
            yield from _as_dicts(record.get("tls.handshake"))


def extract_sni(layers):
    """Return the TLS Server Name Indication value of a packet, if present.

    Args:
        layers: The ``layers`` mapping of one decoded packet.

    Returns:
        The value stored under ``tls.handshake.extensions_server_name`` of the
        first ``Extension: server_name...`` entry that carries it, or ``None``
        when the packet has no such entry.

    Each of the ``tls``, ``tls.record`` and ``tls.handshake`` nodes may be
    either a single dict or a list of dicts (presumably what the exporter
    emits when one packet carries several records/handshakes -- confirm
    against the capture tool). Nodes of any other type are skipped instead of
    raising, so malformed packets simply yield ``None``.
    """
    for handshake in _iter_handshakes(layers):
        for extension_key, extension_value in handshake.items():
            if not extension_key.startswith("Extension: server_name"):
                continue
            if not isinstance(extension_value, dict):
                continue
            sni_extension = extension_value.get("Server Name Indication extension")
            if not isinstance(sni_extension, dict):
                continue
            server_name = sni_extension.get("tls.handshake.extensions_server_name")
            if server_name is not None:
                return server_name

    return None

In [10]:
# Collect the path of every JSON file found directly inside input_dir.
json_names = (name for name in os.listdir(input_dir) if name.endswith(".json"))
json_files = [os.path.join(input_dir, name) for name in json_names]

In [11]:
# Create (or truncate) the output CSV and write its header row.
# Columns: four metadata fields followed by len1 .. len30.
fieldnames = ["classes", "filename", "stream_index", "sni"]
fieldnames.extend(f"len{position}" for position in range(1, 31))

with open(output_file, "w", newline="", encoding="utf-8-sig") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

In [12]:
# Process each JSON file: stream its packets, collect per-TCP-stream SNI
# values and TCP payload lengths, then append one CSV row for every stream
# in which an SNI was seen.
max_lengths = 30  # must match the len1 .. len30 columns of the CSV header

for json_file in json_files:
    base_name = os.path.basename(json_file)
    file_size = os.path.getsize(json_file)
    sni_dict = {}  # stream index -> set of SNI values seen on that stream
    tcp_len_dict = {}  # stream index -> first `max_lengths` tcp.len values

    # Binary mode: ijson consumes bytes natively, and f.tell() is then a real
    # byte offset (in text mode it is an opaque cookie), so it can be compared
    # with the file size that drives the progress bar.
    with open(json_file, "rb") as f:
        with tqdm(
            total=file_size,
            desc=f"Processing {base_name}",
            unit="B",
            unit_scale=True,
        ) as pbar:
            for packet in ijson.items(f, "item"):
                pbar.update(f.tell() - pbar.n)
                layers = packet.get("_source", {}).get("layers", {})
                if not layers or "tcp" not in layers:
                    continue

                stream_index = layers["tcp"].get("tcp.stream")
                if stream_index is None:
                    continue

                if "tls" in layers:
                    sni = extract_sni(layers)
                    if sni:
                        sni_dict.setdefault(stream_index, set()).add(sni)

                # Only the first `max_lengths` values are ever written, so do
                # not keep the rest in memory.
                stream_lengths = tcp_len_dict.setdefault(stream_index, [])
                if len(stream_lengths) < max_lengths:
                    stream_lengths.append(int(layers["tcp"].get("tcp.len", 0)))

            # Account for any bytes read after the last packet was yielded.
            pbar.update(file_size - pbar.n)

    # Flush this file's rows before moving on to the next file.
    class_label = os.path.splitext(base_name)[0]
    with open(output_file, "a", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for stream_index, sni_set in sni_dict.items():
            tcp_lengths = tcp_len_dict.get(stream_index, [])
            # Pad with zeros up to exactly `max_lengths` entries.
            tcp_lengths = tcp_lengths + [0] * (max_lengths - len(tcp_lengths))
            row = {
                "classes": class_label,
                "filename": base_name,
                "stream_index": stream_index,
                # Sorted so the cell content does not depend on set ordering.
                "sni": ", ".join(sorted(sni_set)),
            }
            row.update(
                {f"len{i+1}": length for i, length in enumerate(tcp_lengths)}
            )
            writer.writerow(row)

print("CSV file created successfully.")

Processing www.news.cn.json:   0%|          | 0.00/6.25G [00:00<?, ?B/s]

Processing www.news.cn.json: 100%|██████████| 6.25G/6.25G [00:26<00:00, 236MB/s]
Processing www.qimao.com.json: 100%|██████████| 132M/132M [00:00<00:00, 192MB/s] 
Processing dxy.com.json: 100%|██████████| 58.8M/58.8M [00:00<00:00, 165MB/s]
Processing bbs.elecfans.com.json: 100%|██████████| 35.6M/35.6M [00:00<00:00, 210MB/s]
Processing www.7k7k.com.json: 100%|██████████| 1.48G/1.48G [00:06<00:00, 236MB/s]
Processing bbs.kanxue.com.json: 100%|██████████| 9.28G/9.28G [00:42<00:00, 218MB/s]
Processing my.4399.com.json: 100%|██████████| 2.60G/2.60G [00:10<00:00, 254MB/s]
Processing www.haodf.com.json: 100%|██████████| 83.8M/83.8M [00:00<00:00, 161MB/s]
Processing blog.csdn.net.json: 100%|██████████| 157M/157M [00:00<00:00, 176MB/s] 
Processing www.3dmgame.com.json: 100%|██████████| 570k/570k [00:00<00:00, 94.9MB/s]


CSV file created successfully.
