In [2]:
import pandas as pd
import numpy as np
from scapy.all import rdpcap, TCP
import os
import glob
from pathlib import Path
from datetime import datetime, timedelta
from tqdm import tqdm
import sys
import time

# Make the project's `src` directory importable so the local `config` module loads.
# NOTE(review): this is an absolute, machine-specific path — adjust it when the
# notebook is run on another machine.
SRC_DIR = r'C:\Users\Echo\Desktop\modbus-detection\src'
if SRC_DIR not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(SRC_DIR)
import config

print("✓ 库导入完成")
print(f"当前时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✓ 库导入完成
当前时间: 2026-01-08 09:50:48




In [3]:
# Quick sanity check: every configured dataset directory must exist on disk.
banner = "=" * 60
print(banner)
print("快速文件检查")
print(banner)

# Display name -> configured directory
checks = {
    'Benign PCAP': config.BENIGN_PCAP_DIR,
    'External PCAP': config.EXTERNAL_PCAP_DIR,
    'IED PCAP': config.IED_PCAP_DIR,
    'SCADA PCAP': config.SCADA_PCAP_DIR,
    'External Labels': config.EXTERNAL_LABELS_DIR,
    'IED Labels': config.IED_LABELS_DIR,
    'SCADA Labels': config.SCADA_LABELS_DIR
}

# Print one status line per directory and remember whether any was missing.
all_ok = True
for name, path in checks.items():
    found = os.path.exists(path)
    all_ok = all_ok and found
    status = "✓" if found else "✗"
    print(f"{status} {name}: {path}")

if not all_ok:
    print("\n✗ 有路径不存在，请检查config.py配置")
else:
    print("\n✓ 所有路径验证通过！")

快速文件检查
✓ Benign PCAP: C:\Users\Echo\Desktop\Modbus Dataset\Modbus Dataset\benign\network-wide-pcap-capture\network-wide
✓ External PCAP: C:\Users\Echo\Desktop\Modbus Dataset\Modbus Dataset\attack\external\external-attacker\external-attacker-network-capture
✓ IED PCAP: C:\Users\Echo\Desktop\Modbus Dataset\Modbus Dataset\attack\compromised-ied\ied1b\ied1b-network-captures
✓ SCADA PCAP: C:\Users\Echo\Desktop\Modbus Dataset\Modbus Dataset\attack\compromised-scada\substation-wide-capture
✓ External Labels: C:\Users\Echo\Desktop\Modbus Dataset\Modbus Dataset\attack\external\external-attacker\attacker logs
✓ IED Labels: C:\Users\Echo\Desktop\Modbus Dataset\Modbus Dataset\attack\compromised-ied\attack logs
✓ SCADA Labels: C:\Users\Echo\Desktop\Modbus Dataset\Modbus Dataset\attack\compromised-scada\attack logs

✓ 所有路径验证通过！


In [4]:
# Count the PCAP captures and CSV label files available for each scenario.
print("\n" + "=" * 60)
print("文件数量统计")
print("=" * 60)


def _find_csv_files(base_dir):
    """Return every file ending in '.csv' found by walking `base_dir` recursively."""
    return [
        os.path.join(root, filename)
        for root, _dirs, filenames in os.walk(base_dir)
        for filename in filenames
        if filename.endswith('.csv')
    ]


def _count_pcaps(pcap_dir):
    """Return the number of '*.pcap' files directly inside `pcap_dir`."""
    return len(glob.glob(os.path.join(pcap_dir, "*.pcap")))


# PCAP files
file_counts = {
    'benign_pcaps': _count_pcaps(config.BENIGN_PCAP_DIR),
    'external_pcaps': _count_pcaps(config.EXTERNAL_PCAP_DIR),
    'ied_pcaps': _count_pcaps(config.IED_PCAP_DIR),
    'scada_pcaps': _count_pcaps(config.SCADA_PCAP_DIR),
}

# CSV label files (the lists themselves are reused by later cells)
external_labels = _find_csv_files(config.EXTERNAL_LABELS_DIR)
file_counts['external_labels'] = len(external_labels)

ied_labels = _find_csv_files(config.IED_LABELS_DIR)
file_counts['ied_labels'] = len(ied_labels)

scada_labels = _find_csv_files(config.SCADA_LABELS_DIR)
file_counts['scada_labels'] = len(scada_labels)

print("\nPCAP文件:")
print(f"  Benign:            {file_counts['benign_pcaps']} 个")
print(f"  External Attack:   {file_counts['external_pcaps']} 个")
print(f"  Compromised-IED:   {file_counts['ied_pcaps']} 个")
print(f"  Compromised-SCADA: {file_counts['scada_pcaps']} 个")

print("\nCSV标签文件:")
print(f"  External Attack:   {file_counts['external_labels']} 个")
print(f"  Compromised-IED:   {file_counts['ied_labels']} 个")
print(f"  Compromised-SCADA: {file_counts['scada_labels']} 个")


文件数量统计

PCAP文件:
  Benign:            19 个
  External Attack:   1 个
  Compromised-IED:   6 个
  Compromised-SCADA: 18 个

CSV标签文件:
  External Attack:   6 个
  Compromised-IED:   7 个
  Compromised-SCADA: 21 个


In [4]:
# Count attack-label rows per scenario by summing the row counts of every label CSV.
print("\n" + "=" * 60)
print("攻击标签数量统计")
print("=" * 60)


def _count_label_rows(csv_files, desc):
    """Sum the number of rows over `csv_files`, showing a tqdm bar labelled `desc`.

    Files that cannot be read or parsed are skipped, but each one is reported
    after the progress bar finishes so an under-count is never silent.

    Returns:
        int: total number of rows across all readable files.
    """
    total_rows = 0
    failures = []
    for csv_file in tqdm(csv_files, desc=desc):
        try:
            total_rows += len(pd.read_csv(csv_file))
        except (OSError, UnicodeDecodeError,
                pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            # Best-effort: keep counting the remaining files, but remember the failure.
            failures.append((csv_file, exc))
    for csv_file, exc in failures:
        print(f"  ⚠️  跳过无法读取的文件: {csv_file} ({type(exc).__name__}: {exc})")
    return total_rows


print("\n正在统计External Attack标签...")
total_external = _count_label_rows(external_labels, "External")
print(f"✓ External Attack: {total_external:,} 条")

print("\n正在统计Compromised-IED标签...")
total_ied = _count_label_rows(ied_labels, "IED")
print(f"✓ Compromised-IED: {total_ied:,} 条")
if total_ied < 1000:
    # Fewer than 1000 rows is flagged as needing oversampling.
    print("  ⚠️  数据严重不足！需要SMOTE过采样")

print("\n正在统计Compromised-SCADA标签...")
total_scada = _count_label_rows(scada_labels, "SCADA")
print(f"✓ Compromised-SCADA: {total_scada:,} 条")

total_labels = total_external + total_ied + total_scada
print(f"\n总攻击标签数: {total_labels:,} 条")


攻击标签数量统计

正在统计External Attack标签...


External: 100%|██████████| 6/6 [00:00<00:00, 55.41it/s]


✓ External Attack: 131,207 条

正在统计Compromised-IED标签...


IED: 100%|██████████| 7/7 [00:00<00:00, 411.34it/s]


✓ Compromised-IED: 676 条
  ⚠️  数据严重不足！需要SMOTE过采样

正在统计Compromised-SCADA标签...


SCADA: 100%|██████████| 21/21 [00:01<00:00, 14.72it/s]


✓ Compromised-SCADA: 1,288,159 条

总攻击标签数: 1,420,042 条


In [5]:
# Benchmark scapy's rdpcap on a few benign captures and extrapolate the total parse time.
print("\n" + "=" * 60)
print("PCAP解析性能测试")
print("=" * 60)

# Pick 3 files to test; sorted so the same files are chosen on every run
# (glob returns files in arbitrary order).
test_files = sorted(glob.glob(os.path.join(config.BENIGN_PCAP_DIR, "*.pcap")))[:3]

total_packets = 0
total_time = 0.0

for test_file in test_files:
    file_size_mb = os.path.getsize(test_file) / (1024**2)
    print(f"\n测试: {os.path.basename(test_file)} ({file_size_mb:.1f} MB)")

    # perf_counter is monotonic, so the measurement is immune to wall-clock adjustments.
    start = time.perf_counter()
    packets = rdpcap(test_file)
    elapsed = time.perf_counter() - start

    packet_count = len(packets)
    # Release the parsed packets before loading the next file so that two full
    # captures are never held in memory at once.
    del packets

    total_packets += packet_count
    total_time += elapsed

    speed = packet_count / elapsed if elapsed > 0 else float('nan')
    print(f"  包数: {packet_count:,}")
    print(f"  耗时: {elapsed:.1f}秒")
    print(f"  速度: {speed:.0f} 包/秒")

# Total number of captures across all scenarios
total_pcap_count = (file_counts['benign_pcaps'] + file_counts['external_pcaps'] +
                   file_counts['ied_pcaps'] + file_counts['scada_pcaps'])

if test_files and total_time > 0:
    # Average speed over all benchmarked files
    avg_speed = total_packets / total_time
    # Extrapolate: (mean seconds per benchmarked file) x (number of captures).
    # This assumes all captures take about as long as the benchmarked ones.
    estimated_hours = (total_pcap_count * (total_time / len(test_files))) / 3600
else:
    # Nothing could be measured; keep the names defined for later cells
    # instead of failing with ZeroDivisionError.
    avg_speed = float('nan')
    estimated_hours = float('nan')
    print("\n⚠️  未找到可测试的PCAP文件，无法评估解析速度")

print(f"\n平均解析速度: {avg_speed:.0f} 包/秒")

print(f"\n总PCAP文件数: {total_pcap_count}")
print(f"预计总耗时: {estimated_hours:.1f} 小时")


PCAP解析性能测试

测试: network-wide-normal-14.pcap (95.4 MB)
  包数: 1,166,185
  耗时: 201.9秒
  速度: 5777 包/秒

测试: network-wide-normal-15.pcap (95.4 MB)
  包数: 1,161,111
  耗时: 278.2秒
  速度: 4173 包/秒

测试: network-wide-normal-16.pcap (95.4 MB)
  包数: 1,162,229
  耗时: 529.6秒
  速度: 2195 包/秒

平均解析速度: 3456 包/秒

总PCAP文件数: 45
预计总耗时: 4.2 小时


In [6]:
# Inspect the column layout of one label CSV per attack scenario.
print("\n" + "=" * 60)
print("CSV标签格式检查")
print("=" * 60)


def _preview_first_label_file(heading, label_files):
    """Print `heading`, then the columns and first 3 rows of `label_files[0]`.

    Returns the loaded DataFrame.
    """
    print(heading)
    sample = pd.read_csv(label_files[0])
    print(f"列: {list(sample.columns)}")
    print("示例数据:")
    print(sample.head(3))
    return sample


sample_ext = _preview_first_label_file("\n1. External Attack CSV:", external_labels)
sample_ied = _preview_first_label_file("\n2. Compromised-IED CSV:", ied_labels)
sample_scada = _preview_first_label_file("\n3. Compromised-SCADA CSV:", scada_labels)


CSV标签格式检查

1. External Attack CSV:
列: ['Timestamp', 'Attack']
示例数据:
                 Timestamp               Attack
0  2023-01-01 21:00:44.389  Recon. Range: 65535

2. Compromised-IED CSV:
列: ['Timestamp', 'TargetIP', 'Attack', 'TransactionID']
示例数据:
                 Timestamp     TargetIP                        Attack  \
0  2023-03-23 05:09:22.829  185.175.0.2  Baseline Replay: In position   
1  2023-03-23 05:30:24.984  185.175.0.2  Baseline Replay: In position   
2  2023-03-23 05:51:26.274  185.175.0.2  Baseline Replay: In position   

   TransactionID  
0              1  
1           1238  
2           2479  

3. Compromised-SCADA CSV:
列: ['Timestamp', 'TargetIP', 'Attack', 'TransactionID']
示例数据:
                 Timestamp     TargetIP  \
0  2023-03-12 16:12:27.761  185.175.0.4   
1  2023-03-12 16:12:27.773  185.175.0.4   
2  2023-03-12 16:12:27.774  185.175.0.4   

                                      Attack  TransactionID  
0  Brute force or specific coil. Address: 13           

In [8]:
# Build the per-scenario sampling plan table and persist it as CSV.
print("\n" + "=" * 60)
print("采样计划")
print("=" * 60)

# Target sample sizes. Both the table strings and the grand total derive from
# these constants so the two can never drift apart.
BENIGN_TARGET = 200_000
IED_TARGET = 5_000
SCADA_TARGET = 200_000
# External attack rows are all kept, so their target equals the raw count.
total_target = BENIGN_TARGET + total_external + IED_TARGET + SCADA_TARGET

sampling_plan = pd.DataFrame({
    '场景': ['Benign', 'External', 'IED', 'SCADA', '总计'],
    '原始数量': [
        '数百万包',
        f'{total_external:,}',
        f'{total_ied:,}',
        f'{total_scada:,}',
        f'{total_labels:,}'
    ],
    '采样策略': [
        '随机采样',
        '全部使用',
        '全部+SMOTE',
        '分层采样',
        '-'
    ],
    '目标数量': [
        f'{BENIGN_TARGET:,}',
        f'{total_external:,}',
        f'{IED_TARGET:,}',
        f'{SCADA_TARGET:,}',
        f'{total_target:,}'
    ]
})

print(sampling_plan)

# Save the plan next to the other processed artifacts
config.ensure_dirs()
sampling_plan.to_csv(config.DATA_PROCESSED / "sampling_plan.csv", index=False)
print("\n✓ 采样计划已保存")


采样计划
         场景       原始数量      采样策略     目标数量
0    Benign       数百万包      随机采样  200,000
1  External    131,207      全部使用  131,207
2       IED        676  全部+SMOTE    5,000
3     SCADA  1,288,159      分层采样  200,000
4        总计  1,420,042         -  536,207
✓ 所有目录已创建

✓ 采样计划已保存


In [10]:
# Stage-1 wrap-up: print a summary and pickle the collected values for later stages.
import pickle

banner = "=" * 60
print("\n" + banner)
print("阶段1总结")
print(banner)

summary_text = f"""
✓ 数据验证完成：
  - 标签数量确认:
    • External: {total_external:,}
    • IED: {total_ied:,} ⚠️
    • SCADA: {total_scada:,}
  
✓ 性能评估完成：
  - 解析速度: {avg_speed:.0f} 包/秒
  - 预计耗时: {estimated_hours:.1f} 小时
  
⏭️  下一步：阶段2 - 快速测试版本
完成时间: {datetime.now().strftime('%H:%M:%S')}
"""
print(summary_text)

# Everything later stages read back: label totals, benchmark figures,
# the label-file lists and the per-scenario file counts.
results = {
    'total_external': total_external,
    'total_ied': total_ied,
    'total_scada': total_scada,
    'avg_speed': avg_speed,
    'estimated_hours': estimated_hours,
    'external_labels': external_labels,
    'ied_labels': ied_labels,
    'scada_labels': scada_labels,
    'file_counts': file_counts
}

results_path = config.DATA_PROCESSED / 'stage1_results.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(results, f)

print("✓ 结果已保存，可用于后续阶段")


阶段1总结

✓ 数据验证完成：
  - 标签数量确认:
    • External: 131,207
    • IED: 676 ⚠️
    • SCADA: 1,288,159
  
✓ 性能评估完成：
  - 解析速度: 3456 包/秒
  - 预计耗时: 4.2 小时
  
⏭️  下一步：阶段2 - 快速测试版本
完成时间: 09:24:49

✓ 结果已保存，可用于后续阶段


In [11]:
# Re-save the stage-1 results because the configured paths changed.
import glob
import importlib
import os
import pickle

# Pick up the edited paths from config.py
importlib.reload(config)

# Re-collect the label-file lists under the (possibly new) directories
label_files = {
    scenario: [
        os.path.join(root, filename)
        for root, _dirs, filenames in os.walk(labels_dir)
        for filename in filenames
        if filename.endswith('.csv')
    ]
    for scenario, labels_dir in (
        ('external', config.EXTERNAL_LABELS_DIR),
        ('ied', config.IED_LABELS_DIR),
        ('scada', config.SCADA_LABELS_DIR),
    )
}
external_labels = label_files['external']
ied_labels = label_files['ied']
scada_labels = label_files['scada']

# Count the captures on disk rather than hard-coding them, so the pickle
# cannot go stale when the dataset layout changes.
pcap_counts = {
    scenario: len(glob.glob(os.path.join(pcap_dir, "*.pcap")))
    for scenario, pcap_dir in (
        ('benign', config.BENIGN_PCAP_DIR),
        ('external', config.EXTERNAL_PCAP_DIR),
        ('ied', config.IED_PCAP_DIR),
        ('scada', config.SCADA_PCAP_DIR),
    )
}

# Re-save. The label totals and benchmark figures are snapshots copied from the
# earlier run's printed output; they are not recomputed here.
results = {
    'total_external': 131207,
    'total_ied': 676,
    'total_scada': 1288159,
    'avg_speed': 3456,
    'estimated_hours': 4.2,
    'external_labels': external_labels,
    'ied_labels': ied_labels,
    'scada_labels': scada_labels,
    # Same seven keys as the first stage-1 save, so readers of the pickle
    # see one consistent schema.
    'file_counts': {
        'benign_pcaps': pcap_counts['benign'],
        'external_pcaps': pcap_counts['external'],
        'ied_pcaps': pcap_counts['ied'],
        'scada_pcaps': pcap_counts['scada'],
        'external_labels': len(external_labels),
        'ied_labels': len(ied_labels),
        'scada_labels': len(scada_labels)
    }
}

with open(config.DATA_PROCESSED / 'stage1_results.pkl', 'wb') as f:
    pickle.dump(results, f)

print("✓ Stage1结果已更新")

✓ Stage1结果已更新


In [5]:
import pickle
import importlib
import glob
import os

# Pick up the edited paths from config.py
importlib.reload(config)

print("重新统计文件数量...")

# Re-collect the CSV label-file lists
label_files = {
    scenario: [
        os.path.join(root, filename)
        for root, _dirs, filenames in os.walk(labels_dir)
        for filename in filenames
        if filename.endswith('.csv')
    ]
    for scenario, labels_dir in (
        ('external', config.EXTERNAL_LABELS_DIR),
        ('ied', config.IED_LABELS_DIR),
        ('scada', config.SCADA_LABELS_DIR),
    )
}
external_labels = label_files['external']
ied_labels = label_files['ied']
scada_labels = label_files['scada']

# Re-count the PCAP files from disk (never hard-code these)
benign_pcaps_count = len(glob.glob(os.path.join(config.BENIGN_PCAP_DIR, "*.pcap")))
external_pcaps_count = len(glob.glob(os.path.join(config.EXTERNAL_PCAP_DIR, "*.pcap")))
ied_pcaps_count = len(glob.glob(os.path.join(config.IED_PCAP_DIR, "*.pcap")))
scada_pcaps_count = len(glob.glob(os.path.join(config.SCADA_PCAP_DIR, "*.pcap")))

print(f"Benign PCAP: {benign_pcaps_count} 个")
print(f"External PCAP: {external_pcaps_count} 个")
print(f"IED PCAP: {ied_pcaps_count} 个")
print(f"SCADA PCAP: {scada_pcaps_count} 个")

print(f"\nExternal CSV: {len(external_labels)} 个")
print(f"IED CSV: {len(ied_labels)} 个")
print(f"SCADA CSV: {len(scada_labels)} 个")

# Re-save using the freshly counted numbers. The label totals and benchmark
# figures are snapshots copied from the earlier run's printed output; they are
# not recomputed here.
results = {
    'total_external': 131207,
    'total_ied': 676,
    'total_scada': 1288159,
    'avg_speed': 3456,
    'estimated_hours': 4.2,
    'external_labels': external_labels,
    'ied_labels': ied_labels,
    'scada_labels': scada_labels,
    # Same seven keys as the first stage-1 save, so readers of the pickle
    # see one consistent schema.
    'file_counts': {
        'benign_pcaps': benign_pcaps_count,
        'external_pcaps': external_pcaps_count,
        'ied_pcaps': ied_pcaps_count,
        'scada_pcaps': scada_pcaps_count,
        'external_labels': len(external_labels),
        'ied_labels': len(ied_labels),
        'scada_labels': len(scada_labels)
    }
}

with open(config.DATA_PROCESSED / 'stage1_results.pkl', 'wb') as f:
    pickle.dump(results, f)

print("\n✓ Stage1结果已更新")
print(f"已保存到: {config.DATA_PROCESSED / 'stage1_results.pkl'}")

重新统计文件数量...
Benign PCAP: 19 个
External PCAP: 1 个
IED PCAP: 6 个
SCADA PCAP: 18 个

External CSV: 6 个
IED CSV: 7 个
SCADA CSV: 21 个

✓ Stage1结果已更新
已保存到: C:\Users\Echo\Desktop\modbus-detection\data\processed\stage1_results.pkl


In [6]:
# Diagnose the clock offset between the External-attacker PCAP and its CSV labels.
print("=" * 60)
print("深度诊断：External PCAP vs CSV 时间戳")
print("=" * 60)

# 1. Read timestamps from the External PCAP
from scapy.all import rdpcap, TCP
from datetime import datetime, timedelta
import pandas as pd

# Resolve the capture under the configured directory instead of an absolute path.
external_pcap = os.path.join(config.EXTERNAL_PCAP_DIR, "veth665f3cf-0.pcap")

print("\n1. 读取PCAP时间戳（前10个Modbus包）:")
# Only the first 1000 packets are loaded; the scan below stops after 10 hits.
packets = rdpcap(external_pcap, count=1000)

pcap_times = []
skipped_packets = 0
for pkt in packets:
    # Keep only TCP traffic to/from port 502
    if TCP not in pkt or 502 not in (pkt[TCP].sport, pkt[TCP].dport):
        continue
    try:
        payload = bytes(pkt[TCP].payload)
        if len(payload) < 8:
            continue
        # NOTE(review): fromtimestamp() converts to the *local* timezone of the
        # machine running the notebook, so the offsets found below depend on it.
        ts_raw = datetime.fromtimestamp(float(pkt.time))
        ts_plus3 = ts_raw + timedelta(hours=3)
    except Exception:
        # Best-effort scan: skip the packet, but count it so failures are visible.
        # (Unlike a bare `except`, this does not swallow KeyboardInterrupt.)
        skipped_packets += 1
        continue
    pcap_times.append({
        'raw': ts_raw,
        'plus3': ts_plus3
    })
    if len(pcap_times) >= 10:
        break

if skipped_packets:
    print(f"  ⚠️  跳过 {skipped_packets} 个无法解析的包")

print("\nPCAP时间戳（前10个）:")
for i, t in enumerate(pcap_times[:10], 1):
    print(f"  {i}. 原始: {t['raw']}")
    print(f"      +3h:  {t['plus3']}")

# 2. Read the CSV label timestamps
import glob
external_csvs = []
for root, dirs, files in os.walk(config.EXTERNAL_LABELS_DIR):
    external_csvs.extend([os.path.join(root, f) for f in files if f.endswith('.csv')])

print(f"\n2. 读取CSV标签时间戳（从{len(external_csvs)}个文件）:")

all_csv_times = []
unparsed_timestamps = 0
for csv_file in external_csvs:
    df = pd.read_csv(csv_file)
    for ts in df['Timestamp']:
        # Parsed one value at a time so rows with differing formats are each
        # interpreted independently.
        try:
            parsed = pd.to_datetime(ts)
        except (ValueError, TypeError, OverflowError):
            unparsed_timestamps += 1
            continue
        if pd.isna(parsed):
            # Missing values parse to NaT, which would break sorting and .date() below.
            unparsed_timestamps += 1
            continue
        all_csv_times.append(parsed)

if unparsed_timestamps:
    print(f"  ⚠️  跳过 {unparsed_timestamps} 条无法解析的时间戳")

# Always defined (possibly empty) so later sections never read a stale value.
all_csv_times_sorted = sorted(all_csv_times)

# Show the CSV time range
if all_csv_times:
    print(f"\nCSV时间戳范围:")
    print(f"  最早: {all_csv_times_sorted[0]}")
    print(f"  最晚: {all_csv_times_sorted[-1]}")
    print(f"  总数: {len(all_csv_times)} 条")

    print(f"\n前10条CSV时间戳:")
    for i, t in enumerate(all_csv_times_sorted[:10], 1):
        print(f"  {i}. {t}")

# 3. Compute the time difference between the first packet and the first label
print("\n3. 时间差分析:")
if pcap_times and all_csv_times:
    pcap_first = pcap_times[0]['raw']
    pcap_first_plus3 = pcap_times[0]['plus3']
    csv_first = all_csv_times_sorted[0]

    diff_raw = abs((pcap_first - csv_first).total_seconds())
    diff_plus3 = abs((pcap_first_plus3 - csv_first).total_seconds())

    print(f"PCAP第一个包（原始）: {pcap_first}")
    print(f"PCAP第一个包（+3h）:  {pcap_first_plus3}")
    print(f"CSV第一条标签:        {csv_first}")
    print(f"\n时间差（原始）: {diff_raw:.1f} 秒 = {diff_raw/3600:.1f} 小时")
    print(f"时间差（+3h）:  {diff_plus3:.1f} 秒 = {diff_plus3/3600:.1f} 小时")

    # 4. Try a range of whole-hour offsets; flag any that lands within 60 seconds
    print("\n4. 尝试不同的时区转换:")
    for hours in [-3, -2, -1, 0, 1, 2, 3, 4, 5, 6]:
        adjusted = pcap_first + timedelta(hours=hours)
        diff = abs((adjusted - csv_first).total_seconds())
        marker = "  ✓✓✓" if diff < 60 else ""
        print(f"  {hours:+2d}h: {adjusted} → 时间差 {diff:.1f}秒 {marker}")

# 5. Check whether the PCAP and CSV cover any common calendar date
print("\n5. 日期匹配检查:")
if pcap_times and all_csv_times:
    pcap_dates = set([t['raw'].date() for t in pcap_times])
    csv_dates = set([t.date() for t in all_csv_times_sorted])

    print(f"PCAP日期: {sorted(pcap_dates)}")
    print(f"CSV日期:  {sorted(csv_dates)}")

    common_dates = pcap_dates & csv_dates
    if common_dates:
        print(f"✓ 有重叠日期: {sorted(common_dates)}")
    else:
        print(f"✗ 没有重叠日期！PCAP和CSV可能来自不同时间段")

深度诊断：External PCAP vs CSV 时间戳

1. 读取PCAP时间戳（前10个Modbus包）:

PCAP时间戳（前10个）:
  1. 原始: 2023-02-01 11:51:37.648697
      +3h:  2023-02-01 14:51:37.648697
  2. 原始: 2023-02-01 11:52:46.776096
      +3h:  2023-02-01 14:52:46.776096
  3. 原始: 2023-02-01 11:58:25.357685
      +3h:  2023-02-01 14:58:25.357685
  4. 原始: 2023-02-01 11:59:28.607726
      +3h:  2023-02-01 14:59:28.607726
  5. 原始: 2023-02-01 12:01:29.782177
      +3h:  2023-02-01 15:01:29.782177
  6. 原始: 2023-02-01 12:01:29.782251
      +3h:  2023-02-01 15:01:29.782251
  7. 原始: 2023-02-01 12:01:29.782330
      +3h:  2023-02-01 15:01:29.782330
  8. 原始: 2023-02-01 12:01:29.782382
      +3h:  2023-02-01 15:01:29.782382
  9. 原始: 2023-02-01 12:01:29.782451
      +3h:  2023-02-01 15:01:29.782451
  10. 原始: 2023-02-01 12:01:29.782499
      +3h:  2023-02-01 15:01:29.782499

2. 读取CSV标签时间戳（从6个文件）:

CSV时间戳范围:
  最早: 2022-12-29 06:06:21.239000
  最晚: 2023-02-01 13:22:13.415000
  总数: 131207 条

前10条CSV时间戳:
  1. 2022-12-29 06:06:21.239000
  2. 2022-12-30

In [None]:
# Compare PCAP Modbus packets and CSV labels for a single day (2023-02-01),
# testing several timezone shifts and matching windows.
print("=" * 60)
print("精确对比：2023-02-01 的 PCAP vs CSV")
print("=" * 60)

from datetime import date

import numpy as np


def _count_labels_near_packets(label_ns, packet_ns_sorted, window_seconds):
    """Count labels that have at least one packet within ±`window_seconds`.

    Args:
        label_ns: int64 array of label timestamps in nanoseconds.
        packet_ns_sorted: ascending int64 array of packet timestamps in nanoseconds.
        window_seconds: inclusive half-width of the matching window, in seconds.

    Returns:
        int: number of labels whose nearest packet lies inside the window.
    """
    if packet_ns_sorted.size == 0 or label_ns.size == 0:
        return 0
    # For each label, the nearest packet is one of the two neighbours of its
    # insertion point in the sorted packet array.
    insert_at = np.searchsorted(packet_ns_sorted, label_ns)
    last = packet_ns_sorted.size - 1
    after = packet_ns_sorted[np.minimum(insert_at, last)]
    before = packet_ns_sorted[np.maximum(insert_at - 1, 0)]
    nearest_gap = np.minimum(np.abs(after - label_ns), np.abs(label_ns - before))
    return int(np.count_nonzero(nearest_gap <= window_seconds * 10**9))


# 1. Extract the PCAP timestamps of every Modbus packet on Feb 1st
target_date = date(2023, 2, 1)

print("\n1. 提取2月1日的所有PCAP Modbus包...")
# Resolve the capture under the configured directory instead of an absolute path.
external_pcap = os.path.join(config.EXTERNAL_PCAP_DIR, "veth665f3cf-0.pcap")

packets = rdpcap(external_pcap)
print(f"总包数: {len(packets)}")

pcap_times_0201 = []
skipped_packets = 0
for pkt in packets:
    # Keep only TCP traffic to/from port 502
    if TCP not in pkt or 502 not in (pkt[TCP].sport, pkt[TCP].dport):
        continue
    try:
        payload = bytes(pkt[TCP].payload)
        if len(payload) < 8:
            continue
        # NOTE(review): fromtimestamp() yields local time of the machine running this.
        ts_raw = datetime.fromtimestamp(float(pkt.time))

        # Keep Feb 1st only
        if ts_raw.date() != target_date:
            continue
        pcap_times_0201.append({
            'timestamp': ts_raw,
            'timestamp_utc': ts_raw + timedelta(hours=3),
            # The first two payload bytes are read as a big-endian transaction id
            'txid': int.from_bytes(payload[0:2], byteorder='big'),
            'dst_ip': pkt['IP'].dst
        })
    except Exception:
        # Best-effort: e.g. packets without an IP layer are dropped, but counted.
        # (Unlike a bare `except`, this does not swallow KeyboardInterrupt.)
        skipped_packets += 1

print(f"✓ 2月1日的Modbus包: {len(pcap_times_0201)} 个")
if skipped_packets:
    print(f"  ⚠️  跳过 {skipped_packets} 个无法解析的包")

if pcap_times_0201:
    times_sorted = sorted([t['timestamp'] for t in pcap_times_0201])
    print(f"\nPCAP时间范围（原始）:")
    print(f"  最早: {times_sorted[0]}")
    print(f"  最晚: {times_sorted[-1]}")

    times_sorted_utc = sorted([t['timestamp_utc'] for t in pcap_times_0201])
    print(f"\nPCAP时间范围（+3h）:")
    print(f"  最早: {times_sorted_utc[0]}")
    print(f"  最晚: {times_sorted_utc[-1]}")

# 2. Extract the CSV labels of Feb 1st
# NOTE: `external_csvs` comes from the preceding diagnostic cell.
print("\n2. 提取2月1日的CSV标签...")

csv_0201 = []
for csv_file in external_csvs:
    df = pd.read_csv(csv_file)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df = df.dropna(subset=['Timestamp'])

    # Keep Feb 1st only
    df_0201 = df[df['Timestamp'].dt.date == target_date]

    if len(df_0201) > 0:
        print(f"  文件: {os.path.basename(csv_file)} - {len(df_0201)} 条")
        csv_0201.append(df_0201)

if csv_0201:
    df_csv_0201 = pd.concat(csv_0201, ignore_index=True)
    print(f"\n✓ 2月1日的CSV标签: {len(df_csv_0201)} 条")

    print(f"\nCSV时间范围:")
    print(f"  最早: {df_csv_0201['Timestamp'].min()}")
    print(f"  最晚: {df_csv_0201['Timestamp'].max()}")

    if pcap_times_0201:
        # 3. Exact matching test
        print("\n" + "=" * 60)
        print("3. 精确匹配测试（2月1日数据）")
        print("=" * 60)

        # Build the PCAP DataFrame
        df_pcap_0201 = pd.DataFrame(pcap_times_0201)

        # Work on integer nanoseconds: sort the packets once, then answer each
        # (shift, window) configuration with a binary search per label instead
        # of comparing every label against every packet.
        packet_ns = np.sort(
            df_pcap_0201['timestamp'].to_numpy(dtype='datetime64[ns]').astype('int64')
        )
        label_ns = df_csv_0201['Timestamp'].to_numpy(dtype='datetime64[ns]').astype('int64')
        ns_per_hour = 3600 * 10**9

        # Try each combination of timezone shift and time window
        print("\n测试不同配置:")

        for tz_hours in [-3, 0, 3]:
            shifted_packet_ns = packet_ns + tz_hours * ns_per_hour
            for time_window in [1, 5, 10, 30]:
                matched = _count_labels_near_packets(label_ns, shifted_packet_ns, time_window)

                match_rate = matched / len(df_csv_0201) * 100
                marker = "  ✓✓✓" if match_rate > 10 else ""
                print(f"  时区{tz_hours:+2d}h, 窗口±{time_window:2d}s: {matched:5d}/{len(df_csv_0201)} = {match_rate:5.1f}%{marker}")

        # 4. Show concrete examples: the 3 closest packets for each of the first 5 labels
        print("\n" + "=" * 60)
        print("4. 查看具体的时间戳对比（前5条CSV）")
        print("=" * 60)

        for i, (_, label) in enumerate(df_csv_0201.head(5).iterrows(), 1):
            print(f"\nCSV标签 #{i}:")
            print(f"  时间: {label['Timestamp']}")
            print(f"  攻击: {label['Attack']}")

            # Once against the raw packet time, once against the +3h shifted time
            for title, column in (("原始时间", 'timestamp'), ("+3h时间", 'timestamp_utc')):
                df_pcap_test = df_pcap_0201.copy()
                df_pcap_test['time_diff'] = abs((df_pcap_test[column] - label['Timestamp']).dt.total_seconds())
                closest = df_pcap_test.nsmallest(3, 'time_diff')

                print(f"  最接近的PCAP包（{title}）:")
                for j, (_, p) in enumerate(closest.iterrows(), 1):
                    print(f"    {j}. {p[column]} (差{p['time_diff']:.1f}秒)")
    else:
        # Without packets there is nothing to match against; skip rather than
        # fail on the missing 'timestamp' column of an empty frame.
        print("\n✗ 没有2月1日的PCAP Modbus包，跳过匹配测试")

else:
    print("✗ 没有2月1日的CSV标签")

精确对比：2023-02-01 的 PCAP vs CSV

1. 提取2月1日的所有PCAP Modbus包...
