In [1]:
import os
import glob
from pathlib import Path
import pandas as pd
import sys

sys.path.append(r'C:\Users\Echo\Desktop\modbus-detection\src')
import config

# Check 1: inventory every PCAP and CSV file under the dataset root and
# summarise them per directory.
print("=" * 60)
print("排查1：数据集目录结构分析")
print("=" * 60)

dataset_root = config.DATASET_ROOT

# Column layouts are declared up front so the DataFrames keep their columns
# (and the groupby calls below keep working) even when the scan finds no
# files of a given type. pd.DataFrame([]) would otherwise have no columns and
# groupby('directory') would raise KeyError.
PCAP_COLUMNS = ['file', 'path', 'directory', 'size_mb']
CSV_COLUMNS = ['file', 'path', 'directory']

# Recursively collect all PCAP and CSV files
all_files = {
    'pcap': [],
    'csv': []
}

for root, dirs, files in os.walk(dataset_root):
    # os.walk yields entries in filesystem order; sorting in place makes the
    # traversal, and therefore the row order of the frames, reproducible.
    dirs.sort()
    for file in sorted(files):
        full_path = os.path.join(root, file)
        rel_path = os.path.relpath(full_path, dataset_root)

        if file.endswith('.pcap'):
            all_files['pcap'].append({
                'file': file,
                'path': rel_path,
                'directory': os.path.dirname(rel_path),
                # bytes -> MiB
                'size_mb': os.path.getsize(full_path) / (1024**2)
            })
        elif file.endswith('.csv'):
            all_files['csv'].append({
                'file': file,
                'path': rel_path,
                'directory': os.path.dirname(rel_path)
            })

print(f"\n总PCAP文件数: {len(all_files['pcap'])}")
print(f"总CSV文件数: {len(all_files['csv'])}")

# Build DataFrames for analysis; later cells reuse df_pcap and df_csv
df_pcap = pd.DataFrame(all_files['pcap'], columns=PCAP_COLUMNS)
df_csv = pd.DataFrame(all_files['csv'], columns=CSV_COLUMNS)

print("\n" + "=" * 60)
print("PCAP文件按目录分组:")
print("=" * 60)
# Per-directory file count and cumulative size
pcap_by_dir = df_pcap.groupby('directory').agg({
    'file': 'count',
    'size_mb': 'sum'
}).rename(columns={'file': 'count', 'size_mb': 'total_size_mb'})
print(pcap_by_dir.to_string())

print("\n" + "=" * 60)
print("CSV文件按目录分组:")
print("=" * 60)
# Per-directory CSV count
csv_by_dir = df_csv.groupby('directory')['file'].count()
print(csv_by_dir.to_string())

排查1：数据集目录结构分析

总PCAP文件数: 170
总CSV文件数: 34

PCAP文件按目录分组:
                                                                       count  total_size_mb
directory                                                                                  
attack\compromised-ied\central-agent\central-agent-network-captures        5     459.408956
attack\compromised-ied\ied1a\ied1a-network-captures                        6     516.835071
attack\compromised-ied\ied1b\ied1b-network-captures                        6     495.868903
attack\compromised-ied\ied4c\ied4c-network-captures                        6     517.878103
attack\compromised-ied\trust-scada-hmi\trust-scada-network-captures       20    1902.950409
attack\compromised-scada\central-agent\central-agent-network-captures      2     123.989984
attack\compromised-scada\ied1a\ied1a-network-captures                      6     520.922754
attack\compromised-scada\ied1b\ied1b-network-captures                      8     680.314793
attack\compromised-scada\

In [2]:
# Check 2: flag captures whose file name contains "normal" even though they
# are stored under the External attack directory.
banner = "=" * 60
print("\n" + banner)
print("排查2：检查文件命名异常")
print(banner)

# PCAP files whose directory path mentions "external" (case-insensitive)
in_external = df_pcap['directory'].str.contains('external', case=False, na=False)
external_pcaps = df_pcap[in_external]

print(f"\nExternal目录下的PCAP文件 ({len(external_pcaps)} 个):")
print("-" * 60)

suspicious_files = []

for _, pcap in external_pcaps.iterrows():
    name, location = pcap['file'], pcap['path']

    # A "normal" keyword is expected on benign captures, not under External
    flagged = 'normal' in name.lower()

    print(f"{'⚠️ 可疑' if flagged else '✓'} {name}")
    print(f"   路径: {location}")
    print(f"   大小: {pcap['size_mb']:.1f} MB")

    if flagged:
        suspicious_files.append({
            'file': name,
            'path': location,
            'reason': '文件名包含"normal"但在External目录'
        })
    print()

# Recap of everything flagged above (or an all-clear message)
if not suspicious_files:
    print("\n✓ 未发现文件组织异常")
else:
    print("\n" + banner)
    print(f"⚠️  发现 {len(suspicious_files)} 个可疑文件:")
    print(banner)
    for entry in suspicious_files:
        print(f"文件: {entry['file']}")
        print(f"路径: {entry['path']}")
        print(f"问题: {entry['reason']}")
        print()


排查2：检查文件命名异常

External目录下的PCAP文件 (8 个):
------------------------------------------------------------
✓ veth460b141-0.pcap
   路径: attack\external\central-agent\veth460b141-0.pcap
   大小: 2.6 MB

✓ veth665f3cf-0.pcap
   路径: attack\external\external-attacker\external-attacker-network-capture\veth665f3cf-0.pcap
   大小: 43.9 MB

✓ veth4edc015-0.pcap
   路径: attack\external\ied1a\veth4edc015-0.pcap
   大小: 77.1 MB

✓ vethd9e14c0-0.pcap
   路径: attack\external\ied1b\vethd9e14c0-0.pcap
   大小: 29.7 MB

✓ veth8bc3408-0.pcap
   路径: attack\external\ied4c\veth8bc3408-0.pcap
   大小: 33.2 MB

⚠️ 可疑 network-wide-normal-0.pcap
   路径: attack\external\network-wide\network-wide-normal-0.pcap
   大小: 95.4 MB

⚠️ 可疑 network-wide-normal-1.pcap
   路径: attack\external\network-wide\network-wide-normal-1.pcap
   大小: 44.6 MB

✓ veth5bbeaa2-0.pcap
   路径: attack\external\scada-hmi\veth5bbeaa2-0.pcap
   大小: 93.6 MB


⚠️  发现 2 个可疑文件:
文件: network-wide-normal-0.pcap
路径: attack\external\network-wide\network-wide-normal-0.pcap

In [3]:
# Check 3: bucket every PCAP by its file-name prefix and report how
# consistent the naming is.
print("\n" + "=" * 60)
print("排查3：文件命名一致性分析")
print("=" * 60)

# Not used in this cell; kept because later cells may rely on `re` being
# imported here -- TODO confirm and move to the import cell.
import re

# (file-name prefix, bucket label) pairs, tested in this order.
# Anything that matches none of them goes to the '其他' bucket.
prefix_buckets = [
    ('veth', 'veth开头'),
    ('substation', 'substation开头'),
    ('network-wide', 'network-wide开头'),
    ('ied', 'ied开头'),
]

naming_patterns = {label: [] for _, label in prefix_buckets}
naming_patterns['其他'] = []

for _, row in df_pcap.iterrows():
    name = row['file']
    for prefix, label in prefix_buckets:
        if name.startswith(prefix):
            naming_patterns[label].append(row)
            break
    else:
        # no known prefix matched
        naming_patterns['其他'].append(row)

print("\n文件命名模式统计:")
print("-" * 60)
for pattern, matched in naming_patterns.items():
    if not matched:
        continue
    print(f"\n{pattern}: {len(matched)} 个文件")
    # Show at most three examples per bucket
    for sample in matched[:3]:
        print(f"  - {sample['file']} ({sample['directory']})")
    remaining = len(matched) - 3
    if remaining > 0:
        print(f"  ... 还有 {remaining} 个")

print("\n" + "=" * 60)
print("命名一致性评估:")
print("=" * 60)

# Naming-disorder rating is driven solely by the size of the '其他' bucket:
# more than 5 -> high, 1..5 -> medium, 0 -> low.
other_count = len(naming_patterns['其他'])
if other_count > 5:
    disorder_level = '高'
elif other_count > 0:
    disorder_level = '中'
else:
    disorder_level = '低'

print(f"""
观察结果:
1. veth开头 ({len(naming_patterns['veth开头'])} 个): Docker虚拟网卡命名，NIC捕获
2. substation开头 ({len(naming_patterns['substation开头'])} 个): 全网捕获
3. network-wide开头 ({len(naming_patterns['network-wide开头'])} 个): 全网捕获
4. 其他 ({other_count} 个): 需要检查

✓ 命名混乱程度: {disorder_level}
""")


排查3：文件命名一致性分析

文件命名模式统计:
------------------------------------------------------------

veth开头: 131 个文件
  - vethffb308b-0.pcap (attack\compromised-ied\central-agent\central-agent-network-captures)
  - vethffb308b-1.pcap (attack\compromised-ied\central-agent\central-agent-network-captures)
  - vethffb308b-2.pcap (attack\compromised-ied\central-agent\central-agent-network-captures)
  ... 还有 128 个

substation开头: 18 个文件
  - substation-0.pcap (attack\compromised-scada\substation-wide-capture)
  - substation-1.pcap (attack\compromised-scada\substation-wide-capture)
  - substation-10.pcap (attack\compromised-scada\substation-wide-capture)
  ... 还有 15 个

network-wide开头: 21 个文件
  - network-wide-normal-0.pcap (attack\external\network-wide)
  - network-wide-normal-1.pcap (attack\external\network-wide)
  - network-wide-normal-14.pcap (benign\network-wide-pcap-capture\network-wide)
  ... 还有 18 个

命名一致性评估:

观察结果:
1. veth开头 (131 个): Docker虚拟网卡命名，NIC捕获
2. substation开头 (18 个): 全网捕获
3. network-wide开头 (2

In [4]:
# Check 4: load one label CSV per attack scenario and compare their column
# structure against the columns the label-matching step needs.
print("\n" + "=" * 60)
print("排查4：CSV标签质量对比分析")
print("=" * 60)


def _find_label_csvs(labels_dir):
    """Return every CSV found recursively under ``labels_dir``, sorted by path.

    glob returns matches in arbitrary order, so the list is sorted to make
    "the first file" (used below as the scenario sample) the same on every run.
    """
    return sorted(glob.glob(os.path.join(labels_dir, "**", "*.csv"), recursive=True))


# Columns every label file is checked for
required_cols = ['Timestamp', 'TargetIP', 'TransactionID', 'Attack']

# Candidate label files per scenario
external_csv_path = _find_label_csvs(config.EXTERNAL_LABELS_DIR)
ied_csv_path = _find_label_csvs(config.IED_LABELS_DIR)
scada_csv_path = _find_label_csvs(config.SCADA_LABELS_DIR)

# One sample DataFrame per scenario; scenarios without any CSV are skipped
csv_samples = {}
for scenario, candidates in (
    ('External', external_csv_path),
    ('IED', ied_csv_path),
    ('SCADA', scada_csv_path),
):
    if candidates:
        csv_samples[scenario] = pd.read_csv(candidates[0])

# Compare column structure
print("\nCSV标签列结构对比:")
print("-" * 60)

comparison = []

for scenario, df in csv_samples.items():
    summary = {
        '场景': scenario,
        '列数': len(df.columns),
        # str() guards against non-string column labels
        '列名': ', '.join(map(str, df.columns)),
    }
    # One presence flag per required column, in required_cols order
    for col in required_cols:
        summary[f'有{col}'] = col in df.columns
    summary['行数'] = len(df)
    comparison.append(summary)

df_comparison = pd.DataFrame(comparison)
print(df_comparison.to_string(index=False))

print("\n" + "=" * 60)
print("标签质量评估:")
print("=" * 60)

for scenario, df in csv_samples.items():
    print(f"\n{scenario}:")

    # Required-column check
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        print(f"  ⚠️  缺少列: {', '.join(missing_cols)}")
    else:
        print(f"  ✓ 所有必需列完整")

    # Show a few example rows
    print(f"  前3行示例:")
    print(df.head(3).to_string(index=False))


排查4：CSV标签质量对比分析

CSV标签列结构对比:
------------------------------------------------------------
      场景  列数                                         列名  有Timestamp  有TargetIP  有TransactionID  有Attack     行数
External   2                          Timestamp, Attack        True      False           False     True      1
     IED   4 Timestamp, TargetIP, Attack, TransactionID        True       True            True     True     66
   SCADA   4 Timestamp, TargetIP, Attack, TransactionID        True       True            True     True 107770

标签质量评估:

External:
  ⚠️  缺少列: TargetIP, TransactionID
  前3行示例:
              Timestamp              Attack
2023-01-01 21:00:44.389 Recon. Range: 65535

IED:
  ✓ 所有必需列完整
  前3行示例:
              Timestamp    TargetIP                       Attack  TransactionID
2023-03-23 05:09:22.829 185.175.0.2 Baseline Replay: In position              1
2023-03-23 05:30:24.984 185.175.0.2 Baseline Replay: In position           1238
2023-03-23 05:51:26.274 185.175.0.2 Baseline R

In [5]:
# Check 5: compare the IP addresses seen in a benign capture against the
# documented IP -> device mapping.
print("\n" + "=" * 60)
print("排查5：IP地址和设备映射验证")
print("=" * 60)

# IP mapping taken from the official dataset documentation
official_ip_mapping = {
    '185.175.0.4': 'IED1A (Secure)',
    '185.175.0.8': 'IED4C (Secure)',
    '185.175.0.5': 'IED1B (Normal)',
    '185.175.0.2': 'SCADA HMI (Secure)',
    '185.175.0.3': 'SCADA HMI (Normal)',
    '185.175.0.6': 'Central Agent',
    '185.175.0.7': 'Attacker'
}

print("\n官方IP映射:")
print("-" * 60)
for ip, device in official_ip_mapping.items():
    print(f"  {ip} → {device}")

# Extract the IPs that actually occur in a PCAP
print("\n从PCAP中验证IP地址...")
from scapy.all import rdpcap, IP

# Upper bound on the number of packets read from the sample capture
SAMPLE_PACKET_COUNT = 10000

# Use the first benign capture (sorted by path, because glob order is
# arbitrary) and fail with a clear message when the directory has none,
# instead of an opaque IndexError.
benign_pcaps = sorted(glob.glob(os.path.join(config.BENIGN_PCAP_DIR, "*.pcap")))
if not benign_pcaps:
    raise FileNotFoundError(
        f"未在 {config.BENIGN_PCAP_DIR} 中找到任何 .pcap 文件，无法验证IP映射"
    )
test_pcap = benign_pcaps[0]
print(f"测试文件: {os.path.basename(test_pcap)}")

packets = rdpcap(test_pcap, count=SAMPLE_PACKET_COUNT)

# Collect every source and destination address of packets with an IP layer
observed_ips = set()
for pkt in packets:
    if IP in pkt:
        observed_ips.add(pkt[IP].src)
        observed_ips.add(pkt[IP].dst)

print(f"\n观察到的IP地址 ({len(observed_ips)} 个):")
print("-" * 60)

for ip in sorted(observed_ips):
    device = official_ip_mapping.get(ip, "❓ 未知设备")
    marker = "✓" if ip in official_ip_mapping else "⚠️"
    print(f"{marker} {ip:15s} → {device}")

# Addresses not covered by the official mapping (reused by the final report cell)
unknown_ips = observed_ips - set(official_ip_mapping.keys())
if unknown_ips:
    print(f"\n⚠️  发现 {len(unknown_ips)} 个未记录的IP地址:")
    # sorted() keeps the listing stable; set iteration order is not
    for ip in sorted(unknown_ips):
        print(f"  - {ip}")
else:
    print("\n✓ 所有IP地址都在官方映射中")


排查5：IP地址和设备映射验证

官方IP映射:
------------------------------------------------------------
  185.175.0.4 → IED1A (Secure)
  185.175.0.8 → IED4C (Secure)
  185.175.0.5 → IED1B (Normal)
  185.175.0.2 → SCADA HMI (Secure)
  185.175.0.3 → SCADA HMI (Normal)
  185.175.0.6 → Central Agent
  185.175.0.7 → Attacker

从PCAP中验证IP地址...




测试文件: network-wide-normal-14.pcap

观察到的IP地址 (5 个):
------------------------------------------------------------
✓ 185.175.0.3     → SCADA HMI (Normal)
✓ 185.175.0.4     → IED1A (Secure)
✓ 185.175.0.5     → IED1B (Normal)
✓ 185.175.0.6     → Central Agent
✓ 185.175.0.8     → IED4C (Secure)

✓ 所有IP地址都在官方映射中


In [6]:
# Cross-check: which TargetIP values do the sampled label CSVs contain, and
# which documented device does each one map to?
print("\n" + "=" * 60)
print("从CSV标签验证TargetIP")
print("=" * 60)

# scenario -> TargetIP frequency table (value_counts Series)
target_ips = {}

for scenario, labels in csv_samples.items():
    # Scenarios whose labels carry no TargetIP column are only reported
    if 'TargetIP' not in labels.columns:
        print(f"\n{scenario}: ⚠️  没有TargetIP列")
        continue

    ip_counts = labels['TargetIP'].value_counts()
    target_ips[scenario] = ip_counts

    print(f"\n{scenario} CSV中的TargetIP分布:")
    print("-" * 60)
    for address, hits in ip_counts.items():
        device_name = official_ip_mapping.get(address, "❓ 未知")
        print(f"  {address} → {device_name:25s} ({hits:,} 次)")


从CSV标签验证TargetIP

External: ⚠️  没有TargetIP列

IED CSV中的TargetIP分布:
------------------------------------------------------------
  185.175.0.2 → SCADA HMI (Secure)        (66 次)

SCADA CSV中的TargetIP分布:
------------------------------------------------------------
  185.175.0.8 → IED4C (Secure)            (49,644 次)
  185.175.0.4 → IED1A (Secure)            (33,192 次)
  185.175.0.5 → IED1B (Normal)            (24,934 次)


In [7]:
# Final cell: turn the findings of the previous checks into a list of issues,
# print them, and save the report as JSON.
print("\n" + "=" * 60)
print("综合诊断报告")
print("=" * 60)

issues = []

# Issue 1: file organisation ("normal"-named captures under External)
if suspicious_files:
    issues.append({
        'severity': '高',
        'category': '文件组织',
        'issue': f'发现 {len(suspicious_files)} 个Benign文件在External目录',
        'impact': 'External PCAP可能包含正常流量，需要过滤',
        'solution': '使用IP地址过滤，只保留src_ip=185.175.0.7的包'
    })

# Issue 2: inconsistent naming (files matching none of the known prefixes)
if len(naming_patterns['其他']) > 0:
    issues.append({
        'severity': '中',
        'category': '命名规范',
        'issue': f'发现 {len(naming_patterns["其他"])} 个非标准命名的文件',
        'impact': '文件识别困难，需要手动检查',
        'solution': '建立文件名→捕获类型的映射表'
    })

# Issue 3: incomplete External labels
if 'External' in csv_samples:
    ext_cols = csv_samples['External'].columns.tolist()
    # Name only the columns that are really absent, so the message stays
    # accurate when just one of the two is missing.
    missing_ext_cols = [col for col in ('TargetIP', 'TransactionID') if col not in ext_cols]
    if missing_ext_cols:
        issues.append({
            'severity': '高',
            'category': 'CSV标签质量',
            'issue': f"External的CSV缺少{'和'.join(missing_ext_cols)}列",
            'impact': '无法使用三元组匹配，只能用时间窗口匹配',
            'solution': '采用IP识别+宽松时间窗口（10秒）'
        })

# Issue 4: IPs observed in the capture but absent from the official mapping
if unknown_ips:
    issues.append({
        'severity': '中',
        'category': 'IP映射',
        'issue': f'发现 {len(unknown_ips)} 个未记录的IP地址',
        'impact': '可能是外部网络地址或网关，需要识别',
        'solution': '调查这些IP的角色，更新映射表'
    })

if not issues:
    print("\n✅ 未发现严重问题，数据集质量良好")
else:
    print(f"\n⚠️  发现 {len(issues)} 个问题:")
    print("\n")

    for i, issue in enumerate(issues, 1):
        print(f"问题 {i}:")
        print(f"  严重程度: {issue['severity']}")
        print(f"  类别: {issue['category']}")
        print(f"  问题描述: {issue['issue']}")
        print(f"  影响: {issue['impact']}")
        print(f"  解决方案: {issue['solution']}")
        print()

# Save the diagnostic report
import json

report = {
    'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_pcap_files': len(df_pcap),
    'total_csv_files': len(df_csv),
    'suspicious_files': suspicious_files,
    'issues': issues
}

report_path = config.DATA_PROCESSED / "dataset_quality_report.json"
# Create the output directory if needed so a fresh checkout can write the
# report without a manual mkdir.
Path(report_path).parent.mkdir(parents=True, exist_ok=True)
# ensure_ascii=False keeps the Chinese text readable in the JSON file
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

print(f"✓ 诊断报告已保存: {report_path}")


综合诊断报告

⚠️  发现 2 个问题:


问题 1:
  严重程度: 高
  类别: 文件组织
  问题描述: 发现 2 个Benign文件在External目录
  影响: External PCAP可能包含正常流量，需要过滤
  解决方案: 使用IP地址过滤，只保留src_ip=185.175.0.7的包

问题 2:
  严重程度: 高
  类别: CSV标签质量
  问题描述: External的CSV缺少TargetIP和TransactionID列
  影响: 无法使用三元组匹配，只能用时间窗口匹配
  解决方案: 采用IP识别+宽松时间窗口（10秒）

✓ 诊断报告已保存: C:\Users\Echo\Desktop\modbus-detection\data\processed\dataset_quality_report.json
