In [1]:
import os
import glob
from pathlib import Path
import pandas as pd
import sys

sys.path.append(r'C:\Users\Echo\Desktop\modbus-detection\src')
import config

print("=" * 60)
print("Investigation 1: Dataset Directory Structure Analysis")
print("=" * 60)

dataset_root = config.DATASET_ROOT

# os.walk silently yields nothing for a missing directory, which would make the
# whole report look like "0 files" instead of pointing at the real problem.
if not os.path.isdir(dataset_root):
    raise FileNotFoundError(f"Dataset root is not a directory: {dataset_root}")

# Recursively scan all PCAP and CSV files.
# Each entry is a dict; PCAP entries additionally carry the file size in MiB.
all_files = {
    'pcap': [],
    'csv': []
}

for root, dirs, files in os.walk(dataset_root):
    for file in files:
        full_path = os.path.join(root, file)
        # Paths are stored relative to the dataset root so the report is
        # independent of where the dataset lives on disk.
        rel_path = os.path.relpath(full_path, dataset_root)

        if file.endswith('.pcap'):
            all_files['pcap'].append({
                'file': file,
                'path': rel_path,
                'directory': os.path.dirname(rel_path),
                'size_mb': os.path.getsize(full_path) / (1024**2)
            })
        elif file.endswith('.csv'):
            all_files['csv'].append({
                'file': file,
                'path': rel_path,
                'directory': os.path.dirname(rel_path)
            })

print(f"\nTotal PCAP files: {len(all_files['pcap'])}")
print(f"Total CSV files: {len(all_files['csv'])}")

# Create DataFrames for analysis.
# The columns are given explicitly so the frames keep their schema even when a
# list is empty; without this, groupby('directory') below raises KeyError.
df_pcap = pd.DataFrame(all_files['pcap'], columns=['file', 'path', 'directory', 'size_mb'])
df_csv = pd.DataFrame(all_files['csv'], columns=['file', 'path', 'directory'])

print("\n" + "=" * 60)
print("PCAP files grouped by directory:")
print("=" * 60)
# Per directory: number of captures and their combined size.
pcap_by_dir = df_pcap.groupby('directory').agg({
    'file': 'count',
    'size_mb': 'sum'
}).rename(columns={'file': 'count', 'size_mb': 'total_size_mb'})
print(pcap_by_dir.to_string())

print("\n" + "=" * 60)
print("CSV files grouped by directory:")
print("=" * 60)
csv_by_dir = df_csv.groupby('directory')['file'].count()
print(csv_by_dir.to_string())

排查1：数据集目录结构分析

总PCAP文件数: 170
总CSV文件数: 34

PCAP文件按目录分组:
                                                                       count  total_size_mb
directory                                                                                  
attack\compromised-ied\central-agent\central-agent-network-captures        5     459.408956
attack\compromised-ied\ied1a\ied1a-network-captures                        6     516.835071
attack\compromised-ied\ied1b\ied1b-network-captures                        6     495.868903
attack\compromised-ied\ied4c\ied4c-network-captures                        6     517.878103
attack\compromised-ied\trust-scada-hmi\trust-scada-network-captures       20    1902.950409
attack\compromised-scada\central-agent\central-agent-network-captures      2     123.989984
attack\compromised-scada\ied1a\ied1a-network-captures                      6     520.922754
attack\compromised-scada\ied1b\ied1b-network-captures                      8     680.314793
attack\compromised-scada\

In [2]:
print("\n" + "=" * 60)
print("Investigation 2: Check File Naming Anomalies")
print("=" * 60)

# Select the PCAP entries whose directory mentions "external" (case-insensitive).
in_external_dir = df_pcap['directory'].str.contains('external', case=False, na=False)
external_pcaps = df_pcap[in_external_dir]

print(f"\nPCAP files in External directory ({len(external_pcaps)} files):")
print("-" * 60)

# Files whose name says "normal" although they sit under an external-attack
# directory; each entry records the file, its relative path and the reason.
suspicious_files = []

for record in external_pcaps.to_dict('records'):
    pcap_name = record['file']
    pcap_path = record['path']

    # A "normal" capture is expected in the benign directory, not in external.
    flagged = 'normal' in pcap_name.lower()

    if flagged:
        status = "⚠️ Suspicious"
    else:
        status = "✓"

    print(f"{status} {pcap_name}")
    print(f"   Path: {pcap_path}")
    print(f"   Size: {record['size_mb']:.1f} MB")

    if flagged:
        suspicious_files.append({
            'file': pcap_name,
            'path': pcap_path,
            'reason': 'Filename contains "normal" but in External directory'
        })
    print()

# Summary: either the all-clear line or a recap of every flagged file.
if not suspicious_files:
    print("\n✓ No file organization anomalies found")
else:
    print("\n" + "=" * 60)
    print(f"⚠️  Found {len(suspicious_files)} suspicious files:")
    print("=" * 60)
    for entry in suspicious_files:
        print(f"File: {entry['file']}")
        print(f"Path: {entry['path']}")
        print(f"Issue: {entry['reason']}")
        print()


排查2：检查文件命名异常

External目录下的PCAP文件 (8 个):
------------------------------------------------------------
✓ veth460b141-0.pcap
   路径: attack\external\central-agent\veth460b141-0.pcap
   大小: 2.6 MB

✓ veth665f3cf-0.pcap
   路径: attack\external\external-attacker\external-attacker-network-capture\veth665f3cf-0.pcap
   大小: 43.9 MB

✓ veth4edc015-0.pcap
   路径: attack\external\ied1a\veth4edc015-0.pcap
   大小: 77.1 MB

✓ vethd9e14c0-0.pcap
   路径: attack\external\ied1b\vethd9e14c0-0.pcap
   大小: 29.7 MB

✓ veth8bc3408-0.pcap
   路径: attack\external\ied4c\veth8bc3408-0.pcap
   大小: 33.2 MB

⚠️ 可疑 network-wide-normal-0.pcap
   路径: attack\external\network-wide\network-wide-normal-0.pcap
   大小: 95.4 MB

⚠️ 可疑 network-wide-normal-1.pcap
   路径: attack\external\network-wide\network-wide-normal-1.pcap
   大小: 44.6 MB

✓ veth5bbeaa2-0.pcap
   路径: attack\external\scada-hmi\veth5bbeaa2-0.pcap
   大小: 93.6 MB


⚠️  发现 2 个可疑文件:
文件: network-wide-normal-0.pcap
路径: attack\external\network-wide\network-wide-normal-0.pcap

In [3]:
print("\n" + "=" * 60)
print("Investigation 3: File Naming Consistency Analysis")
print("=" * 60)

# Analyze file naming patterns
import re  # not used in this cell; kept because other cells may rely on it

# Filename prefixes that are recognised, tested in this order.
# Every prefix gets a "<prefix> prefix" bucket; anything else lands in 'other'.
known_prefixes = ['veth', 'substation', 'network-wide', 'ied']

naming_patterns = {f'{prefix} prefix': [] for prefix in known_prefixes}
naming_patterns['other'] = []

for idx, row in df_pcap.iterrows():
    filename = row['file']

    # First matching prefix wins; unmatched names fall through to 'other'.
    bucket = next(
        (f'{prefix} prefix' for prefix in known_prefixes if filename.startswith(prefix)),
        'other'
    )
    naming_patterns[bucket].append(row)

print("\nFile naming pattern statistics:")
print("-" * 60)
for pattern, files in naming_patterns.items():
    if files:
        print(f"\n{pattern}: {len(files)} files")
        # Show at most three examples per pattern
        for f in files[:3]:
            print(f"  - {f['file']} ({f['directory']})")
        if len(files) > 3:
            print(f"  ... and {len(files)-3} more")

print("\n" + "=" * 60)
print("Naming consistency assessment:")
print("=" * 60)

# Confusion level is driven only by how many files match no known prefix.
other_count = len(naming_patterns['other'])
if other_count > 5:
    confusion_level = 'High'
elif other_count > 0:
    confusion_level = 'Medium'
else:
    confusion_level = 'Low'

# Every bucket is listed (including 'ied prefix', which the summary previously
# skipped) so the counts below add up to the total number of PCAP files.
print(f"""
Observations:
1. veth prefix ({len(naming_patterns['veth prefix'])} files): Docker virtual NIC naming, NIC capture
2. substation prefix ({len(naming_patterns['substation prefix'])} files): Network-wide capture
3. network-wide prefix ({len(naming_patterns['network-wide prefix'])} files): Network-wide capture
4. ied prefix ({len(naming_patterns['ied prefix'])} files): Need to check
5. other ({other_count} files): Need to check

✓ Naming confusion level: {confusion_level}
""")


排查3：文件命名一致性分析

文件命名模式统计:
------------------------------------------------------------

veth开头: 131 个文件
  - vethffb308b-0.pcap (attack\compromised-ied\central-agent\central-agent-network-captures)
  - vethffb308b-1.pcap (attack\compromised-ied\central-agent\central-agent-network-captures)
  - vethffb308b-2.pcap (attack\compromised-ied\central-agent\central-agent-network-captures)
  ... 还有 128 个

substation开头: 18 个文件
  - substation-0.pcap (attack\compromised-scada\substation-wide-capture)
  - substation-1.pcap (attack\compromised-scada\substation-wide-capture)
  - substation-10.pcap (attack\compromised-scada\substation-wide-capture)
  ... 还有 15 个

network-wide开头: 21 个文件
  - network-wide-normal-0.pcap (attack\external\network-wide)
  - network-wide-normal-1.pcap (attack\external\network-wide)
  - network-wide-normal-14.pcap (benign\network-wide-pcap-capture\network-wide)
  ... 还有 18 个

命名一致性评估:

观察结果:
1. veth开头 (131 个): Docker虚拟网卡命名，NIC捕获
2. substation开头 (18 个): 全网捕获
3. network-wide开头 (2

In [4]:
print("\n" + "=" * 60)
print("Investigation 4: CSV Label Quality Comparison Analysis")
print("=" * 60)

# Read one example CSV per scenario.
# glob returns matches in arbitrary, OS-dependent order, so every result list
# is sorted to make the "[0]" sample the same file on every run.
csv_samples = {}

#  External
external_csv_path = sorted(glob.glob(os.path.join(config.EXTERNAL_LABELS_DIR, "**", "*.csv"), recursive=True))
if external_csv_path:
    csv_samples['External'] = pd.read_csv(external_csv_path[0])

#  IED
ied_csv_path = sorted(glob.glob(os.path.join(config.IED_LABELS_DIR, "**", "*.csv"), recursive=True))
if ied_csv_path:
    csv_samples['IED'] = pd.read_csv(ied_csv_path[0])

#  SCADA
scada_csv_path = sorted(glob.glob(os.path.join(config.SCADA_LABELS_DIR, "**", "*.csv"), recursive=True))
if scada_csv_path:
    csv_samples['SCADA'] = pd.read_csv(scada_csv_path[0])

# Columns every label file is checked for; used by both the comparison table
# and the per-scenario assessment so the two can never disagree.
required_cols = ['Timestamp', 'TargetIP', 'TransactionID', 'Attack']

#  Compare column structures
print("\nCSV label column structure comparison:")
print("-" * 60)

# One summary row per scenario: column count/names, a "Has <col>" flag for each
# required column, and the number of label rows.
comparison = [
    {
        'Scenario': scenario,
        'Columns': len(df.columns),
        'Column Names': ', '.join(map(str, df.columns)),
        **{f'Has {col}': col in df.columns for col in required_cols},
        'Rows': len(df)
    }
    for scenario, df in csv_samples.items()
]

df_comparison = pd.DataFrame(comparison)
print(df_comparison.to_string(index=False))

print("\n" + "=" * 60)
print("Label quality assessment:")
print("=" * 60)

for scenario, df in csv_samples.items():
    print(f"\n{scenario}:")

    #  Required column check
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        print(f"  ⚠️  Missing columns: {', '.join(missing_cols)}")
    else:
        print(f"  ✓ All required columns present")

    #  Show examples
    print(f"  First 3 rows:")
    print(df.head(3).to_string(index=False))


排查4：CSV标签质量对比分析

CSV标签列结构对比:
------------------------------------------------------------
      场景  列数                                         列名  有Timestamp  有TargetIP  有TransactionID  有Attack     行数
External   2                          Timestamp, Attack        True      False           False     True      1
     IED   4 Timestamp, TargetIP, Attack, TransactionID        True       True            True     True     66
   SCADA   4 Timestamp, TargetIP, Attack, TransactionID        True       True            True     True 107770

标签质量评估:

External:
  ⚠️  缺少列: TargetIP, TransactionID
  前3行示例:
              Timestamp              Attack
2023-01-01 21:00:44.389 Recon. Range: 65535

IED:
  ✓ 所有必需列完整
  前3行示例:
              Timestamp    TargetIP                       Attack  TransactionID
2023-03-23 05:09:22.829 185.175.0.2 Baseline Replay: In position              1
2023-03-23 05:30:24.984 185.175.0.2 Baseline Replay: In position           1238
2023-03-23 05:51:26.274 185.175.0.2 Baseline R

In [5]:
print("\n" + "=" * 60)
print("Investigation 5: IP Address and Device Mapping Verification")
print("=" * 60)

# IP → device mapping as given by the official documentation
official_ip_mapping = {
    '185.175.0.4': 'IED1A (Secure)',
    '185.175.0.8': 'IED4C (Secure)',
    '185.175.0.5': 'IED1B (Normal)',
    '185.175.0.2': 'SCADA HMI (Secure)',
    '185.175.0.3': 'SCADA HMI (Normal)',
    '185.175.0.6': 'Central Agent',
    '185.175.0.7': 'Attacker'
}

print("\nOfficial IP mapping:")
print("-" * 60)
for ip, device in official_ip_mapping.items():
    print(f"  {ip} → {device}")

# Extract actual IPs from PCAP
print("\nVerifying IP addresses from PCAP...")
from scapy.all import rdpcap, IP

# Pick the sample capture deterministically: glob order is arbitrary, so sort
# first, and fail with a clear message instead of a bare IndexError when the
# benign directory holds no captures.
benign_pcaps = sorted(glob.glob(os.path.join(config.BENIGN_PCAP_DIR, "*.pcap")))
if not benign_pcaps:
    raise FileNotFoundError(f"No .pcap files found in {config.BENIGN_PCAP_DIR}")
test_pcap = benign_pcaps[0]
print(f"Test file: {os.path.basename(test_pcap)}")

# Only the first max_packets packets are read to keep the check fast;
# addresses that first appear later in the capture are therefore not seen.
max_packets = 10000
packets = rdpcap(test_pcap, count=max_packets)

# Collect every source and destination address of packets with an IP layer.
observed_ips = set()
for pkt in packets:
    if IP in pkt:
        observed_ips.update((pkt[IP].src, pkt[IP].dst))

print(f"\nObserved IP addresses ({len(observed_ips)}):")
print("-" * 60)

for ip in sorted(observed_ips):
    device = official_ip_mapping.get(ip, "❓ Unknown device")
    marker = "✓" if ip in official_ip_mapping else "⚠️"
    print(f"{marker} {ip:15s} → {device}")

# Check for unknown IPs (kept as a set; later cells test it for emptiness)
unknown_ips = observed_ips - set(official_ip_mapping.keys())
if unknown_ips:
    print(f"\n⚠️  Found {len(unknown_ips)} unrecorded IP addresses:")
    # Sorted so the listing is stable across runs (set order is arbitrary).
    for ip in sorted(unknown_ips):
        print(f"  - {ip}")
else:
    print("\n✓ All IP addresses are in official mapping")


排查5：IP地址和设备映射验证

官方IP映射:
------------------------------------------------------------
  185.175.0.4 → IED1A (Secure)
  185.175.0.8 → IED4C (Secure)
  185.175.0.5 → IED1B (Normal)
  185.175.0.2 → SCADA HMI (Secure)
  185.175.0.3 → SCADA HMI (Normal)
  185.175.0.6 → Central Agent
  185.175.0.7 → Attacker

从PCAP中验证IP地址...




测试文件: network-wide-normal-14.pcap

观察到的IP地址 (5 个):
------------------------------------------------------------
✓ 185.175.0.3     → SCADA HMI (Normal)
✓ 185.175.0.4     → IED1A (Secure)
✓ 185.175.0.5     → IED1B (Normal)
✓ 185.175.0.6     → Central Agent
✓ 185.175.0.8     → IED4C (Secure)

✓ 所有IP地址都在官方映射中


In [6]:
print("\n" + "=" * 60)
print("Verifying TargetIP from CSV Labels")
print("=" * 60)

# TargetIP frequency table per scenario; scenarios whose label file has no
# TargetIP column are reported and left out of the dict.
target_ips = {}

for scenario, label_df in csv_samples.items():
    if 'TargetIP' not in label_df.columns:
        print(f"\n{scenario}: ⚠️  No TargetIP column")
        continue

    ip_counts = label_df['TargetIP'].value_counts()
    target_ips[scenario] = ip_counts

    print(f"\n{scenario} CSV TargetIP distribution:")
    print("-" * 60)
    for target_ip, occurrences in ip_counts.items():
        # Resolve the address against the official device mapping.
        device_name = official_ip_mapping.get(target_ip, "❓ Unknown")
        print(f"  {target_ip} → {device_name:25s} ({occurrences:,} times)")


从CSV标签验证TargetIP

External: ⚠️  没有TargetIP列

IED CSV中的TargetIP分布:
------------------------------------------------------------
  185.175.0.2 → SCADA HMI (Secure)        (66 次)

SCADA CSV中的TargetIP分布:
------------------------------------------------------------
  185.175.0.8 → IED4C (Secure)            (49,644 次)
  185.175.0.4 → IED1A (Secure)            (33,192 次)
  185.175.0.5 → IED1B (Normal)            (24,934 次)


In [7]:
print("\n" + "=" * 60)
print("Comprehensive Diagnostic Report")
print("=" * 60)

# Findings gathered from the earlier investigations; each entry is a dict with
# severity, category, issue, impact and solution.
issues = []

# Issue 1: File organization
if suspicious_files:
    issues.append({
        'severity': 'High',
        'category': 'File Organization',
        'issue': f'Found {len(suspicious_files)} Benign files in External directory',
        'impact': 'External PCAP may contain normal traffic, needs filtering',
        'solution': 'Use IP address filtering, keep only src_ip=185.175.0.7 packets'
    })

# Issue 2: Naming inconsistency
if len(naming_patterns['other']) > 0:
    issues.append({
        'severity': 'Medium',
        'category': 'Naming Convention',
        'issue': f'Found {len(naming_patterns["other"])} non-standard named files',
        'impact': 'File identification difficult, manual check needed',
        'solution': 'Create filename → capture type mapping table'
    })

# Issue 3: CSV labels incomplete
if 'External' in csv_samples:
    ext_cols = csv_samples['External'].columns.tolist()
    # Report exactly the columns that are absent; the issue is raised when
    # either one is missing, so the text must not always claim both.
    missing_ext_cols = [col for col in ('TargetIP', 'TransactionID') if col not in ext_cols]
    if missing_ext_cols:
        column_word = 'columns' if len(missing_ext_cols) > 1 else 'column'
        issues.append({
            'severity': 'High',
            'category': 'CSV Label Quality',
            'issue': f"External CSV missing {' and '.join(missing_ext_cols)} {column_word}",
            'impact': 'Cannot use triple matching, can only use time window matching',
            'solution': 'Use IP identification + relaxed time window (10 seconds)'
        })

# Issue 4: Unknown IP addresses
if unknown_ips:
    issues.append({
        'severity': 'Medium',
        'category': 'IP Mapping',
        'issue': f'Found {len(unknown_ips)} unrecorded IP addresses',
        'impact': 'May be external network addresses or gateways, need identification',
        'solution': 'Investigate these IP roles, update mapping table'
    })

if not issues:
    print("\n✅ No serious issues found, dataset quality is good")
else:
    print(f"\n⚠️  Found {len(issues)} issues:")
    print("\n")

    for i, issue in enumerate(issues, 1):
        print(f"Issue {i}:")
        print(f"  Severity: {issue['severity']}")
        print(f"  Category: {issue['category']}")
        print(f"  Description: {issue['issue']}")
        print(f"  Impact: {issue['impact']}")
        print(f"  Solution: {issue['solution']}")
        print()

# Save diagnostic report
import json

report = {
    'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_pcap_files': len(df_pcap),
    'total_csv_files': len(df_csv),
    'suspicious_files': suspicious_files,
    'issues': issues
}

report_path = config.DATA_PROCESSED / "dataset_quality_report.json"
# Make sure the output directory exists; on a fresh checkout it may not, and
# open(..., 'w') does not create parent directories.
report_dir = os.path.dirname(report_path)
if report_dir:
    os.makedirs(report_dir, exist_ok=True)
# ensure_ascii=False keeps non-ASCII characters (e.g. the "→" in the solution
# text) readable in the JSON file, hence the explicit UTF-8 encoding.
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

print(f"✓ Diagnostic report saved: {report_path}")


综合诊断报告

⚠️  发现 2 个问题:


问题 1:
  严重程度: 高
  类别: 文件组织
  问题描述: 发现 2 个Benign文件在External目录
  影响: External PCAP可能包含正常流量，需要过滤
  解决方案: 使用IP地址过滤，只保留src_ip=185.175.0.7的包

问题 2:
  严重程度: 高
  类别: CSV标签质量
  问题描述: External的CSV缺少TargetIP和TransactionID列
  影响: 无法使用三元组匹配，只能用时间窗口匹配
  解决方案: 采用IP识别+宽松时间窗口（10秒）

✓ 诊断报告已保存: C:\Users\Echo\Desktop\modbus-detection\data\processed\dataset_quality_report.json
