In [1]:
import pandas as pd
import numpy as np
from scapy.all import rdpcap, TCP
import os
import glob
from datetime import datetime, timedelta
from tqdm import tqdm
import sys
import pickle

sys.path.append(r'C:\Users\Echo\Desktop\modbus-detection\src')
import config

#  Load Phase 1 results
with open(config.DATA_PROCESSED / 'stage1_results.pkl', 'rb') as f:
    stage1 = pickle.load(f)

print("✓ Phase 1 results loaded successfully")
print(f"External: {stage1['total_external']:,}")
print(f"IED: {stage1['total_ied']:,}")
print(f"SCADA: {stage1['total_scada']:,}")
print(f"\nStarting quick test (5% data)...")



✓ 阶段1结果加载成功
External: 131,207
IED: 676
SCADA: 1,288,159

开始快速测试（5%数据）...


In [2]:
def parse_pcap_modbus(pcap_path, is_attack=False):
    """
    Parse a single PCAP file and extract Modbus packets
    
    Args:
        pcap_path: PCAP file path
        is_attack: Whether this is attack data (requires timezone conversion)
    
    Returns:
        DataFrame containing parsed Modbus packets
    """
    packets_data = []
    
    try:
        packets = rdpcap(pcap_path)
        
        for pkt in packets:
            #  Only process Modbus packets (TCP port 502)
            if TCP not in pkt or (pkt[TCP].sport != 502 and pkt[TCP].dport != 502):
                continue
            
            try:
                #  Extract timestamp
                timestamp = datetime.fromtimestamp(float(pkt.time))
                
                #  If attack data, timezone conversion needed (ADT → UTC, +3 hours)
                if is_attack:
                    timestamp = timestamp + timedelta(hours=3)
                
                #  Extract network layer info
                src_ip = pkt['IP'].src
                dst_ip = pkt['IP'].dst
                src_port = pkt[TCP].sport
                dst_port = pkt[TCP].dport
                
                #  Extract Modbus application layer data
                payload = bytes(pkt[TCP].payload)
                
                if len(payload) < 8:  #  MBAP Header at least 7 bytes + 1 byte Function Code
                    continue
                
                #  Parse MBAP Header
                transaction_id = int.from_bytes(payload[0:2], byteorder='big')
                protocol_id = int.from_bytes(payload[2:4], byteorder='big')
                length = int.from_bytes(payload[4:6], byteorder='big')
                unit_id = payload[6]
                function_code = payload[7]
                
                #  Parse specific fields based on Function Code
                start_address = None
                quantity = None
                data_length = None
                
                if function_code in [0x01, 0x02, 0x03, 0x04]:  #  Read operations
                    if len(payload) >= 12:
                        start_address = int.from_bytes(payload[8:10], byteorder='big')
                        quantity = int.from_bytes(payload[10:12], byteorder='big')
                        data_length = length - 2
                
                elif function_code in [0x05, 0x06]:  #  Write Single
                    if len(payload) >= 12:
                        start_address = int.from_bytes(payload[8:10], byteorder='big')
                        data_length = length - 2
                
                elif function_code in [0x0F, 0x10]:  #  Write Multiple
                    if len(payload) >= 13:
                        start_address = int.from_bytes(payload[8:10], byteorder='big')
                        quantity = int.from_bytes(payload[10:12], byteorder='big')
                        data_length = payload[12]
                
                #  Build data record
                packet_info = {
                    'timestamp': timestamp,
                    'src_ip': src_ip,
                    'dst_ip': dst_ip,
                    'src_port': src_port,
                    'dst_port': dst_port,
                    'transaction_id': transaction_id,
                    'protocol_id': protocol_id,
                    'length': length,
                    'unit_id': unit_id,
                    'function_code': function_code,
                    'start_address': start_address,
                    'quantity': quantity,
                    'data_length': data_length,
                    'payload_size': len(payload)
                }
                
                packets_data.append(packet_info)
                
            except Exception as e:
                #  Skip packets that failed to parse
                continue
                
    except Exception as e:
        print(f"Failed to read PCAP: {pcap_path} - {e}")
        return pd.DataFrame()
    
    return pd.DataFrame(packets_data)

print("✓ PCAP parsing function defined")

✓ PCAP解析函数定义完成


In [3]:
#  Test parsing a Benign file
test_pcap = glob.glob(os.path.join(config.BENIGN_PCAP_DIR, "*.pcap"))[0]
print(f"Test file: {os.path.basename(test_pcap)}")

df_test = parse_pcap_modbus(test_pcap, is_attack=False)
print(f"✓ Successfully parsed {len(df_test):,} Modbus packets")
print("\nFirst 5 records:")
print(df_test.head())

测试文件: network-wide-normal-14.pcap
✓ 成功解析 216,716 个Modbus包

前5条记录:
                   timestamp       src_ip       dst_ip  src_port  dst_port  \
0 2023-01-24 04:52:05.777466  185.175.0.3  185.175.0.4     37266       502   
1 2023-01-24 04:52:05.778793  185.175.0.4  185.175.0.3       502     37266   
2 2023-01-24 04:52:05.799833  185.175.0.3  185.175.0.4     37268       502   
3 2023-01-24 04:52:05.800935  185.175.0.4  185.175.0.3       502     37268   
4 2023-01-24 04:52:05.821522  185.175.0.3  185.175.0.4     37270       502   

   transaction_id  protocol_id  length  unit_id  function_code  start_address  \
0           16245            0       6        1              1           14.0   
1           16245            0       4        1              1            NaN   
2           16246            0       6        1              1           13.0   
3           16246            0       4        1              1            NaN   
4           16247            0       6        1             

In [4]:
print("=" * 60)
print("Quick Sampling Benign Data (5% test)")
print("=" * 60)

benign_pcaps = glob.glob(os.path.join(config.BENIGN_PCAP_DIR, "*.pcap"))

#  Only process 5% of files
sample_size = max(1, len(benign_pcaps) // 20)
sampled_pcaps = np.random.choice(benign_pcaps, sample_size, replace=False)

print(f"Sampling {len(sampled_pcaps)} files from {len(benign_pcaps)} files")

benign_data = []
for pcap_file in tqdm(sampled_pcaps, desc="Benign"):
    df = parse_pcap_modbus(pcap_file, is_attack=False)
    if len(df) > 0:
        df['source'] = 'benign'
        df['label'] = 'normal'
        #  Random sampling (target ~10000 packets)
        if len(df) > 10000:
            df = df.sample(n=10000, random_state=42)
        benign_data.append(df)

if benign_data:
    df_benign = pd.concat(benign_data, ignore_index=True)
    print(f"✓ Benign data: {len(df_benign):,} records")
else:
    df_benign = pd.DataFrame()
    print("✗ Benign data is empty")

快速采样Benign数据（5%测试）
从 19 个文件中采样 1 个


Benign: 100%|██████████| 1/1 [05:06<00:00, 306.96s/it]


✓ Benign数据: 10,000 条


In [5]:
print("\n" + "=" * 60)
print("Processing External Attack Data")
print("=" * 60)

external_pcaps = glob.glob(os.path.join(config.EXTERNAL_PCAP_DIR, "*.pcap"))

#  Only take the first PCAP for testing
test_external_pcap = external_pcaps[0] if external_pcaps else None

if test_external_pcap:
    print(f"Test file: {os.path.basename(test_external_pcap)}")
    df_external = parse_pcap_modbus(test_external_pcap, is_attack=True)
    df_external['source'] = 'external'
    df_external['label'] = 'attack'
    
    print(f"✓ External data: {len(df_external):,} records")
    print(f"Time range: {df_external['timestamp'].min()} to {df_external['timestamp'].max()}")
else:
    df_external = pd.DataFrame()


处理External Attack数据
测试文件: network-wide-normal-0.pcap
✓ External数据: 208,011 条
时间范围: 2023-02-01 14:48:39.921816 到 2023-02-02 00:06:40.878403


In [6]:
print("\n" + "=" * 60)
print("Merging Data")
print("=" * 60)

all_data = []
if len(df_benign) > 0:
    all_data.append(df_benign)
if len(df_external) > 0:
    all_data.append(df_external)

if all_data:
    df_quick_test = pd.concat(all_data, ignore_index=True)
    
    print(f"Total data: {len(df_quick_test):,} records")
    print(f"\nLabel distribution:")
    print(df_quick_test['label'].value_counts())
    
    #  Save
    output_path = config.DATA_PROCESSED / "quick_test_sample.csv"
    df_quick_test.to_csv(output_path, index=False)
    
    print(f"\n✓ Quick test data saved to: {output_path}")
    print(f"File size: {output_path.stat().st_size / (1024**2):.2f} MB")
    
    print("\n✓ Quick test completed!")
    print("Code verification passed, ready to run full version.")
else:
    print("✗ No data")


合并数据
总数据量: 218,011 条

标签分布:
label
attack    208011
normal     10000
Name: count, dtype: int64

✓ 快速测试数据已保存到: C:\Users\Echo\Desktop\modbus-detection\data\processed\quick_test_sample.csv
文件大小: 21.28 MB

✓ 快速测试完成！
代码验证通过，可以运行完整版本。


In [7]:
print("\n" + "=" * 60)
print("Phase 2 Summary")
print("=" * 60)

print(f"""
✓ Quick test completed:
  - Test data size: {len(df_quick_test):,} records
  - PCAP parsing: OK ✓
  - Timezone conversion: OK ✓
  - Data saving: OK ✓
  
⏭️  Ready: Can start full overnight run
Completion time: {datetime.now().strftime('%H:%M:%S')}
""")


阶段2总结

✓ 快速测试完成：
  - 测试数据量: 218,011 条
  - PCAP解析功能: 正常 ✓
  - 时区转换: 正常 ✓
  - 数据保存: 正常 ✓
  
⏭️  准备就绪：可以启动完整版过夜运行
完成时间: 16:42:41

