# P22 IDS - PCAP Processing Notebook

This notebook demonstrates how to process PCAP files (packet captures).

## What You'll Learn:
1. Load and process PCAP files
2. Extract packet features
3. Analyze packet metadata
4. Run threat detection on packets
5. Visualize packet-level analysis

## Setup

In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from collections import Counter

from orchestrator import ServiceOrchestrator
from services.dataIngestionService import DataIngestionService

# Probe for Scapy; a missing install is reported with a hint, not raised
try:
    import scapy.all as scapy
except ImportError:
    print("⚠ Scapy not installed. Install with: pip install scapy")
else:
    print("✓ Scapy is available")

# Printed whether or not the Scapy import succeeded
print("✓ Setup complete!")

## 1. Create Sample PCAP File (Optional)

Skip this step if you already have a real PCAP file; in that case, set `pcap_file` in Section 3 to its path.

In [None]:
# Build a synthetic capture (50 TCP packets followed by 30 UDP packets)
# and write it to ../sample_capture.pcap
try:
    from scapy.all import IP, TCP, UDP, Ether, wrpcap

    # TCP packets to port 80; source/destination addresses cycle via the modulo
    tcp_packets = [
        Ether()
        / IP(src=f"192.168.1.{n % 10}", dst=f"10.0.0.{n % 5}")
        / TCP(sport=1024 + n, dport=80)
        / (f"Sample payload {n}" * 10)
        for n in range(50)
    ]

    # UDP packets to port 53, using the same address cycling
    udp_packets = [
        Ether()
        / IP(src=f"192.168.1.{n % 10}", dst=f"10.0.0.{n % 5}")
        / UDP(sport=5000 + n, dport=53)
        / (f"DNS query {n}" * 5)
        for n in range(30)
    ]

    packets = tcp_packets + udp_packets

    # Persist the capture for the processing cells
    wrpcap('../sample_capture.pcap', packets)
    print(f"✓ Created sample PCAP file with {len(packets)} packets")

except Exception as err:
    # Best effort: any failure (including a missing Scapy) is reported, not raised
    print(f"Could not create sample PCAP: {err}")
    print("Please provide your own PCAP file")

## 2. Initialize IDS System

In [None]:
# Initialize orchestrator from the example configuration
orchestrator = ServiceOrchestrator('../config.example.yaml')

if orchestrator.initialize():
    print("✓ Services initialized!")

    # Check status: one line per service reported by the orchestrator
    status = orchestrator.getSystemStatus()
    for name, info in status['services'].items():
        print(f"  {name}: {info['status']}")
else:
    # A falsy return is treated as a failed start; say so explicitly instead of
    # leaving the cell silent and letting later cells fail without explanation.
    print("✗ Service initialization failed - check '../config.example.yaml' before running the cells below")

## 3. Process PCAP File

In [None]:
# Capture to analyse
pcap_file = '../sample_capture.pcap'  # Change to your PCAP file

print(f"Processing PCAP file: {pcap_file}")
pcap_result = orchestrator.processDataFile(pcap_file, fileType='pcap')

# Report the fields of the result that the rest of the notebook relies on
print(f"\n✓ PCAP Processing Complete!")
for label, key in (("File Type", 'fileType'), ("Packets", 'packetCount')):
    print(f"  {label}: {pcap_result[key]}")
print(f"  Feature Shape: {pcap_result['features'].shape}")

## 4. Analyze Packet Metadata

In [None]:
# Pull the metadata out of the processing result and tabulate it
metadata = pcap_result['metadata']
metadata_df = pd.DataFrame(metadata)

print("=== Packet Metadata ===")
print(metadata_df.head(10))

# Row count plus the number of distinct endpoints on each side
print(f"\nTotal Packets: {metadata_df.shape[0]}")
for label, column in (("Unique Source IPs", 'srcIP'), ("Unique Dest IPs", 'dstIP')):
    print(f"{label}: {metadata_df[column].nunique()}")

In [None]:
# Tally packets per protocol; rows with a missing protocol are left out of the tally
protocol_counts = Counter(metadata_df['protocol'].dropna())
total_packets = len(metadata_df)

print("\n=== Protocol Distribution ===")
for protocol, count in protocol_counts.items():
    # Share is relative to all rows, including those with no protocol value
    share = count / total_packets * 100
    print(f"{protocol}: {count} packets ({share:.1f}%)")

# Bar chart of the tally
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(list(protocol_counts.keys()), list(protocol_counts.values()), color='steelblue')
ax.set_title('Protocol Distribution')
ax.set_xlabel('Protocol')
ax.set_ylabel('Packet Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Packet lengths with missing values removed
packet_sizes = metadata_df['length'].dropna()

# Histogram on the left, box plot on the right
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))

hist_ax.hist(packet_sizes, bins=30, color='lightcoral', edgecolor='black')
hist_ax.set_title('Packet Size Distribution')
hist_ax.set_xlabel('Size (bytes)')
hist_ax.set_ylabel('Count')

box_ax.boxplot(packet_sizes)
box_ax.set_title('Packet Size Box Plot')
box_ax.set_ylabel('Size (bytes)')

fig.tight_layout()
plt.show()

# Summary statistics for the same series
smallest, largest = packet_sizes.min(), packet_sizes.max()
print(f"Average Packet Size: {packet_sizes.mean():.2f} bytes")
print(f"Median Packet Size: {packet_sizes.median():.2f} bytes")
print(f"Min/Max: {smallest:.0f} / {largest:.0f} bytes")

## 5. Run Threat Detection

In [None]:
# Ensemble run: request both models and ask for their outputs to be aggregated
print("Running threat detection...")

inference_options = {'modelType': 'both', 'aggregate': True}
detection_result = orchestrator.runInference(data=pcap_result, **inference_options)

print("\n✓ Detection Complete!")

In [None]:
# Display detection results
print("=== Threat Detection Results ===")
print(f"Model: {detection_result['modelType']}")
# 'aggregationMethod' is optional in the result; fall back to 'N/A' when absent
print(f"Aggregation: {detection_result.get('aggregationMethod', 'N/A')}")
print(f"Confidence: {detection_result['confidence']:.4f}")

# Count threats: any non-zero prediction is counted as a threat
predictions = detection_result['predictions']
total_predictions = len(predictions)
threats = sum(1 for p in predictions if p != 0)

# Guard the percentage so an empty prediction set reports 0% instead of
# raising ZeroDivisionError
threat_rate = threats / total_predictions * 100 if total_predictions else 0.0

print(f"\nThreat Summary:")
print(f"  Total Packets Analyzed: {total_predictions}")
print(f"  Normal Traffic: {total_predictions - threats}")
print(f"  Threats Detected: {threats}")
print(f"  Threat Rate: {threat_rate:.2f}%")

## 6. Visualize Detection Results

In [None]:
# Two panels: prediction per index (left) and class frequencies (right)
fig, (trace_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left: predictions in order, with a dashed reference line at 0.5
trace_ax.plot(predictions, 'o-', markersize=3, alpha=0.6)
trace_ax.axhline(y=0.5, color='r', linestyle='--', alpha=0.5, label='Threat Threshold')
trace_ax.set_title('Predictions Over Packets')
trace_ax.set_xlabel('Packet Index')
trace_ax.set_ylabel('Prediction (0=Normal, 1+=Threat)')
trace_ax.legend()
trace_ax.grid(alpha=0.3)

# Right: how often each predicted class occurs
pred_counts = Counter(predictions)
dist_ax.bar(list(pred_counts.keys()), list(pred_counts.values()), color='orange', edgecolor='black')
dist_ax.set_title('Prediction Distribution')
dist_ax.set_xlabel('Class')
dist_ax.set_ylabel('Count')
dist_ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Positions of every non-zero prediction
threat_indices = [idx for idx, label in enumerate(predictions) if label != 0]

if not threat_indices:
    print("\n✓ No threats detected - all traffic appears normal")
else:
    print(f"\n=== Suspicious Packets (First 10) ===")
    # NOTE(review): assumes prediction i corresponds to metadata row i — confirm
    # that the model emits one prediction per packet
    shown_columns = ['srcIP', 'dstIP', 'srcPort', 'dstPort', 'protocol', 'length']
    threat_metadata = metadata_df.iloc[threat_indices[:10]]
    print(threat_metadata[shown_columns])

## 7. Compare LSTM vs CNN for PCAP

In [None]:
# Run each model on its own, without aggregation. Keyword arguments are used
# so the calls match the ensemble call in section 5 and do not depend on the
# positional order of runInference's parameters.
lstm_result = orchestrator.runInference(data=pcap_result, modelType='lstm', aggregate=False)
cnn_result = orchestrator.runInference(data=pcap_result, modelType='cnn', aggregate=False)

lstm_predictions = lstm_result['predictions']
cnn_predictions = cnn_result['predictions']

print("=== Model Comparison ===")
print(f"\nLSTM (Temporal Analysis):")
print(f"  Threats Detected: {sum(1 for p in lstm_predictions if p != 0)}")
print(f"  Avg Confidence: {np.mean(lstm_result['confidences']):.4f}")

print(f"\nCNN (Spatial/Packet Analysis):")
print(f"  Threats Detected: {sum(1 for p in cnn_predictions if p != 0)}")
print(f"  Avg Confidence: {np.mean(cnn_result['confidences']):.4f}")

# Calculate agreement over the pairs actually compared: zip() stops at the
# shorter sequence, so that length (not the ensemble's prediction count) is
# the correct denominator.
compared = min(len(lstm_predictions), len(cnn_predictions))
agreement = sum(1 for l, c in zip(lstm_predictions, cnn_predictions) if l == c)
# Report 0% rather than raising ZeroDivisionError when nothing was compared
agreement_rate = agreement / compared * 100 if compared else 0.0
print(f"\nModel Agreement: {agreement}/{compared} ({agreement_rate:.1f}%)")

In [None]:
# Three panels: LSTM alone, CNN alone, then both overlaid
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# The two single-model panels differ only in data, line style and name
single_panels = (
    (axes[0], lstm_result, 'b-', 'LSTM'),
    (axes[1], cnn_result, 'g-', 'CNN'),
)
for panel, result, line_style, model_name in single_panels:
    panel.plot(result['predictions'], line_style, alpha=0.6, label=model_name)
    panel.set_title(f'{model_name} Predictions')
    panel.set_xlabel('Packet Index')
    panel.set_ylabel('Prediction')
    panel.grid(alpha=0.3)

# Overlay panel: both traces on one axis, with a legend to tell them apart
overlay = axes[2]
overlay.plot(lstm_result['predictions'], 'b-', alpha=0.5, label='LSTM')
overlay.plot(cnn_result['predictions'], 'g-', alpha=0.5, label='CNN')
overlay.set_title('LSTM vs CNN')
overlay.set_xlabel('Packet Index')
overlay.set_ylabel('Prediction')
overlay.legend()
overlay.grid(alpha=0.3)

fig.tight_layout()
plt.show()

## 8. Save Analysis Report

In [None]:
# Create analysis report from the values computed in the earlier sections.
# Percentages fall back to 0.0 when there are no predictions, so an empty
# prediction set does not raise ZeroDivisionError.
total_predictions = len(predictions)

report = {
    'file': pcap_file,
    'timestamp': pd.Timestamp.now().isoformat(),
    'packet_count': len(metadata_df),
    'protocol_distribution': dict(protocol_counts),
    'packet_size_stats': {
        'mean': float(packet_sizes.mean()),
        'median': float(packet_sizes.median()),
        'min': int(packet_sizes.min()),
        'max': int(packet_sizes.max())
    },
    'detection_results': {
        'threats_detected': threats,
        'threat_rate': float(threats / total_predictions * 100) if total_predictions else 0.0,
        'confidence': float(detection_result['confidence'])
    },
    'model_comparison': {
        'lstm_threats': int(sum(1 for p in lstm_result['predictions'] if p != 0)),
        'cnn_threats': int(sum(1 for p in cnn_result['predictions'] if p != 0)),
        'agreement_rate': float(agreement / total_predictions * 100) if total_predictions else 0.0
    }
}

# Serialise before opening the file: if a value is not JSON-serialisable the
# error is raised here, and no truncated report is left on disk.
report_json = json.dumps(report, indent=2)

# Save report
report_file = '../pcap_analysis_report.json'
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(report_json)

print(f"✓ Analysis report saved to: {report_file}")
print("\n" + report_json)

## 9. Cleanup

In [None]:
# Shutdown services
# Calls shutdown() on the orchestrator created in section 2.
orchestrator.shutdown()
# The confirmation is printed unconditionally; shutdown()'s return value is not checked.
print("✓ Services shut down successfully!")

## Summary

In this notebook, you learned:
1. ✓ How to process PCAP files
2. ✓ Extract and analyze packet metadata
3. ✓ Run threat detection on packet captures
4. ✓ Compare LSTM vs CNN for packet analysis
5. ✓ Visualize packet-level results
6. ✓ Generate analysis reports

**Key Insights:**
- **CNN** is better at analyzing individual packet structures (spatial patterns)
- **LSTM** is better at analyzing packet sequences (temporal patterns)
- **Ensemble** combines both for robust detection

**Next Steps:**
- Try with real network captures
- Experiment with different maxPackets settings
- Learn about model training in the next notebook