In [7]:
from scapy.all import *
import random
import string
import time
import os
import sys
from datetime import datetime
import numpy as np
from collections import defaultdict

import logging
logging.getLogger("scapy").setLevel(logging.ERROR)



In [8]:
class AdvancedStegoGenerator:
    """Generate a synthetic packet dataset containing covert-channel traffic.

    Each ``generate_*`` method builds scapy packets for one technique, appends
    them to ``self.packets`` and adds the number produced to ``self.stats``
    under a technique-specific key. ``save_dataset`` writes everything
    collected so far to a PCAP file and emits a plain-text report.

    Attributes:
        target_networks: Value passed to the constructor. It is only stored;
            none of the generators in this class read it.
        output_dir: Directory that receives the PCAP and the report.
        packets: Every packet generated so far, in generation order.
        stats: ``defaultdict(int)`` mapping technique key -> packets generated.
        secret_messages: Pool of byte strings used as covert payload.
        internal_ips / external_ips: Random address pools used as endpoints.
    """

    # Minimum Shannon entropy (bits per byte) accepted for "high-entropy" payloads.
    _MIN_PAYLOAD_ENTROPY = 7.0

    def __init__(self, target_networks, output_dir="stego_dataset"):
        """Create the output directory and the random address/message pools.

        Args:
            target_networks: Stored as-is on the instance (not otherwise used
                by this class).
            output_dir: Directory for generated files; created if missing.
        """
        self.target_networks = target_networks
        self.output_dir = output_dir
        self.packets = []
        self.stats = defaultdict(int)

        os.makedirs(output_dir, exist_ok=True)

        self.secret_messages = [
            b"EXFILTRATE: Database credentials - admin:P@ssw0rd123",
            b"C2_SERVER: 185.220.101.45:8443",
            b"MALWARE_PAYLOAD: base64encoded_shellcode_here",
            b"RANSOMWARE_KEY: AES256_key_transmission",
            b"STOLEN_DATA: Credit card numbers 4532-1234-5678-9010",
            b"BACKDOOR_ACTIVE: Listening on port 31337",
            b"DATA_BREACH: User database 50MB transferring",
            b"APT_COMMAND: Execute payload on target systems"
        ] * 100  # Repeat for more data

        self.internal_ips = [f"192.168.{random.randint(1,254)}.{random.randint(1,254)}"
                             for _ in range(50)]
        self.external_ips = [f"{random.randint(1,223)}.{random.randint(1,254)}."
                             f"{random.randint(1,254)}.{random.randint(1,254)}"
                             for _ in range(100)]

    # ------------------------------------------------------------------
    # Private helpers shared by the generators
    # ------------------------------------------------------------------

    def _random_secret(self, message_count):
        """Return ``message_count`` randomly chosen secret messages, concatenated."""
        return b"".join(random.choices(self.secret_messages, k=message_count))

    @staticmethod
    def _to_bits(data):
        """Return ``data`` as a string of '0'/'1' characters, 8 per byte, MSB first."""
        return ''.join(format(byte, '08b') for byte in data)

    def _random_endpoints(self):
        """Return a random ``(internal_ip, external_ip)`` pair."""
        return random.choice(self.internal_ips), random.choice(self.external_ips)

    @staticmethod
    def _random_payload(size):
        """Return ``size`` uniformly random bytes drawn from the ``random`` module."""
        return bytes(random.getrandbits(8) for _ in range(size))

    @staticmethod
    def _shannon_entropy(payload):
        """Return the empirical Shannon entropy of ``payload`` in bits per byte."""
        byte_counts = np.bincount(np.frombuffer(payload, dtype=np.uint8), minlength=256)
        probabilities = byte_counts / len(payload)
        # The small epsilon keeps log2 finite for byte values that never occur.
        return float(-np.sum(probabilities * np.log2(probabilities + 1e-10)))

    def _record(self, technique, technique_packets):
        """Append packets to the dataset and add their count to ``stats``.

        The count is accumulated (not overwritten) so that ``stats`` stays
        consistent with ``packets`` when a generator is invoked more than once.
        """
        self.packets.extend(technique_packets)
        self.stats[technique] += len(technique_packets)

    def _breakdown_lines(self):
        """Return the per-technique report rows, largest count first."""
        total = len(self.packets)
        rows = []
        for technique, count in sorted(self.stats.items(), key=lambda x: x[1], reverse=True):
            # Guard the share computation: stats may hold entries while no
            # packets exist (e.g. a generator was called with num_packets=0).
            percentage = (count / total) * 100 if total else 0.0
            rows.append(f"  {technique:<35} {count:>8,} packets ({percentage:>5.2f}%)")
        return rows

    # ------------------------------------------------------------------
    # Technique generators
    # ------------------------------------------------------------------

    def generate_ip_id_steganography(self, num_packets=10000):
        """Hide two secret bits per packet in the low bits of the IP ID field.

        Produces ``num_packets`` HTTP-like request packets plus one response
        packet for every tenth request. Returns the list of packets created.
        """
        print(f"[1/10] Generating IP ID Steganography: {num_packets} packets...")
        technique_packets = []

        secret_bits = self._to_bits(self._random_secret(num_packets // 100))

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Embed data in IP ID field (2 bits per packet for higher capacity)
            bit_pair = secret_bits[2 * i:2 * i + 2]
            if bit_pair:
                # Base IP ID with its two LSBs replaced by the secret bits.
                base_id = random.randint(10000, 60000)
                ip_id = (base_id & 0xFFFC) | int(bit_pair.ljust(2, '0'), 2)
            else:
                ip_id = random.randint(10000, 65000)

            client_port = random.randint(49152, 65535)

            # Create realistic HTTP traffic
            packet = IP(src=src_ip, dst=dst_ip, id=ip_id, ttl=64) / \
                     TCP(sport=client_port, dport=80,
                         flags="PA", seq=random.randint(1000, 4000000000)) / \
                     Raw(load=f"GET /search?q=query{i} HTTP/1.1\r\n"
                              f"Host: example.com\r\nUser-Agent: Mozilla/5.0\r\n\r\n")

            technique_packets.append(packet)

            # Add occasional response packets for realism
            if i % 10 == 0:
                response = IP(src=dst_ip, dst=src_ip, id=random.randint(10000, 65000), ttl=56) / \
                           TCP(sport=80, dport=client_port, flags="PA") / \
                           Raw(load=b"HTTP/1.1 200 OK\r\nContent-Length: 128\r\n\r\n")
                technique_packets.append(response)

        self._record('ip_id_steganography', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_ttl_covert_channel(self, num_packets=8000):
        """Encode one secret bit per ICMP echo request in the parity of the TTL.

        Returns the list of packets created.
        """
        print(f"[2/10] Generating TTL Covert Channel: {num_packets} packets...")
        technique_packets = []

        secret_bits = self._to_bits(self._random_secret(num_packets // 100))

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Encode bit in TTL: even (64) = 0, odd (63) = 1
            if i < len(secret_bits):
                ttl = 63 if secret_bits[i] == '1' else 64
            else:
                ttl = random.choice([63, 64])

            # ICMP Echo Request (ping-like traffic)
            packet = IP(src=src_ip, dst=dst_ip, ttl=ttl) / \
                     ICMP(type=8, code=0, id=random.randint(1, 65535),
                          seq=i % 65536) / \
                     Raw(load=b"X" * random.randint(32, 64))

            technique_packets.append(packet)

        self._record('ttl_covert_channel', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_tcp_isn_steganography(self, num_packets=12000):
        """Hide one secret byte per SYN in the low 8 bits of the initial sequence number.

        Produces ``num_packets`` SYN packets plus a SYN-ACK for every fifth SYN.
        Returns the list of packets created.
        """
        print(f"[3/10] Generating TCP ISN Steganography: {num_packets} packets...")
        technique_packets = []

        secret_data = self._random_secret(num_packets // 50)

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Embed byte in lower 8 bits of ISN
            if i < len(secret_data):
                base_isn = random.randint(1000000, 4000000000)
                isn = (base_isn & 0xFFFFFF00) | secret_data[i]
            else:
                isn = random.randint(1000000, 4294967295)

            client_port = random.randint(49152, 65535)
            server_port = random.choice([80, 443, 8080, 22, 3389])

            # TCP SYN packet (connection initiation)
            packet = IP(src=src_ip, dst=dst_ip, id=random.randint(1, 65535), ttl=64) / \
                     TCP(sport=client_port, dport=server_port,
                         flags="S", seq=isn, window=65535)

            technique_packets.append(packet)

            # Add SYN-ACK responses for realism
            if i % 5 == 0:
                synack = IP(src=dst_ip, dst=src_ip, ttl=56) / \
                         TCP(sport=server_port, dport=client_port,
                             flags="SA", seq=random.randint(1000000, 4000000000),
                             # Sequence space is 32-bit: wrap so isn == 2**32 - 1
                             # does not produce a value the field cannot hold.
                             ack=(isn + 1) & 0xFFFFFFFF)
                technique_packets.append(synack)

        self._record('tcp_isn_steganography', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_tcp_timestamp_steganography(self, num_packets=8000):
        """Hide one secret byte per packet in the low 8 bits of the TCP timestamp option.

        Returns the list of packets created.
        """
        print(f"[4/10] Generating TCP Timestamp Steganography: {num_packets} packets...")
        technique_packets = []

        secret_data = self._random_secret(num_packets // 100)

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Embed data in TCP timestamp LSBs (wall-clock milliseconds, 32-bit)
            base_timestamp = int(time.time() * 1000) % (2**32)
            if i < len(secret_data):
                timestamp = (base_timestamp & 0xFFFFFF00) | secret_data[i]
            else:
                timestamp = base_timestamp

            packet = IP(src=src_ip, dst=dst_ip, ttl=64) / \
                     TCP(sport=random.randint(49152, 65535), dport=443,
                         flags="PA", seq=random.randint(1000, 4000000000),
                         options=[('Timestamp', (timestamp, 0))]) / \
                     Raw(load=b"A" * random.randint(100, 500))

            technique_packets.append(packet)

        self._record('tcp_timestamp_stego', technique_packets)
        print(f"    Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_timing_channel(self, num_packets=5000):
        """Encode one secret bit per packet in the inter-packet time gap.

        Each packet's ``time`` attribute is set to the previous packet's time
        plus the gap for its bit (0 -> 0.015 s, 1 -> 0.055 s), so consecutive
        timestamps differ by exactly the encoded spacing. The first packet's
        gap is measured from the moment generation started.
        Returns the list of packets created.
        """
        print(f"[5/10] Generating Timing Channel: {num_packets} packets...")
        technique_packets = []

        secret_bits = self._to_bits(self._random_secret(num_packets // 200))

        current_time = time.time()

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Encode timing information (stored as packet timestamp delta)
            if i < len(secret_bits):
                # 0 = 0.015s spacing, 1 = 0.055s spacing
                time_delta = 0.055 if secret_bits[i] == '1' else 0.015
            else:
                time_delta = random.uniform(0.015, 0.055)

            # Accumulate the gaps; multiplying the index by the current gap
            # would not yield the intended spacing between neighbours.
            current_time += time_delta

            packet = IP(src=src_ip, dst=dst_ip, ttl=64) / \
                     TCP(sport=random.randint(49152, 65535), dport=443,
                         flags="PA") / \
                     Raw(load=b"ENCRYPTED_DATA")

            packet.time = current_time
            technique_packets.append(packet)

        self._record('timing_channel', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_packet_size_modulation(self, num_packets=10000):
        """Encode one secret bit per packet in the payload size (large = 1, small = 0).

        Returns the list of packets created.
        """
        print(f"[6/10] Generating Packet Size Modulation: {num_packets} packets...")
        technique_packets = []

        secret_bits = self._to_bits(self._random_secret(num_packets // 100))

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Encode bit through packet size
            if i >= len(secret_bits):
                payload_size = random.randint(100, 1400)
            elif secret_bits[i] == '1':
                payload_size = random.randint(800, 1400)
            else:
                payload_size = random.randint(100, 300)

            payload = self._random_payload(payload_size)

            packet = IP(src=src_ip, dst=dst_ip, ttl=64) / \
                     TCP(sport=random.randint(49152, 65535), dport=80, flags="PA") / \
                     Raw(load=payload)

            technique_packets.append(packet)

        self._record('size_modulation', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_dns_tunneling(self, num_packets=8000):
        """Embed hex-encoded secret bytes in the subdomain of DNS A queries.

        Produces ``num_packets`` queries plus a response for every third query.
        Returns the list of packets created.
        """
        print(f"[7/10] Generating DNS Tunneling: {num_packets} packets...")
        technique_packets = []

        secret_data = self._random_secret(num_packets // 50)
        domains = ["example.com", "test.org", "data.net", "secure.io"]

        for i in range(num_packets):
            src_ip = random.choice(self.internal_ips)
            dns_server = random.choice(["8.8.8.8", "1.1.1.1", "208.67.222.222"])

            # Encode data in subdomain (hex encoding)
            # NOTE(review): the window is 4 bytes wide but advances by 1 byte
            # per query, so consecutive queries overlap by 3 bytes. Kept as-is
            # because changing the stride changes how many queries carry data;
            # confirm whether a stride of 4 was intended.
            if i < len(secret_data):
                encoded = secret_data[i:i+4].hex()
                subdomain = f"{encoded}.covert"
            else:
                subdomain = f"normal{random.randint(1000,9999)}"

            domain = random.choice(domains)
            query_name = f"{subdomain}.{domain}"

            client_port = random.randint(49152, 65535)
            query_id = random.randint(1, 65535)

            # DNS Query
            packet = IP(src=src_ip, dst=dns_server, ttl=64) / \
                     UDP(sport=client_port, dport=53) / \
                     DNS(id=query_id, qr=0, opcode=0,
                         qd=DNSQR(qname=query_name, qtype='A'))

            technique_packets.append(packet)

            # Add DNS response for realism
            if i % 3 == 0:
                response = IP(src=dns_server, dst=src_ip, ttl=56) / \
                           UDP(sport=53, dport=client_port) / \
                           DNS(id=query_id, qr=1, aa=0, qd=packet[DNS].qd,
                               an=DNSRR(rrname=query_name, ttl=300,
                                        rdata=f"192.168.{random.randint(1,254)}.{random.randint(1,254)}"))
                technique_packets.append(response)

        self._record('dns_tunneling', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_high_entropy_payloads(self, num_packets=10000):
        """Create TCP packets whose random payloads have Shannon entropy >= 7.0 bits/byte.

        Returns the list of packets created.
        """
        print(f"[8/10] Generating High-Entropy Payloads: {num_packets} packets...")
        technique_packets = []

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Generate high-entropy random data; redraw (same size) until the
            # empirical entropy clears the threshold.
            payload_size = random.randint(200, 1400)
            while True:
                payload = self._random_payload(payload_size)
                if self._shannon_entropy(payload) >= self._MIN_PAYLOAD_ENTROPY:
                    break

            packet = IP(src=src_ip, dst=dst_ip, ttl=64) / \
                     TCP(sport=random.randint(49152, 65535), dport=443,
                         flags="PA") / \
                     Raw(load=payload)

            technique_packets.append(packet)

        self._record('high_entropy', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_reserved_bits_steganography(self, num_packets=7000):
        """Encode one secret bit per packet in the 3-bit IP flags field.

        Returns the list of packets created.
        """
        print(f"[9/10] Generating Reserved Bits Steganography: {num_packets} packets...")
        technique_packets = []

        secret_bits = self._to_bits(self._random_secret(num_packets // 100))

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            # Encode in IP flags: 0x4 is the reserved bit of the 3-bit flags
            # value (secret bit 1), 0x2 is Don't-Fragment (secret bit 0).
            if i < len(secret_bits):
                ip_flags = 0x4 if secret_bits[i] == '1' else 0x2
            else:
                ip_flags = random.choice([0x2, 0x4])

            packet = IP(src=src_ip, dst=dst_ip, flags=ip_flags, ttl=64) / \
                     TCP(sport=random.randint(49152, 65535), dport=80,
                         flags="PA") / \
                     Raw(load=b"GET / HTTP/1.1\r\n\r\n")

            technique_packets.append(packet)

        self._record('reserved_bits', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    def generate_protocol_anomalies(self, num_packets=8000):
        """Create TCP packets exhibiting one randomly chosen header anomaly each.

        The anomaly is one of: high-port-to-high-port traffic, an unusual flag
        combination, an odd window size, or a non-zero urgent pointer without
        the URG flag. Returns the list of packets created.
        """
        print(f"[10/10] Generating Protocol Anomalies: {num_packets} packets...")
        technique_packets = []

        for i in range(num_packets):
            src_ip, dst_ip = self._random_endpoints()

            anomaly_type = random.choice([
                'unusual_port_combo',
                'malformed_flags',
                'odd_window_size',
                'suspicious_urgptr'
            ])

            if anomaly_type == 'unusual_port_combo':
                # High source port to high dest port (unusual)
                tcp_layer = TCP(sport=random.randint(60000, 65535),
                                dport=random.randint(60000, 65535),
                                flags="PA")

            elif anomaly_type == 'malformed_flags':
                # Unusual flag combinations
                flags = random.choice(["FPU", "FSRPA", "FSRA"])
                tcp_layer = TCP(sport=random.randint(49152, 65535), dport=443,
                                flags=flags)

            elif anomaly_type == 'odd_window_size':
                # Unusual window size patterns
                window = random.choice([0, 1, 7, 31, 63])
                tcp_layer = TCP(sport=random.randint(49152, 65535), dport=80,
                                flags="PA", window=window)

            else:  # suspicious_urgptr
                # Non-zero urgent pointer without URG flag
                tcp_layer = TCP(sport=random.randint(49152, 65535), dport=443,
                                flags="PA", urgptr=random.randint(1, 100))

            packet = IP(src=src_ip, dst=dst_ip, ttl=64) / tcp_layer / Raw(load=b"DATA")
            technique_packets.append(packet)

        self._record('protocol_anomalies', technique_packets)
        print(f"     Generated {len(technique_packets)} packets")
        return technique_packets

    # ------------------------------------------------------------------
    # Orchestration and output
    # ------------------------------------------------------------------

    def generate_all_techniques(self):
        """Run all ten generators with their fixed packet counts and print a summary.

        Returns ``self.packets`` (the full accumulated packet list).
        """
        start_time = time.time()

        print("\nStarting comprehensive steganography generation...\n")

        # Generate each technique
        self.generate_ip_id_steganography(10000)
        self.generate_ttl_covert_channel(8000)
        self.generate_tcp_isn_steganography(12000)
        self.generate_tcp_timestamp_steganography(8000)
        self.generate_timing_channel(5000)
        self.generate_packet_size_modulation(10000)
        self.generate_dns_tunneling(8000)
        self.generate_high_entropy_payloads(10000)
        self.generate_reserved_bits_steganography(7000)
        self.generate_protocol_anomalies(8000)

        elapsed = time.time() - start_time
        # Avoid dividing by zero if the clock did not advance.
        rate = len(self.packets) / elapsed if elapsed > 0 else 0.0

        print("\n" + "="*70)
        print(" GENERATION COMPLETE")
        print("="*70)
        print(f" Total Packets Generated: {len(self.packets):,}")
        print(f" Time Taken: {elapsed:.2f} seconds")
        print(f" Generation Rate: {rate:.0f} packets/second")
        print("="*70 + "\n")

        return self.packets

    def save_dataset(self, filename=None):
        """Save generated packets to a PCAP file and write the report.

        Args:
            filename: PCAP file name inside ``output_dir``. When ``None`` a
                name containing the current date and time is used.

        Returns:
            The path of the PCAP file that was written.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"steganography_dataset_{timestamp}.pcap"

        filepath = os.path.join(self.output_dir, filename)

        print(f"Saving dataset to: {filepath}")
        wrpcap(filepath, self.packets)
        print(f"Successfully saved {len(self.packets):,} packets!")

        self.generate_report()

        return filepath

    def generate_report(self):
        """Write ``dataset_report.txt`` to ``output_dir`` and print the same breakdown.

        The per-technique share is reported as 0.00% when no packets have been
        generated, instead of raising ``ZeroDivisionError``.
        """
        report_file = os.path.join(self.output_dir, "dataset_report.txt")
        breakdown = self._breakdown_lines()

        with open(report_file, 'w') as f:
            f.write("="*70 + "\n")
            f.write(" STEGANOGRAPHY DATASET GENERATION REPORT\n")
            f.write("="*70 + "\n\n")
            f.write(f"Generation Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Packets: {len(self.packets):,}\n\n")

            f.write("Breakdown by Technique:\n")
            f.write("-" * 70 + "\n")
            for row in breakdown:
                f.write(row + "\n")

            f.write("\n" + "="*70 + "\n")
            f.write("Dataset is ready for ML training!\n")
            f.write("Next steps:\n")
            f.write("  1. Merge with benign traffic dataset\n")
            f.write("  2. Label data (0=benign, 1=steganography)\n")
            f.write("  3. Extract features using feature_extraction.py\n")
            f.write("  4. Train ML models using train.py\n")
            f.write("="*70 + "\n")

        print(f"Report saved to: {report_file}")

        # Print summary to console
        print("\nDataset Statistics:")
        print("-" * 70)
        for row in breakdown:
            print(row)
        print("-" * 70)




In [9]:
def main():
    """Generate every technique, write the PCAP and report, and list the output files."""
    # Configuration
    networks = ["192.168.0.0/16", "10.0.0.0/8"]
    out_dir = "stego_dataset"

    # Build the generator, produce all traffic, then persist it.
    stego_gen = AdvancedStegoGenerator(target_networks=networks, output_dir=out_dir)
    stego_gen.generate_all_techniques()
    pcap_path = stego_gen.save_dataset()

    report_path = os.path.join(out_dir, 'dataset_report.txt')
    summary = (
        "\nDataset generation complete!",
        "\nOutput files:",
        f"PCAP: {pcap_path}",
        f"Report: {report_path}",
    )
    for line in summary:
        print(line)
    

if __name__ == "__main__":
    # Script entry point: run the generator and map the outcome to an exit status.
    import traceback

    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is treated as a clean, user-requested stop.
        print("\n\nGeneration interrupted by user")
        raise SystemExit(0)
    except Exception as exc:
        # Any other failure: show the message and full traceback, exit non-zero.
        print(f"\nError: {exc}")
        traceback.print_exc()
        raise SystemExit(1)


Starting comprehensive steganography generation...

[1/10] Generating IP ID Steganography: 10000 packets...
     Generated 11000 packets
[2/10] Generating TTL Covert Channel: 8000 packets...
     Generated 8000 packets
[3/10] Generating TCP ISN Steganography: 12000 packets...
     Generated 14400 packets
[4/10] Generating TCP Timestamp Steganography: 8000 packets...
    Generated 8000 packets
[5/10] Generating Timing Channel: 5000 packets...
     Generated 5000 packets
[6/10] Generating Packet Size Modulation: 10000 packets...
     Generated 10000 packets
[7/10] Generating DNS Tunneling: 8000 packets...
     Generated 10667 packets
[8/10] Generating High-Entropy Payloads: 10000 packets...
     Generated 10000 packets
[9/10] Generating Reserved Bits Steganography: 7000 packets...
     Generated 7000 packets
[10/10] Generating Protocol Anomalies: 8000 packets...
     Generated 8000 packets

 GENERATION COMPLETE
 Total Packets Generated: 92,067
 Time Taken: 30.28 seconds
 Generation Rate