# Fusion of malicious CSV files 

In [17]:
import pandas as pd
import glob
import os

def filter_and_combine_csv_files(file_paths, output_file):
    """Merge the non-benign rows of several flow CSV files into a single CSV.

    Each readable input that has a 'Stage' column is tagged with a
    'source_file' column (the file's base name), stripped of rows whose
    'Stage' equals "benign" (case-insensitive) or is missing, and appended
    to the result. Rows that still contain any missing value after the merge
    are dropped before the result is written to ``output_file``.

    Parameters:
    file_paths (iterable of str): Paths of the CSV files to merge.
    output_file (str): Path of the combined CSV to write.

    Returns:
    pandas.DataFrame or None: The combined frame that was written, or None
    when no input file could be used.
    """
    data_frames = []

    for file in file_paths:
        print(f"Processing: {file}")  # Debugging

        # Read CSV with error handling; malformed lines are skipped by pandas.
        try:
            df = pd.read_csv(file, encoding="utf-8", on_bad_lines="skip")
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue  # Skip this file

        # Ensure 'Stage' column exists
        if 'Stage' not in df.columns:
            print(f"Skipping {file}: 'Stage' column missing")
            continue

        # Add a new column with the filename (without the full path)
        df["source_file"] = os.path.basename(file)

        # Remember which rows really carry a stage before the string
        # conversion: astype(str) turns a missing value into the text "nan",
        # which would otherwise pass the benign filter AND escape dropna().
        has_stage = df['Stage'].notna()

        # Filter out rows where 'Stage' == "benign" (case-insensitive)
        df['Stage'] = df['Stage'].astype(str)
        filtered_df = df[has_stage & (df['Stage'].str.lower() != "benign")]

        data_frames.append(filtered_df)

    if not data_frames:
        print("No valid data found!")
        return None

    # Merge all filtered data and drop rows with missing values
    combined_df = (
        pd.concat(data_frames, ignore_index=True)
        .dropna()
        .reset_index(drop=True)
    )

    # Save to CSV
    combined_df.to_csv(output_file, index=False)
    print(f"Combined CSV saved: {output_file}")

    # Hand the result back so callers that assign the return value get data.
    return combined_df



In [1]:
import pandas as pd
import glob
import os

def process_filtered_flux_files(base_directory, output_directory):
    """Write one non-benign CSV per capture folder found under ``base_directory``.

    Every ``filtered_flux_non_correspondants.csv`` located anywhere below
    ``base_directory`` is read; rows whose 'Stage' is "benign"
    (case-insensitive) are removed and the remainder is saved to
    ``<output_directory>/<parent folder name>_non_benign.csv``. Files without
    a 'Stage' column, or with no non-benign row, produce no output. Any error
    on a single file is printed and that file is skipped.

    Parameters:
    base_directory (str): Root folder searched recursively.
    output_directory (str): Folder receiving the results (created if needed).
    """
    os.makedirs(output_directory, exist_ok=True)

    # Recursive search: "**" matches any depth of sub-folders.
    matches = glob.glob(
        os.path.join(base_directory, "**/filtered_flux_non_correspondants.csv"),
        recursive=True,
    )

    kept_rows = 0

    for csv_path in matches:
        try:
            print(f"Processing: {csv_path}")

            flows = pd.read_csv(csv_path, encoding="utf-8", on_bad_lines="skip")

            if 'Stage' not in flows.columns:
                print(f"Skipping {csv_path}: 'Stage' column missing")
                continue

            # The folder that directly contains the CSV names the output file.
            folder_name = os.path.basename(os.path.dirname(csv_path))

            # Compare as lower-cased text so "Benign", "BENIGN", ... all match.
            flows['Stage'] = flows['Stage'].astype(str)
            stage_lowered = flows['Stage'].str.lower()
            non_benign = flows[stage_lowered != "benign"]

            row_count = len(non_benign)
            if row_count == 0:
                print(f"No non-benign entries found in {folder_name}")
                continue

            destination = os.path.join(output_directory, f"{folder_name}_non_benign.csv")
            non_benign.to_csv(destination, index=False)
            print(f"Saved {row_count} non-benign entries from {folder_name} to: {destination}")
            kept_rows += row_count

        except Exception as err:
            print(f"Error processing {csv_path}: {err}")
            continue

    print("\nProcessing complete!")
    print(f"Total non-benign entries processed: {kept_rows}")
    print(f"Output files saved in: {output_directory}")

# Usage example:
# Raw strings keep the Windows backslashes literal; the plain-string form
# relied on "\P" and "\H" being unrecognized escapes, which newer Python
# versions warn about.
base_dir = r"E:\PFE2025\Hamza"  # or provide the full path to the parent directory
output_dir = r"E:\PFE2025\Hamza"  # folder where results will be saved
process_filtered_flux_files(base_dir, output_dir)

Processing: E:\PFE2025\Hamza\enp0s3-monday-pvt.pcap_Flow\filtered_flux_non_correspondants.csv
No non-benign entries found in enp0s3-monday-pvt.pcap_Flow
Processing: E:\PFE2025\Hamza\enp0s3-monday.pcap_Flow\filtered_flux_non_correspondants.csv
No non-benign entries found in enp0s3-monday.pcap_Flow
Processing: E:\PFE2025\Hamza\enp0s3-public-thursday.pcap_Flow\filtered_flux_non_correspondants.csv
Saved 137 non-benign entries from enp0s3-public-thursday.pcap_Flow to: E:\PFE2025\Hamza\enp0s3-public-thursday.pcap_Flow_non_benign.csv
Processing: E:\PFE2025\Hamza\enp0s3-public-tuesday.pcap_Flow\filtered_flux_non_correspondants.csv
Saved 11865 non-benign entries from enp0s3-public-tuesday.pcap_Flow to: E:\PFE2025\Hamza\enp0s3-public-tuesday.pcap_Flow_non_benign.csv
Processing: E:\PFE2025\Hamza\enp0s3-public-wednesday.pcap_Flow\filtered_flux_non_correspondants.csv
Saved 8632 non-benign entries from enp0s3-public-wednesday.pcap_Flow to: E:\PFE2025\Hamza\enp0s3-public-wednesday.pcap_Flow_non_benig

In [18]:
# Collect every CSV directly inside the flow-export folder (non-recursive).
directory = "E:\\DAPT\\csv"
file_pattern = os.path.join(directory, "*.csv")
data_files = glob.glob(file_pattern)
# Show the matched paths so the input set is visible in the cell output.
print(data_files)
# Merge the non-benign rows of all matched files into one CSV;
# filtered_data holds whatever the helper returns.
output_file = "E:\\DAPT\\combined_output.csv"
filtered_data = filter_and_combine_csv_files(data_files,output_file)

['E:\\DAPT\\csv\\enp0s3-monday-pvt.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-monday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-public-thursday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-public-tuesday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-public-wednesday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-pvt-thursday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-pvt-tuesday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-pvt-wednesday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-tcpdump-friday.pcap_Flow.csv', 'E:\\DAPT\\csv\\enp0s3-tcpdump-pvt-friday.pcap_Flow.csv']
Processing: E:\DAPT\csv\enp0s3-monday-pvt.pcap_Flow.csv
Processing: E:\DAPT\csv\enp0s3-monday.pcap_Flow.csv
Processing: E:\DAPT\csv\enp0s3-public-thursday.pcap_Flow.csv
Processing: E:\DAPT\csv\enp0s3-public-tuesday.pcap_Flow.csv
Processing: E:\DAPT\csv\enp0s3-public-wednesday.pcap_Flow.csv
Processing: E:\DAPT\csv\enp0s3-pvt-thursday.pcap_Flow.csv
Skipping E:\DAPT\csv\enp0s3-pvt-thursday.pcap_Flow.csv: 'Stage' column missing
Processing: E:\DAPT\csv\enp0s3-pvt-tuesday

In [7]:
import pandas as pd

def clean_csv(input_file, output_file):
    """
    Copy a CSV file while discarding every row that has at least one NaN.

    Parameters:
    input_file (str): Path to input CSV file
    output_file (str): Path to save cleaned CSV file

    Returns:
    tuple: (rows_before, rows_after) - Row counts of the input as read and of
    the cleaned data that was written
    """
    raw = pd.read_csv(input_file)

    # dropna() with default arguments removes a row if ANY column is missing.
    complete_rows = raw.dropna()

    # The index is not written, so the output has the same columns as the input.
    complete_rows.to_csv(output_file, index=False)

    return len(raw), len(complete_rows)

# Example usage
if __name__ == "__main__":
    # Raw strings keep the Windows backslashes literal instead of relying on
    # "\P", "\D" and "\c" being unrecognized escape sequences.
    # Input and output are the same path: the file is cleaned in place.
    input_file = r"E:\PFE2025\Dataset\Dataset_1\combined_output.csv"
    output_file = r"E:\PFE2025\Dataset\Dataset_1\combined_output.csv"

    rows_before, rows_after = clean_csv(input_file, output_file)
    print(f"Rows before cleaning: {rows_before}")
    print(f"Rows after cleaning: {rows_after}")
    print(f"Removed {rows_before - rows_after} rows containing NaN values")

Rows before cleaning: 39293
Rows after cleaning: 39215
Removed 78 rows containing NaN values


# Convert malicious flow to packets


In [3]:
import pandas as pd
import os
from scapy.all import rdpcap, wrpcap
from datetime import datetime

# Paths
# csv_file: the combined flow CSV to load.
# pcap_directory: folder path for the captures; it is only assigned in this cell.
csv_file = "E:\\Dataset\\Dataset_1\\combined_output.csv"
pcap_directory = "E:\\DAPT\\pcap-data\\"

# Read CSV file into a module-level DataFrame (default dtype inference).
df = pd.read_csv(csv_file)

# Convert timestamp to UNIX time
def convert_to_unix(timestamp_str):
    """Convert a "dd/mm/YYYY hh:MM:SS AM/PM" string to a Unix timestamp.

    Parameters:
    timestamp_str (str): Timestamp text in day/month/year 12-hour format.

    Returns:
    float or None: Seconds since the epoch, or None when the value cannot be
    parsed (wrong format, or not a string at all, e.g. a NaN cell).
    """
    try:
        # The parsed datetime is naive, so .timestamp() interprets it in the
        # local time zone of the machine running this code.
        dt_object = datetime.strptime(timestamp_str, "%d/%m/%Y %I:%M:%S %p")  # Handle AM/PM format
        return dt_object.timestamp()
    except (ValueError, TypeError):
        # ValueError: text does not match the format.
        # TypeError: value is not a string (strptime rejects floats/None).
        print(f"Invalid timestamp format: {timestamp_str}")
        return None
    

In [11]:
import os
import pandas as pd
from scapy.all import wrpcap
from scapy.utils import PcapReader
from tqdm import tqdm
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from typing import Dict, List, Tuple
from dataclasses import dataclass
from scapy.layers.l2 import Ether
from scapy.packet import Packet

@dataclass
class FlowWindow:
    """Time window of one flow together with the packets collected for it."""
    # Start of the window; filled from the flow row's "Timestamp" value.
    start_time: float
    # End of the window; filled from the flow row's "end_time" value.
    end_time: float
    # Packets whose capture time lies within [start_time, end_time].
    packets: List[Packet]

def convert_to_unix(timestamp_str: str) -> float:
    """Convert a "dd/mm/YYYY hh:MM:SS AM/PM" string to a Unix timestamp.

    Args:
        timestamp_str: Timestamp text in day/month/year 12-hour format.

    Returns:
        Seconds since the epoch, or ``np.nan`` when the value cannot be
        parsed (wrong format, or not a string at all, e.g. a NaN cell), so
        that the failure can later be removed with ``dropna``.
    """
    try:
        # The parsed datetime is naive, so .timestamp() interprets it in the
        # local time zone of the machine running this code.
        return datetime.strptime(timestamp_str, "%d/%m/%Y %I:%M:%S %p").timestamp()
    except (ValueError, TypeError):
        # ValueError: text does not match the format.
        # TypeError: value is not a string (strptime rejects floats/None).
        print(f"Invalid timestamp format: {timestamp_str}")
        return np.nan

def process_pcap_file(args: Tuple[str, pd.DataFrame, str, str]) -> None:
    """Split one capture into per-flow PCAP files based on time windows.

    Args:
        args: ``(pcap_filename, group, pcap_directory, extracted_directory)``.
            ``group`` holds the flow rows for this capture and must provide
            the "Timestamp" and "end_time" columns.

    A packet is attached to every flow whose [Timestamp, end_time] interval
    contains the packet's capture time; no other packet field is compared,
    so overlapping flows each receive the same packet. Flows that collected
    at least one packet are written to
    ``<extracted_directory>/<capture name without ".pcap">_<row label>.pcap``.
    A missing capture or a read error is reported with a message and nothing
    is written.
    """
    pcap_filename, group, pcap_directory, extracted_directory = args
    pcap_path = os.path.join(pcap_directory, pcap_filename)

    if not os.path.exists(pcap_path):
        print(f"\nPCAP file not found: {pcap_path}")
        return

    # One initially empty window per flow row, keyed by the row's index label.
    flow_windows: Dict[int, FlowWindow] = {}
    for row_label, flow_row in group.iterrows():
        flow_windows[row_label] = FlowWindow(
            start_time=flow_row["Timestamp"],
            end_time=flow_row["end_time"],
            packets=[],
        )

    try:
        with PcapReader(pcap_path) as capture:
            # Parallel arrays (same order as the dict) allow a single
            # vectorized window test per packet.
            labels = np.array(list(flow_windows.keys()))
            window_starts = np.array([window.start_time for window in flow_windows.values()])
            window_ends = np.array([window.end_time for window in flow_windows.values()])

            for packet in capture:
                stamp = float(packet.time)
                inside = (window_starts <= stamp) & (stamp <= window_ends)

                # All packets are kept in memory until the capture is read.
                for label in labels[inside]:
                    flow_windows[label].packets.append(packet)

    except Exception as e:
        print(f"\nError reading packets from {pcap_path}: {e}")
        return

    # Write out only the flows that actually collected packets.
    stem = pcap_filename.replace('.pcap', '')
    for label, window in flow_windows.items():
        if not window.packets:
            continue
        output_pcap = os.path.join(extracted_directory, f"{stem}_{label}.pcap")
        os.makedirs(os.path.dirname(output_pcap), exist_ok=True)
        wrpcap(output_pcap, window.packets)
        print(f"\nSaved {len(window.packets)} packets to {output_pcap}")

def main():
    """Extract per-flow PCAP files for every flow listed in the combined CSV.

    Loads the combined flow CSV, converts each flow's "Timestamp" to a Unix
    time, derives the flow's end time from "Flow Duration", groups the flows
    by the capture they came from (derived from "source_file") and hands
    each group to ``process_pcap_file`` on a thread pool.
    """
    # Paths
    csv_file = "E:\\Dataset\\Dataset_1\\combined_output.csv"
    pcap_directory = "E:\\DAPT\\pcap-data\\"
    extracted_directory = "E:\\Dataset\\Dataset_1\\extracted\\"

    # Read CSV file with explicit dtypes for the columns used below
    df = pd.read_csv(
        csv_file,
        dtype={
            'source_file': 'category',
            'Flow Duration': 'float64',
            'Timestamp': 'str'
        }
    )

    # Unparseable timestamps become NaN and are dropped together with
    # missing durations.
    df["Timestamp"] = df["Timestamp"].apply(convert_to_unix)
    # NOTE(review): dividing by 1e6 presumably converts microseconds to
    # seconds - confirm against the flow exporter's units.
    df["Flow Duration"] = df["Flow Duration"] / 1e6
    df = df.dropna(subset=["Timestamp", "Flow Duration"])

    # Strip the literal "_Flow.csv" suffix to recover the capture file name.
    # regex=False is explicit because the default differs between pandas
    # versions and "." must not be treated as a wildcard.
    df["pcap_filename"] = df["source_file"].str.replace("_Flow.csv", "", regex=False)
    df["end_time"] = df["Timestamp"] + df["Flow Duration"]

    # Group flows by the capture they belong to
    grouped = df.groupby("pcap_filename")

    # Process PCAP files concurrently, one task per capture
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        args = [(pcap_filename, group, pcap_directory, extracted_directory)
                for pcap_filename, group in grouped]
        # list(...) drains the lazy map so the progress bar advances and
        # worker exceptions surface here.
        list(tqdm(
            executor.map(process_pcap_file, args),
            total=len(args),
            desc="Processing PCAP files"
        ))

# Entry point: run the extraction when this code executes as the main module.
if __name__ == "__main__":
    main()

Processing PCAP files:   0%|          | 0/7 [00:00<?, ?it/s]


Saved 366 packets to E:\Dataset\Dataset_1\extracted\enp0s3-tcpdump-pvt-friday_22974.pcap

Saved 1 packets to E:\Dataset\Dataset_1\extracted\enp0s3-tcpdump-friday_22967.pcap

Saved 296 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_88.pcap

Saved 341 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_89.pcap

Saved 418 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_91.pcap

Saved 375 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_93.pcap

Saved 31 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_96.pcap

Saved 14 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_97.pcap

Saved 14 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_98.pcap

Saved 14 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_99.pcap

Saved 9 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_100.pcap

Saved 21 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_

Processing PCAP files:  14%|█▍        | 1/7 [03:46<22:38, 226.46s/it]


Saved 20 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9402.pcap

Saved 4 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9403.pcap

Saved 46 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9404.pcap

Saved 2 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9405.pcap

Saved 35 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9406.pcap

Saved 20 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9407.pcap

Saved 14 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9408.pcap

Saved 67 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9409.pcap

Saved 7 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9410.pcap

Saved 45 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9411.pcap

Saved 2 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9412.pcap

Saved 2 packets to E:\Dataset\Dataset_1\extracted\enp0s3-public-tuesday_9414.pc

Processing PCAP files:  29%|██▊       | 2/7 [39:17<1:52:14, 1346.97s/it]


Saved 128047 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20860.pcap

Saved 574221 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20862.pcap

Saved 6 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20863.pcap

Saved 5 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20864.pcap

Saved 51 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20865.pcap

Saved 128 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20866.pcap

Saved 7 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20867.pcap

Saved 33 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20868.pcap

Saved 85 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20869.pcap

Saved 144 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20870.pcap

Saved 4 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20871.pcap

Saved 26 packets to E:\Dataset\Dataset_1\extracted\enp0s3-pvt-thursday_20872.pca

Processing PCAP files: 100%|██████████| 7/7 [3:37:52<00:00, 1867.45s/it]  


# Merge malicious pcap files into one (deprecated)


In [3]:
from scapy.all import PcapReader, PcapWriter
import glob
import os

# Define the directory containing the PCAP files and the output file
pcap_directory = "E:\\Dataset\\Dataset_1\\extracted\\"  # Adjust as needed
output_file = "E:\\Dataset\\Dataset_1\\merged_output.pcap"

# Get a list of all PCAP files in the specified directory (non-recursive).
# NOTE(review): glob.glob gives no ordering guarantee, so packets land in the
# merged file in file-listing order, not capture-time order - confirm this is
# acceptable for downstream tools.
pcap_files = glob.glob(os.path.join(pcap_directory, "*.pcap"))

# Create a PcapWriter object for the output file
# append=False starts a fresh output file instead of extending an existing one.
# Set sync=True to ensure packets are written immediately.
with PcapWriter(output_file, append=False, sync=True) as writer:
    for pcap_file in pcap_files:
        print(f"Processing: {pcap_file}")
        # Use PcapReader to iterate through packets one at a time
        # (keeps memory flat regardless of capture size).
        try:
            with PcapReader(pcap_file) as reader:
                for pkt in reader:
                    writer.write(pkt)
        except Exception as e:
            # Best effort: report the failing file and continue with the next;
            # packets already copied from it stay in the output.
            print(f"Error processing {pcap_file}: {e}")

print(f"\nAll PCAP files have been merged into: {output_file}")


Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_100.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_101.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_103.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_105.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_106.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_107.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_108.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_109.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_110.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_111.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_112.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_113.pcap
Processing: E:\Dataset\Dataset_1\extracted\enp0s3-public-thursday_114.pcap
Processing: E:\Dataset\Da

# Set the "Label" column to 1 (malicious)

In [8]:
import pandas as pd

# Specify input and output file paths.
# Raw strings keep the Windows backslashes literal instead of relying on
# "\P", "\D" and "\c" being unrecognized escape sequences.
# Both paths are identical: the CSV is rewritten in place.
input_csv = r"E:\PFE2025\Dataset\Dataset_1\combined_output.csv"
output_csv = r"E:\PFE2025\Dataset\Dataset_1\combined_output.csv"

# Read the CSV file
df = pd.read_csv(input_csv)

# Set every value in the "Label" column to 1
# (the column is created if it does not exist yet).
df["Label"] = 1

# Save the modified DataFrame back to a CSV file
df.to_csv(output_csv, index=False)

print(f"Updated CSV saved as {output_csv}")


Updated CSV saved as E:\PFE2025\Dataset\Dataset_1\combined_output.csv


# Delete pcap files containing only 1 packet

In [1]:
import os
from scapy.utils import PcapReader
from tqdm import tqdm

def count_packets(pcap_path):
    """
    Report how many packets a PCAP file holds, capped at two.

    Reading stops as soon as a second packet is seen, so the result is
    0 (empty), 1 (exactly one packet) or 2 (two or more packets).
    Returns -1 if the file is corrupted or unreadable.
    """
    try:
        with PcapReader(pcap_path) as reader:
            seen = 0
            for _ in reader:
                seen += 1
                if seen == 2:
                    # Two packets are enough to know the file is not a
                    # single-packet capture; skip the rest.
                    break
            return seen

    except Exception as e:
        print(f"Error reading {pcap_path}: {e}")
        return -1

def cleanup_pcap_folder(folder_path):
    """
    Remove every single-packet PCAP file found directly in ``folder_path``.

    Only entries ending in .pcap or .pcapng (case-insensitive) are examined.
    Returns a dict with the counters 'total_files' (matching entries),
    'deleted_files', 'error_files' (unreadable captures or failed deletions)
    and 'skipped_files' (matching entries that are not regular files).
    """
    stats = {
        'total_files': 0,
        'deleted_files': 0,
        'error_files': 0,
        'skipped_files': 0
    }

    # Candidate names are selected by extension only.
    candidates = []
    for entry in os.listdir(folder_path):
        if entry.lower().endswith(('.pcap', '.pcapng')):
            candidates.append(entry)
    stats['total_files'] = len(candidates)

    for filename in tqdm(candidates, desc="Processing PCAP files"):
        file_path = os.path.join(folder_path, filename)

        # A directory can carry a .pcap suffix too; leave it alone.
        if not os.path.isfile(file_path):
            stats['skipped_files'] += 1
            continue

        packet_count = count_packets(file_path)

        if packet_count == -1:
            # The capture could not be read.
            stats['error_files'] += 1
            continue

        if packet_count != 1:
            # Empty or multi-packet captures are kept.
            continue

        try:
            os.remove(file_path)
            stats['deleted_files'] += 1
            print(f"\nDeleted: {filename} (1 packet)")
        except Exception as e:
            print(f"\nError deleting {filename}: {e}")
            stats['error_files'] += 1

    return stats


In [2]:
def main():
    """Interactively delete single-packet PCAP files from a chosen folder.

    Prompts for the folder, validates it, asks for an explicit "yes"
    confirmation, runs ``cleanup_pcap_folder`` and prints a summary of the
    returned counters. Returns None in every case.
    """
    # Get folder path from user
    folder_path = input("Enter the path to the folder containing PCAP files: ").strip()

    # Verify folder exists
    if not os.path.isdir(folder_path):
        print("Error: Invalid folder path!")
        return

    # Confirm with user: deletion is irreversible, so only the exact answer
    # "yes" (case-insensitive) proceeds.
    print("\nWARNING: This will delete all PCAP files containing only one packet in:")
    print(f"  {folder_path}")
    confirmation = input("\nDo you want to continue? (yes/no): ").strip().lower()

    if confirmation != 'yes':
        print("Operation cancelled.")
        return

    # Process the folder
    print("\nStarting cleanup...")
    stats = cleanup_pcap_folder(folder_path)

    # Skipped entries are not regular files, so they must not be reported as
    # retained files; with this the four categories add up to the total.
    retained_files = (
        stats['total_files']
        - stats['deleted_files']
        - stats['error_files']
        - stats['skipped_files']
    )

    # Print summary
    print("\nCleanup Summary:")
    print(f"Total PCAP files found: {stats['total_files']}")
    print(f"Files deleted: {stats['deleted_files']}")
    print(f"Files with errors: {stats['error_files']}")
    print(f"Files skipped: {stats['skipped_files']}")
    print(f"Files retained: {retained_files}")


# Run the interactive cleanup; it prompts for the folder and a confirmation.
main()


  E:\Dataset\Dataset_1\extracted

Starting cleanup...


Processing PCAP files:  13%|█▎        | 90/705 [00:00<00:03, 201.34it/s]


Deleted: enp0s3-public-tuesday_9495.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9496.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9497.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9498.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9499.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9500.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9501.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9502.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9503.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9504.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9505.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9506.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9507.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9508.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9509.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9510.pcap (1 packet)


Processing PCAP files:  16%|█▌        | 113/705 [00:00<00:02, 207.76it/s]


Deleted: enp0s3-public-tuesday_9511.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9513.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9514.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9515.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9516.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9518.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9519.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9520.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9521.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9522.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9523.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9524.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9525.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9526.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9527.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9528.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9529.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9530.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9533.pcap (1 p

Processing PCAP files:  20%|█▉        | 138/705 [00:00<00:02, 219.60it/s]


Deleted: enp0s3-public-tuesday_9539.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9540.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9541.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9542.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9543.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9544.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9545.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9546.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9547.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9548.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9552.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9553.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9554.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9555.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9556.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9557.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9558.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9559.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9560.pcap (1 p

Processing PCAP files:  23%|██▎       | 163/705 [00:00<00:02, 222.87it/s]


Deleted: enp0s3-public-tuesday_9567.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9568.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9569.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9570.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9571.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9572.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9573.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9574.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9575.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9576.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9577.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9578.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9579.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9580.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9581.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9582.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9583.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9584.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9585.pcap (1 p

Processing PCAP files:  27%|██▋       | 190/705 [00:01<00:02, 235.67it/s]


Deleted: enp0s3-public-tuesday_9594.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9595.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9596.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9597.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9598.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9599.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9600.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9601.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9602.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9603.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9604.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9605.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9606.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9607.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9608.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9609.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9610.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9611.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9612.pcap (1 p

Processing PCAP files:  30%|███       | 215/705 [00:01<00:02, 234.93it/s]


Deleted: enp0s3-public-tuesday_9619.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9620.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9621.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9622.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9623.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9624.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9625.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9626.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9627.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9629.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9630.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9631.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9632.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9633.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9637.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9639.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9640.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9641.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9642.pcap (1 p

Processing PCAP files:  35%|███▍      | 245/705 [00:01<00:01, 247.08it/s]


Deleted: enp0s3-public-tuesday_9654.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9655.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9656.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9657.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9658.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9659.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9660.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9661.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9662.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9663.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9664.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9665.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9666.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9667.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9668.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9669.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9672.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9673.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9677.pcap (1 p

Processing PCAP files:  38%|███▊      | 270/705 [00:01<00:01, 231.45it/s]


Deleted: enp0s3-public-tuesday_9791.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9793.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9795.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9796.pcap (1 packet)

Deleted: enp0s3-public-tuesday_9797.pcap (1 packet)


Processing PCAP files:  66%|██████▌   | 465/705 [00:02<00:01, 161.97it/s]


Deleted: enp0s3-pvt-thursday_20999.pcap (1 packet)


Processing PCAP files:  69%|██████▉   | 486/705 [00:02<00:01, 170.60it/s]


Deleted: enp0s3-pvt-thursday_21003.pcap (1 packet)


Processing PCAP files:  82%|████████▏ | 577/705 [00:03<00:00, 174.32it/s]


Deleted: enp0s3-pvt-thursday_21106.pcap (1 packet)


Processing PCAP files:  90%|████████▉ | 634/705 [00:03<00:00, 182.93it/s]


Deleted: enp0s3-pvt-thursday_21187.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21196.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21200.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21205.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21226.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21252.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21293.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21298.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21300.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21362.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21406.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21425.pcap (1 packet)


Processing PCAP files:  93%|█████████▎| 653/705 [00:03<00:00, 158.62it/s]


Deleted: enp0s3-pvt-thursday_21452.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21470.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21488.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21518.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21570.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21601.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21609.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21632.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21663.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21673.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21697.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21739.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21756.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21762.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21780.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21884.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21886.pcap (1 packet)


Processing PCAP files:  95%|█████████▌| 670/705 [00:03<00:00, 161.38it/s]


Deleted: enp0s3-pvt-thursday_21969.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21970.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21973.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_21982.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22112.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22167.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22172.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22242.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22285.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22292.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22334.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22343.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22419.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22428.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22429.pcap (1 packet)


Processing PCAP files:  97%|█████████▋| 687/705 [00:03<00:00, 163.18it/s]


Deleted: enp0s3-pvt-thursday_22487.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22491.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22602.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22617.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22649.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22653.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22674.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22710.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22718.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22720.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22733.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22813.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22830.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22841.pcap (1 packet)

Deleted: enp0s3-pvt-thursday_22844.pcap (1 packet)

Deleted: enp0s3-tcpdump-friday_22967.pcap (1 packet)


Processing PCAP files: 100%|██████████| 705/705 [00:03<00:00, 180.45it/s]



Cleanup Summary:
Total PCAP files found: 705
Files deleted: 239
Files with errors: 0
Files skipped: 0
Files retained: 466


# Set the "Label" column to 1 for every CSV in a folder

In [None]:
import pandas as pd
from pathlib import Path
import os
from tqdm import tqdm
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Any

# Set up logging
# Records at INFO level and above are sent to two handlers: a file handler
# writing 'csv_processing.log' (relative path, so it lands in the current
# working directory) and a stream handler. Each record is formatted as
# "<time> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('csv_processing.log'),
        logging.StreamHandler()
    ]
)

def process_single_csv(args: tuple) -> Dict[str, Any]:
    """Rewrite one CSV with its 'Label' column forced to 1.

    Args:
        args: ``(input_file, output_dir)`` packed in a single tuple so the
            function can be used directly with ``executor.map``.

    Returns:
        A report dict that always carries 'file', 'status' and
        'rows_processed'. 'status' is one of 'success', 'skipped' (the
        "Adv_"-prefixed output already exists), 'warning' (no 'Label'
        column, nothing written) or 'failed' (an exception was raised).
        'warning' and 'failed' reports also carry an 'error' message.
    """
    input_file, output_dir = args

    def _report(status: str, rows: int = 0, **extra: Any) -> Dict[str, Any]:
        # Every outcome shares the same three base keys.
        return {'file': input_file, 'status': status, 'rows_processed': rows, **extra}

    try:
        # The result is written next to other outputs under an "Adv_" prefix.
        target_path = os.path.join(
            output_dir,
            f"Adv_{os.path.basename(input_file)}"
        )

        # An existing output means this file was handled by an earlier run.
        if os.path.exists(target_path):
            logging.info(f"Skipping {input_file} - output already exists")
            return _report('skipped')

        frame = pd.read_csv(input_file)

        # Without a 'Label' column there is nothing to relabel.
        if 'Label' not in frame.columns:
            logging.warning(f"'Label' column not found in {input_file}")
            return _report('warning', error='Label column not found')

        row_count = len(frame)
        frame['Label'] = 1
        frame.to_csv(target_path, index=False)
        return _report('success', rows=row_count)

    except Exception as e:
        logging.error(f"Failed to process {input_file}: {e}")
        return _report('failed', error=str(e))

def process_csv_directory(
    input_dir: str,
    output_dir: str,
    pattern: str = "*_Flow.csv",
    max_workers: int = None
) -> Dict[str, Any]:
    """Process all CSV files in a directory and update their labels.

    Args:
        input_dir: Folder that is globbed with ``pattern``.
        output_dir: Folder receiving the rewritten files; created if missing.
        pattern: Glob pattern selecting the input files.
        max_workers: Thread count passed to ThreadPoolExecutor (None lets the
            executor choose its default).

    Returns:
        A dict with the keys 'total_files', 'successful', 'failed', 'skipped',
        'warnings' and 'total_rows_processed'. The same keys are returned
        (all zero) when no input file is found.
    """

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # When input and output are the same folder, the "Adv_"-prefixed files
    # written by process_single_csv in an earlier run also match the pattern.
    # Treating them as inputs would produce "Adv_Adv_..." copies, so they are
    # left out of the work list in that case.
    shares_folder = (
        os.path.normcase(os.path.abspath(input_dir))
        == os.path.normcase(os.path.abspath(output_dir))
    )
    csv_files = [
        path for path in Path(input_dir).glob(pattern)
        if not (shares_folder and path.name.startswith("Adv_"))
    ]

    if not csv_files:
        logging.warning(f"No CSV files matching pattern '{pattern}' found in {input_dir}")
        return {
            'total_files': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'warnings': 0,
            'total_rows_processed': 0
        }

    # Process files in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        args = [(str(path), output_dir) for path in csv_files]
        results = list(tqdm(
            executor.map(process_single_csv, args),
            total=len(args),
            desc="Processing CSV files"
        ))

    def _count(status: str) -> int:
        # Number of per-file reports carrying the given status.
        return sum(1 for report in results if report['status'] == status)

    # Compile statistics
    stats = {
        'total_files': len(csv_files),
        'successful': _count('success'),
        'failed': _count('failed'),
        'skipped': _count('skipped'),
        'warnings': _count('warning'),
        'total_rows_processed': sum(report['rows_processed'] for report in results)
    }

    # Log results
    logging.info("\nProcessing Summary:")
    logging.info(f"Total files processed: {stats['total_files']}")
    logging.info(f"Successfully processed: {stats['successful']}")
    logging.info(f"Failed: {stats['failed']}")
    logging.info(f"Skipped: {stats['skipped']}")
    logging.info(f"Files with warnings: {stats['warnings']}")
    logging.info(f"Total rows processed: {stats['total_rows_processed']}")

    # Log failures if any
    failures = [report for report in results if report['status'] in ['failed', 'warning']]
    if failures:
        logging.info("\nFailures and Warnings:")
        for report in failures:
            logging.info(f"File: {report['file']}")
            logging.info(f"Error: {report.get('error', 'Unknown error')}")
            logging.info("---")

    return stats

def main():
    """Relabel every ``*_Flow.csv`` in the adversarial folder and log the outcome.

    The same folder is used as input and as output, so the "Adv_"-prefixed
    copies are written next to the files they were made from.
    """
    # Example usage
    adversarial_dir = "E:\\PFE2025\\Dataset\\Dataset_1\\CSV Flow by Flow Adversarial"

    # One worker thread per CPU core reported by the OS.
    stats = process_csv_directory(
        input_dir=adversarial_dir,
        output_dir=adversarial_dir,
        pattern="*_Flow.csv",
        max_workers=os.cpu_count(),
    )

    # Final one-line verdicts on top of the detailed summary already logged.
    had_failures = stats['failed'] > 0
    had_successes = stats['successful'] > 0
    if had_failures:
        logging.warning(f"⚠️ {stats['failed']} files failed to process")
    if had_successes:
        logging.info(f"✅ Successfully processed {stats['successful']} files")

# Entry point: run the label update when this code is executed as the main module.
if __name__ == "__main__":
    main()

# Fuse CSV files into one


In [3]:
import os
import pandas as pd

def merge_csv_files(input_folder, output_file):
    """
    Merges all CSV files in the specified folder into a single CSV file.

    Files are merged in sorted name order so the row order of the result is
    reproducible. If ``output_file`` itself lives in ``input_folder`` it is
    left out of the inputs, so re-running does not merge an earlier result
    back into itself. Files that contain only a header row contribute no rows
    and are not passed to ``pd.concat`` (unless every file is empty).

    :param input_folder: Path to the folder containing CSV files.
    :param output_file: Path to the output merged CSV file.
    :return: None. Prints a message when no CSV file is found.
    """
    output_abs = os.path.normcase(os.path.abspath(output_file))

    def _is_input(name):
        # A CSV in the folder that is not the merge target itself.
        if not name.endswith('.csv'):
            return False
        candidate = os.path.normcase(os.path.abspath(os.path.join(input_folder, name)))
        return candidate != output_abs

    # os.listdir gives no ordering guarantee; sort for a stable result.
    csv_files = sorted(name for name in os.listdir(input_folder) if _is_input(name))

    if not csv_files:
        print("No CSV files found in the directory.")
        return

    df_list = [pd.read_csv(os.path.join(input_folder, name)) for name in csv_files]

    # Concatenating empty frames is deprecated in pandas; they add no rows,
    # so drop them. Fall back to the full list when everything is empty so the
    # header of the result is still written.
    non_empty = [frame for frame in df_list if not frame.empty]
    merged_df = pd.concat(non_empty or df_list, ignore_index=True)

    merged_df.to_csv(output_file, index=False)
    print(f"Merged {len(csv_files)} CSV files into {output_file}")

# Example usage
# These statements run as soon as the cell is executed. Both paths are
# absolute Windows paths; the output file is placed in the parent folder of
# the input folder.
input_folder = "E:\\PFE2025\\Dataset\\Dataset_1\\CSV Flow by Flow Adversarial"  # Change this to your folder path
output_file = "E:\\PFE2025\\Dataset\\Dataset_1\\Merged_Adversarial_Flow.csv"  # Change this to your desired output file path
merge_csv_files(input_folder, output_file)


Merged 932 CSV files into E:\PFE2025\Dataset\Dataset_1\Merged_Adversarial_Flow.csv


  merged_df = pd.concat(df_list, ignore_index=True)


In [None]:
import os
import pandas as pd
from scapy.all import wrpcap, IP, TCP, UDP, ICMP
from scapy.utils import PcapReader
from tqdm import tqdm
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from scapy.packet import Packet
import logging

# Set up logging
# Configure logging at INFO level and create the module-level logger that the
# functions in this cell report through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Folder holding the source PCAP captures (raw string, so backslashes are literal).
# NOTE(review): main() in this cell sets its own pcap_directory and does not
# read this constant.
PCAP_DIR = r"E:\DAPT\pcap-data"

@dataclass
class Flow:
    """Data class to store flow information.

    One instance describes one flow row of the CSV. The field notes below
    describe how main() in this cell fills them in.
    """
    flow_id: str  # 'Flow ID' column
    src_ip: str  # 'Src IP' column
    dst_ip: str  # 'Dst IP' column
    src_port: int  # 'Src Port' column
    dst_port: int  # 'Dst Port' column
    protocol: int  # 'Protocol' column; compared against 6 (TCP), 17 (UDP) and 1 (ICMP)
    timestamp: float  # flow start as a Unix timestamp (output of convert_to_unix)
    duration: float  # 'Flow Duration' divided by 1e6, i.e. seconds
    fwd_packets: int  # 'Total Fwd Packet' column
    bwd_packets: int  # 'Total Bwd packets' column
    pcap_file: str  # 'source_file' value with '_Flow.csv' removed

@dataclass
class FlowStats:
    """Data class to track flow packet statistics.

    Holds the forward/backward packet counters of one flow together with the
    packets collected for it.
    """
    fwd_count: int = 0  # packets seen in the flow's forward direction
    bwd_count: int = 0  # packets seen in the flow's backward direction
    packets: Optional[List[Packet]] = None  # collected packets; a fresh list when omitted

    def __post_init__(self):
        # A list default cannot be written directly on the field (it would be
        # shared), so the empty list is created here. It is only created when
        # the caller did not pass one, so an explicitly supplied list is kept.
        if self.packets is None:
            self.packets = []

def convert_to_unix(timestamp_str: str) -> float:
    """Convert timestamp string to Unix timestamp.

    Args:
        timestamp_str: Text in the form "DD/MM/YYYY hh:mm:ss AM/PM".

    Returns:
        Seconds since the epoch as a float, or NaN when the value cannot be
        converted. The parsed datetime is naive, so it is interpreted in the
        local time zone of the machine running the code.
    """
    try:
        dt = datetime.strptime(timestamp_str, "%d/%m/%Y %I:%M:%S %p")
        return dt.timestamp()
    except (ValueError, TypeError, OverflowError, OSError) as e:
        # ValueError: text does not match the format.
        # TypeError: the value is not a string at all (e.g. NaN for a missing
        # CSV cell); without this the whole run would abort on one bad row.
        # OverflowError/OSError: the date is outside the platform's range.
        logger.error(f"Invalid timestamp format: {timestamp_str}, Error: {e}")
        return np.nan

def is_packet_match(
    pkt: Packet,
    flow: Flow,
    pkt_time: float
) -> Tuple[bool, Optional[bool]]:
    """
    Check if packet matches flow criteria.

    A packet matches when it carries IP plus TCP/UDP/ICMP, falls inside the
    flow's time window, carries the flow's protocol layer, and its endpoints
    equal the flow's endpoints in either direction. For TCP and UDP flows an
    endpoint is the (IP, port) pair; for other protocols it is the IP alone.

    Returns (is_match, is_forward) tuple, where is_forward is None if no match.
    Any exception raised while inspecting the packet is logged and reported
    as "no match".
    """
    try:
        if not (IP in pkt and (TCP in pkt or UDP in pkt or ICMP in pkt)):
            return False, None

        # Time check
        if not (flow.timestamp <= pkt_time <= flow.timestamp + flow.duration):
            return False, None

        # Protocol check
        if flow.protocol == 6 and TCP not in pkt:
            return False, None
        elif flow.protocol == 17 and UDP not in pkt:
            return False, None
        elif flow.protocol == 1 and ICMP not in pkt:
            return False, None

        ip_layer = pkt[IP]

        if flow.protocol in (6, 17):
            # Pick the layer named by the flow's protocol (its presence was
            # verified above) rather than whichever layer happens to exist.
            transport_layer = pkt[TCP] if flow.protocol == 6 else pkt[UDP]
            pkt_src = (ip_layer.src, transport_layer.sport)
            pkt_dst = (ip_layer.dst, transport_layer.dport)
            flow_src = (flow.src_ip, flow.src_port)
            flow_dst = (flow.dst_ip, flow.dst_port)
        else:
            # No ports to compare: endpoints are the addresses alone.
            pkt_src, pkt_dst = ip_layer.src, ip_layer.dst
            flow_src, flow_dst = flow.src_ip, flow.dst_ip

        # Direction is decided on the complete endpoint. Deciding it on the
        # IPs alone would reject reverse packets of a flow whose source and
        # destination address are the same.
        if pkt_src == flow_src and pkt_dst == flow_dst:
            return True, True
        if pkt_src == flow_dst and pkt_dst == flow_src:
            return True, False
        return False, None

    except Exception as e:
        logger.error(f"Error matching packet: {e}")
        return False, None

def process_pcap_file(args: Tuple[str, List[Flow], str, str]) -> Dict[str, FlowStats]:
    """Process a single PCAP file and extract matching flow packets.

    Args:
        args: ``(pcap_filename, flows, pcap_directory, extracted_directory)``
            packed in one tuple so the function can be used with
            ``executor.map``.

    Returns:
        One FlowStats per flow, keyed by "<flow_id>_<int(timestamp)>" (the
        stem of the flow's output file; a numeric suffix is appended if two
        flows would otherwise share a stem). An empty dict is returned when
        the PCAP file does not exist. If reading the PCAP fails midway, the
        statistics gathered so far are returned and nothing is written.

    Each flow with at least one matching packet is written to
    ``flow_<key>.pcap`` inside ``extracted_directory``.
    """
    pcap_filename, flows, pcap_directory, extracted_directory = args
    pcap_path = os.path.join(pcap_directory, pcap_filename)

    if not os.path.exists(pcap_path):
        logger.error(f"PCAP file not found: {pcap_path}")
        return {}

    # Give every flow its own FlowStats. Flow IDs repeat in the CSV (the same
    # 5-tuple can appear with several start times), so keying by flow_id alone
    # would pool the packets of distinct flows into one list.
    flow_stats: Dict[str, FlowStats] = {}
    tracked = []  # (flow, key, stats) triples, in input order
    for flow in flows:
        base_key = f"{flow.flow_id}_{int(flow.timestamp)}"
        key, attempt = base_key, 1
        while key in flow_stats:
            attempt += 1
            key = f"{base_key}_{attempt}"
        flow_stats[key] = FlowStats()
        tracked.append((flow, key, flow_stats[key]))

    try:
        with PcapReader(pcap_path) as pcap_reader:
            for pkt in pcap_reader:
                pkt_time = float(pkt.time)

                # Check packet against each flow
                for flow, _key, stats in tracked:
                    is_match, is_forward = is_packet_match(pkt, flow, pkt_time)
                    if not is_match:
                        continue
                    if is_forward:
                        stats.fwd_count += 1
                    else:
                        stats.bwd_count += 1
                    stats.packets.append(pkt)

    except Exception as e:
        logger.error(f"Error reading packets from {pcap_path}: {e}")
        return flow_stats

    # Save extracted packets
    for flow, key, stats in tracked:
        if not stats.packets:
            continue
        output_path = os.path.join(extracted_directory, f"flow_{key}.pcap")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        try:
            wrpcap(output_path, stats.packets)
            logger.info(
                f"Saved {len(stats.packets)} packets for flow {flow.flow_id}"
                f" (Fwd: {stats.fwd_count}, Bwd: {stats.bwd_count})"
            )
        except Exception as e:
            logger.error(f"Error saving packets for flow {flow.flow_id}: {e}")

    return flow_stats

def main():
    """Load the combined flow CSV and extract each flow's packets from its PCAP.

    Steps: read the CSV with explicit dtypes, convert 'Timestamp' to Unix time
    and 'Flow Duration' to seconds, drop rows where either is missing, group
    the flows by source PCAP, process the PCAPs on a thread pool and log the
    totals. Any exception is logged instead of being raised.
    """
    # Configuration
    csv_file = "E:\\Dataset\\Dataset_1\\combined_output.csv"
    pcap_directory = "E:\\DAPT\\pcap-data\\"
    extracted_directory = "E:\\Dataset\\Dataset_1\\extracted\\"

    # Explicit dtypes for the columns that are read below.
    column_dtypes = {
        'Flow ID': str,
        'Src IP': str,
        'Dst IP': str,
        'Src Port': np.int32,
        'Dst Port': np.int32,
        'Protocol': np.int8,
        'Flow Duration': np.float64,
        'Total Fwd Packet': np.int32,
        'Total Bwd packets': np.int32,
        'source_file': 'category'
    }

    try:
        flows_df = pd.read_csv(csv_file, dtype=column_dtypes)
        logger.info(f"Loaded {len(flows_df)} flows from CSV")

        # Timestamps become Unix seconds, durations go from microseconds to
        # seconds; rows that end up without either value are discarded.
        flows_df['Timestamp'] = flows_df['Timestamp'].apply(convert_to_unix)
        flows_df['Flow Duration'] = flows_df['Flow Duration'] / 1e6
        flows_df = flows_df.dropna(subset=['Timestamp', 'Flow Duration'])

        # Group the flows by the PCAP they came from.
        flows_by_pcap: Dict[str, List[Flow]] = {}
        for _, record in flows_df.iterrows():
            pcap_name = record['source_file'].replace('_Flow.csv', '')
            new_flow = Flow(
                flow_id=record['Flow ID'],
                src_ip=record['Src IP'],
                dst_ip=record['Dst IP'],
                src_port=record['Src Port'],
                dst_port=record['Dst Port'],
                protocol=record['Protocol'],
                timestamp=record['Timestamp'],
                duration=record['Flow Duration'],
                fwd_packets=record['Total Fwd Packet'],
                bwd_packets=record['Total Bwd packets'],
                pcap_file=pcap_name
            )
            flows_by_pcap.setdefault(pcap_name, []).append(new_flow)

        # One job per PCAP file, executed on a thread pool.
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            jobs = [
                (pcap_name, pcap_flows, pcap_directory, extracted_directory)
                for pcap_name, pcap_flows in flows_by_pcap.items()
            ]
            per_file_stats = list(tqdm(
                executor.map(process_pcap_file, jobs),
                total=len(jobs),
                desc="Processing PCAP files"
            ))

        # Only flows for which at least one packet was found are counted.
        extracted = [
            stats
            for file_stats in per_file_stats
            for stats in file_stats.values()
            if stats.packets
        ]
        total_flows = len(extracted)
        total_packets = sum(len(stats.packets) for stats in extracted)

        logger.info(f"\nExtraction complete:")
        logger.info(f"Total flows processed: {total_flows}")
        logger.info(f"Total packets extracted: {total_packets}")

    except Exception as e:
        logger.error(f"Error in main processing: {e}")

# Entry point: start the extraction when this code is executed as the main module.
if __name__ == "__main__":
    main()

INFO:__main__:Loaded 22979 flows from CSV
Processing PCAP files:   0%|          | 0/7 [00:00<?, ?it/s]

In [1]:
import pandas as pd
from scapy.all import PcapReader, wrpcap
from scapy.layers.inet import IP, TCP, UDP
from datetime import datetime
import os

def parse_timestamp(timestamp_str):
    """Parse "DD/MM/YYYY hh:mm:ss AM/PM" text into seconds since the epoch.

    Returns a float. Text that does not match the format makes ``strptime``
    raise ValueError, which is not caught here.
    """
    csv_time_format = "%d/%m/%Y %I:%M:%S %p"
    return datetime.strptime(timestamp_str, csv_time_format).timestamp()

def packet_matches_flow(packet, flow_info):
    """
    Check if a packet matches all flow constraints:
    - Protocol (TCP = 6, UDP = 17; compared as text with flow_info['Protocol'])
    - Source/Destination IPs
    - Source/Destination Ports

    The address and the port are compared together as one endpoint, so a
    packet matches only when it runs source->destination or
    destination->source of the flow. Checking IPs and ports independently
    would also accept a packet whose IPs run forward while its ports run
    backward.

    Returns True on a match, False otherwise (including packets without an
    IP layer or without TCP/UDP).
    """
    if IP not in packet:
        return False

    # Select the transport layer and its protocol number in one step.
    if TCP in packet:
        transport, packet_proto = packet[TCP], '6'
    elif UDP in packet:
        transport, packet_proto = packet[UDP], '17'
    else:
        return False

    # Basic protocol check
    if str(packet_proto) != str(flow_info['Protocol']):
        return False

    flow_src = (flow_info['Src IP'], int(flow_info['Src Port']))
    flow_dst = (flow_info['Dst IP'], int(flow_info['Dst Port']))
    packet_src = (packet[IP].src, transport.sport)
    packet_dst = (packet[IP].dst, transport.dport)

    is_forward = packet_src == flow_src and packet_dst == flow_dst
    is_backward = packet_src == flow_dst and packet_dst == flow_src
    return is_forward or is_backward

def analyze_flow(flow_row, pcap_base_dir):
    """
    Analyze a single flow:
    1. Extract time window information
    2. Read corresponding PCAP file
    3. Match packets based on time and constraints

    Args:
        flow_row: One CSV row (pandas Series). The PCAP name is taken from the
            row's last column with '_Flow.csv' removed.
        pcap_base_dir: Folder containing the PCAP files.

    Returns:
        The list of matching packets (empty when the PCAP is missing or
        nothing matched). Matching packets are also written to
        ``matched_<Flow ID>_<int(start)>.pcap`` in the current directory. The
        start time is part of the name because Flow IDs repeat in the CSV and
        a name built from the ID alone would be overwritten by the next flow
        with the same ID.
    """
    flow_id = flow_row['Flow ID']
    print(f"\nProcessing flow: {flow_id}")

    # Convert timestamp and duration to UNIX timestamp format
    flow_start = parse_timestamp(flow_row['Timestamp'])  # In seconds (float)
    flow_duration = float(flow_row['Flow Duration']) / 1_000_000  # Convert microseconds to seconds
    flow_end = flow_start + flow_duration  # End time in UNIX timestamp

    print(f"Time window: {flow_start} to {flow_end} (UNIX timestamp)")

    # Get PCAP filename from the last column
    pcap_filename = flow_row.iloc[-1]
    pcap_path = os.path.join(pcap_base_dir, pcap_filename.replace('_Flow.csv', ''))

    if not os.path.exists(pcap_path):
        print(f"Warning: PCAP file not found: {pcap_path}")
        return []

    matching_packets = []
    forward_packets = 0
    backward_packets = 0

    try:
        print(f"Reading PCAP file: {pcap_path}")
        with PcapReader(pcap_path) as pcap_reader:
            for packet in pcap_reader:
                packet_time = float(packet.time)  # Already in UNIX timestamp format

                # Cheap time-window test first, then the 5-tuple test.
                if not (flow_start <= packet_time <= flow_end):
                    continue
                if not packet_matches_flow(packet, flow_row):
                    continue

                matching_packets.append(packet)

                # Count forward/backward packets
                if IP in packet and packet[IP].src == flow_row['Src IP']:
                    forward_packets += 1
                else:
                    backward_packets += 1

        # Write matching packets to output file
        if matching_packets:
            output_file = f"matched_{flow_id}_{int(flow_start)}.pcap"
            wrpcap(output_file, matching_packets)
            print(f"Created {output_file} with {len(matching_packets)} packets")

            # Validate packet counts
            print("\nPacket count validation:")
            print(f"Forward packets found: {forward_packets} (Expected: {flow_row['Total Fwd Packet']})")
            print(f"Backward packets found: {backward_packets} (Expected: {flow_row['Total Bwd packets']})")

            if forward_packets != int(flow_row['Total Fwd Packet']) or \
               backward_packets != int(flow_row['Total Bwd packets']):
                print("Warning: Packet count mismatch!")
        else:
            print("No matching packets found for this flow")

    except Exception as e:
        print(f"Error processing PCAP file: {str(e)}")

    return matching_packets

def main(csv_file, pcap_base_dir):
    """Run analyze_flow on every row of ``csv_file``.

    Returns a dict mapping each row's 'Flow ID' to the packets matched for
    it; rows sharing a Flow ID overwrite each other's entry.
    """
    print(f"Reading flows from {csv_file}")
    flows_df = pd.read_csv(csv_file)

    packets_by_flow = {}
    for _, flow_row in flows_df.iterrows():
        packets_by_flow[flow_row['Flow ID']] = analyze_flow(flow_row, pcap_base_dir)

    # A flow counts as an output file when at least one packet matched.
    files_created = sum(1 for packets in packets_by_flow.values() if packets)

    # Print summary
    print("\nProcessing complete!")
    print(f"Processed {len(flows_df)} flows")
    print(f"Created {files_created} output PCAP files")

    return packets_by_flow

# Entry point: match every flow of the combined CSV against the PCAP folder,
# using the hard-coded Windows paths below.
if __name__ == "__main__":
    # Example usage
    csv_file ="E:\\PFE2025\\Dataset\\Dataset_1\\combined_output.csv"
    pcap_base_dir = "E:\\DAPT\\pcap-data"
    matching_packets = main(csv_file, pcap_base_dir)


Reading flows from E:\PFE2025\Dataset\Dataset_1\combined_output.csv

Processing flow: 192.168.3.29-209.147.138.38-9000-56646-6
Time window: 1563456127.0 to 1563456133.878549 (UNIX timestamp)
Reading PCAP file: E:\DAPT\pcap-data\enp0s3-public-thursday.pcap
No matching packets found for this flow

Processing flow: 192.168.3.29-209.147.138.38-9000-56646-6
Time window: 1563456134.0 to 1563456134.000017 (UNIX timestamp)
Reading PCAP file: E:\DAPT\pcap-data\enp0s3-public-thursday.pcap
No matching packets found for this flow

Processing flow: 192.168.3.29-209.147.138.38-9000-56656-6
Time window: 1563456129.0 to 1563456134.044341 (UNIX timestamp)
Reading PCAP file: E:\DAPT\pcap-data\enp0s3-public-thursday.pcap
No matching packets found for this flow

Processing flow: 192.168.3.29-209.147.138.38-9000-56653-6
Time window: 1563456129.0 to 1563456134.049017 (UNIX timestamp)
Reading PCAP file: E:\DAPT\pcap-data\enp0s3-public-thursday.pcap
No matching packets found for this flow

Processing flow: 19

KeyboardInterrupt: 

In [None]:
import csv
from datetime import datetime, timedelta
import os
from scapy.all import *
from scapy.utils import PcapWriter
from tqdm import tqdm

# Folder containing the source PCAP files; process_flow() joins each row's
# PCAP file name onto it.
PCAP_DIR = "E:\\DAPT\\pcap-data\\"

def sanitize_filename(name):
    """Return ``name`` with every character that is not alphanumeric, '_' or
    '-' replaced by an underscore, so it can be used inside a file name."""
    cleaned = []
    for ch in name:
        allowed = ch.isalnum() or ch in ('_', '-')
        cleaned.append(ch if allowed else '_')
    return "".join(cleaned)

def process_flow(row, output_dir):
    """Extract the packets of one CSV flow row into its own PCAP file.

    Args:
        row: One CSV row as a list, read by position: 0 Flow ID, 1 source IP,
            2 source port, 3 destination IP, 4 destination port, 5 protocol,
            6 timestamp ("DD/MM/YYYY hh:mm:ss AM/PM"), 7 flow duration in
            microseconds, last column the flow CSV's file name.
        output_dir: Folder for the generated PCAP; created if missing.

    Returns:
        Path of the written PCAP, or None when the source PCAP is missing, no
        packet matched, or an error occurred (the error is printed).
    """
    # Bound before the try so the error handler can always name the flow,
    # even when reading row[0] is what failed (e.g. an empty row).
    flow_id = "<unknown>"
    try:
        # Extract relevant fields from CSV row
        flow_id = row[0]
        src_ip = row[1]
        src_port = int(row[2])
        dst_ip = row[3]
        dst_port = int(row[4])
        protocol = int(row[5])
        timestamp_str = row[6]
        flow_duration = float(row[7])  # in microseconds

        # The last column names the flow CSV ("<pcap name>_Flow.csv"); strip
        # that suffix only when it is actually there instead of blindly
        # cutting nine characters.
        pcap_name = row[-1]
        csv_suffix = "_Flow.csv"
        if pcap_name.endswith(csv_suffix):
            pcap_name = pcap_name[:-len(csv_suffix)]
        pcap_file = os.path.join(PCAP_DIR, pcap_name)

        # Parse timestamp and adjust for 1-hour shift:
        # subtract one hour to compensate for PCAP being 1 hour behind
        datetime_obj = datetime.strptime(timestamp_str, "%d/%m/%Y %I:%M:%S %p")
        datetime_obj = datetime_obj - timedelta(hours=1)
        start_time = datetime_obj.timestamp()

        # Convert flow duration to seconds
        duration_seconds = flow_duration / 1000000.0  # Convert microseconds to seconds
        end_time = start_time + duration_seconds

        # Check if we need to validate ports
        check_ports = protocol in (6, 17)  # TCP or UDP

        # Read pcap file
        if not os.path.exists(pcap_file):
            print(f"PCAP file not found: {pcap_file}")
            return None

        matching_packets = []
        with PcapReader(pcap_file) as pcap_reader:
            for pkt in tqdm(pcap_reader, desc=f"Processing packets", leave=False):
                try:
                    # Check timestamp first for efficiency
                    pkt_time = pkt.time
                    # Now the PCAP timestamp should align with our adjusted CSV timestamp
                    if not (start_time <= pkt_time <= end_time):
                        continue

                    # Check IP layer
                    if not pkt.haslayer(IP):
                        continue
                    ip = pkt[IP]

                    # Check protocol
                    if ip.proto != protocol:
                        continue

                    # Check IP addresses (both directions)
                    if not ((ip.src == src_ip and ip.dst == dst_ip) or
                            (ip.src == dst_ip and ip.dst == src_ip)):
                        continue

                    # Check ports if applicable
                    if check_ports:
                        if protocol == 6 and pkt.haslayer(TCP):
                            tcp = pkt[TCP]
                            sport = tcp.sport
                            dport = tcp.dport
                        elif protocol == 17 and pkt.haslayer(UDP):
                            udp = pkt[UDP]
                            sport = udp.sport
                            dport = udp.dport
                        else:
                            continue  # Protocol mismatch

                        # Check ports (both directions)
                        if not ((sport == src_port and dport == dst_port) or
                                (sport == dst_port and dport == src_port)):
                            continue

                    matching_packets.append(pkt)
                except Exception:
                    # Best effort: a packet that cannot be inspected is skipped.
                    continue

        # Create output directory if needed
        os.makedirs(output_dir, exist_ok=True)

        # Write matching packets to new pcap
        if matching_packets:
            sanitized_id = sanitize_filename(flow_id)
            # Flow IDs repeat across rows; the start time keeps one flow's
            # file from being overwritten by the next flow with the same ID.
            output_file = os.path.join(
                output_dir, f"flow_{sanitized_id}_{int(start_time)}.pcap"
            )
            wrpcap(output_file, matching_packets)
            return output_file
        return None

    except Exception as e:
        print(f"Error processing flow {flow_id}: {str(e)}")
        return None

def main(csv_path, output_dir="output_pcaps"):
    """Run process_flow on every data row of ``csv_path`` and list the results.

    Args:
        csv_path: CSV file whose first line is a header.
        output_dir: Folder passed to process_flow for the generated PCAPs.

    The file is read twice: once to count the rows for the progress bar, once
    to process them. Blank rows are ignored. Prints the paths of the PCAP
    files that were generated.
    """
    results = []
    # newline='' is what the csv module requires of files handed to
    # csv.reader, so quoted fields containing line breaks are parsed correctly.
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header (tolerates a completely empty file)
        total_rows = sum(1 for row in reader if row)

        # Rewind and start over with a fresh reader for the real pass.
        f.seek(0)
        reader = csv.reader(f)
        next(reader, None)  # Skip header again

        data_rows = (row for row in reader if row)
        for row in tqdm(data_rows, total=total_rows, desc="Processing flows"):
            result = process_flow(row, output_dir)
            if result:
                results.append(result)

    print("\nProcessing complete. Generated PCAP files:")
    for res in results:
        print(f"- {res}")

# Entry point: process the combined flow CSV with the default output folder.
if __name__ == "__main__":

    csv_path = "E:\\PFE2025\\Dataset\\Dataset_1\\combined_output.csv"
    
    main(csv_path)

Processing flows:   0%|          | 0/39215 [00:00<?, ?it/s]
Processing packets: 0it [00:00, ?it/s]
Processing packets: 546it [00:00, 5278.70it/s]
Processing packets: 1167it [00:00, 5742.22it/s]
Processing packets: 1906it [00:00, 6462.84it/s]
Processing packets: 2554it [00:00, 6457.23it/s]
Processing packets: 3201it [00:00, 6279.36it/s]
Processing packets: 3917it [00:00, 6488.63it/s]
Processing packets: 4567it [00:00, 6395.27it/s]
Processing packets: 5304it [00:00, 6521.26it/s]
Processing packets: 5957it [00:00, 6387.93it/s]
Processing packets: 6687it [00:01, 6415.50it/s]
Processing packets: 7436it [00:01, 6721.75it/s]
Processing packets: 8110it [00:01, 6403.70it/s]
Processing packets: 8783it [00:01, 6296.44it/s]
Processing packets: 9449it [00:01, 6352.02it/s]
Processing packets: 10087it [00:01, 6316.45it/s]
Processing packets: 10843it [00:01, 6420.36it/s]
Processing packets: 11486it [00:01, 6375.39it/s]
Processing packets: 12124it [00:01, 6371.71it/s]
Processing packets: 12792it [00:02

KeyboardInterrupt: 


Starting packet extraction process...
Reading CSV file...
Reading PCAP file...
Total packets in PCAP: 1566798

Processing packets...


Analyzing packets:   1%|          | 9170/1566798 [17:05<48:23:20,  8.94packet/s] 


KeyboardInterrupt: 

# CSV Matcher


In [3]:
import pandas as pd
import ipaddress
import os

def normalize_flow(row):
    """Normalize flow direction by always putting the lower IP address as source.

    Addresses are compared numerically. When both addresses are equal, the
    lower port is put first, so both directions of a same-host flow normalize
    to the same result. When the addresses (or ports) cannot be parsed or
    compared, the row's values are returned unchanged.

    Args:
        row: Mapping or Series with 'Src IP', 'Dst IP', 'Src Port',
            'Dst Port' and 'Protocol'.

    Returns:
        A dict with those five keys in normalized direction. Addresses that
        parsed successfully are returned in their canonical text form.
    """
    src_ip, dst_ip = row['Src IP'], row['Dst IP']
    src_port, dst_port = row['Src Port'], row['Dst Port']

    try:
        src_addr = ipaddress.ip_address(src_ip)
        dst_addr = ipaddress.ip_address(dst_ip)
        if src_addr == dst_addr:
            # Same host on both ends: fall back to the ports for ordering.
            swap = src_port > dst_port
        else:
            swap = src_addr > dst_addr
        # Use the canonical text in both branches so equal addresses written
        # differently produce equal results.
        src_ip, dst_ip = str(src_addr), str(dst_addr)
    except (ValueError, TypeError):
        # ValueError: not a valid address. TypeError: values that cannot be
        # ordered (e.g. IPv4 against IPv6). Leave the row as it is.
        swap = False

    if swap:
        src_ip, dst_ip = dst_ip, src_ip
        src_port, dst_port = dst_port, src_port

    return {
        'Src IP': src_ip,
        'Dst IP': dst_ip,
        'Src Port': src_port,
        'Dst Port': dst_port,
        'Protocol': row['Protocol']
    }

def create_flow_key(row):
    """Build the text key of a flow from its direction-normalized 5-tuple.

    The key has the form "<src ip>_<src port>_<dst ip>_<dst port>_<protocol>".
    """
    normalized = normalize_flow(row)
    key_fields = ('Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol')
    return "_".join(f"{normalized[field]}" for field in key_fields)

def aggregate_flows(df):
    """Collapse rows that belong to the same flow into one row per flow key.

    Side effect: a 'flow_key' column is added to (or overwritten on) the
    DataFrame that was passed in.

    Returns a new DataFrame with one row per 'flow_key', carrying the first
    'Src IP', 'Dst IP', 'Src Port', 'Dst Port' and 'Protocol' seen for it.
    """
    # Tag every row with its normalized 5-tuple key.
    df['flow_key'] = df.apply(create_flow_key, axis=1)

    # Keep the first value of each 5-tuple column within a key group.
    tuple_columns = ['Src IP', 'Dst IP', 'Src Port', 'Dst Port', 'Protocol']
    first_per_column = {column: 'first' for column in tuple_columns}
    per_flow = df.groupby('flow_key').agg(first_per_column)

    return per_flow.reset_index()

def find_matching_flows(csv1_path, csv2_path, output_dir):
    """Find matching flows between two CSV files and save detailed output.

    Args:
        csv1_path: First flow CSV ("version 1").
        csv2_path: Second flow CSV ("version 2").
        output_dir: Folder for the three result files; created if missing.

    Returns:
        DataFrame of the flow keys present in both files, with the aggregated
        5-tuple columns of each side suffixed '_v1' / '_v2'.

    Raises:
        ValueError: if one of 'Src IP', 'Dst IP', 'Src Port', 'Dst Port',
            'Protocol' is missing from either file.

    Writes matching_statistics.txt, matching_flows_version1.csv and
    matching_flows_version2.csv into ``output_dir``. The match percentage is
    relative to the smaller of the two flow counts and is reported as 0 when
    either side has no flows.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read CSV files
    print(f"Reading CSV files...")
    df1 = pd.read_csv(csv1_path)
    df2 = pd.read_csv(csv2_path)

    print(f"File 1 shape: {df1.shape}")
    print(f"File 2 shape: {df2.shape}")

    # Ensure column names are consistent
    required_columns = ['Src IP', 'Dst IP', 'Src Port', 'Dst Port', 'Protocol']
    for col in required_columns:
        if col not in df1.columns or col not in df2.columns:
            raise ValueError(f"Missing required column: {col}")

    # Aggregate split flows in both dataframes
    print("Aggregating flows...")
    df1_agg = aggregate_flows(df1)
    df2_agg = aggregate_flows(df2)

    # Find matching flows
    print("Finding matching flows...")
    matching_flows = pd.merge(
        df1_agg,
        df2_agg,
        on=['flow_key'],
        suffixes=('_v1', '_v2')
    )

    # Count statistics
    total_flows_v1 = len(df1_agg)
    total_flows_v2 = len(df2_agg)
    matching_count = len(matching_flows)

    # Computed once and guarded: with no flows on one side the division
    # would raise ZeroDivisionError.
    smaller_total = min(total_flows_v1, total_flows_v2)
    match_percentage = (matching_count / smaller_total) * 100 if smaller_total else 0.0

    # Save detailed outputs
    print("\nSaving output files...")

    # 1. Save statistics to a text file
    stats_path = os.path.join(output_dir, "matching_statistics.txt")
    with open(stats_path, 'w') as f:
        f.write("Flow Matching Statistics\n")
        f.write("=======================\n\n")
        f.write(f"CSV File 1: {csv1_path}\n")
        f.write(f"CSV File 2: {csv2_path}\n\n")
        f.write(f"Total flows in version 1: {total_flows_v1}\n")
        f.write(f"Total flows in version 2: {total_flows_v2}\n")
        f.write(f"Matching flows: {matching_count}\n")
        f.write(f"Match percentage: {match_percentage:.2f}%\n")

    # 2. Save matching flows data
    # Get original rows from both files that match. The per-row keys are only
    # computed here if the aggregation step has not already left them on the
    # frame, which avoids a second full pass over every row.
    for frame in (df1, df2):
        if 'flow_key' not in frame.columns:
            frame['flow_key'] = frame.apply(create_flow_key, axis=1)

    matched_keys = matching_flows['flow_key']
    matching_flows_v1 = df1[df1['flow_key'].isin(matched_keys)]
    matching_flows_v2 = df2[df2['flow_key'].isin(matched_keys)]

    # Save the matching flows
    matching_flows_v1[required_columns].to_csv(
        os.path.join(output_dir, "matching_flows_version1.csv"),
        index=False
    )
    matching_flows_v2[required_columns].to_csv(
        os.path.join(output_dir, "matching_flows_version2.csv"),
        index=False
    )

    # Print summary
    print(f"\nFlow Matching Statistics:")
    print(f"Total flows in version 1: {total_flows_v1}")
    print(f"Total flows in version 2: {total_flows_v2}")
    print(f"Matching flows: {matching_count}")
    print(f"Match percentage: {match_percentage:.2f}%")

    print(f"\nOutput files saved in: {output_dir}")
    print(f"1. Statistics: matching_statistics.txt")
    print(f"2. Original matching rows from version 1: matching_flows_version1.csv")
    print(f"3. Original matching rows from version 2: matching_flows_version2.csv")

    return matching_flows

def main():
    """Compare the two flow CSVs of the Tuesday capture and show a sample.

    Both inputs and all outputs live in the same folder. Any exception from
    the comparison is printed rather than raised.
    """
    # Example usage
    flow_dir = "/media/brahim/New Volume/PFE2025/Hamza/enp0s3-public-tuesday.pcap_Flow/"
    csv1_path = flow_dir + "enp0s3-public-tuesday.pcap_Flow_non_benign.csv"
    csv2_path = flow_dir + "extracted_MALICIOUS_packets.pcap_Flow.csv"
    output_dir = flow_dir

    try:
        matched = find_matching_flows(csv1_path, csv2_path, output_dir)
    except Exception as e:
        print(f"Error processing files: {str(e)}")
        return

    # Display sample of matching flows
    try:
        print("\nSample of matching flows:")
        print(matched.head())
    except Exception as e:
        print(f"Error processing files: {str(e)}")

# Entry point: run the flow comparison when this code is executed as the main module.
if __name__ == "__main__":
    main()


Reading CSV files...
File 1 shape: (11865, 84)
File 2 shape: (13752, 84)
Aggregating flows...
Finding matching flows...

Saving output files...

Flow Matching Statistics:
Total flows in version 1: 6320
Total flows in version 2: 6296
Matching flows: 6296
Match percentage: 100.00%

Output files saved in: /media/brahim/New Volume/PFE2025/Hamza/enp0s3-public-tuesday.pcap_Flow/
1. Statistics: matching_statistics.txt
2. Original matching rows from version 1: matching_flows_version1.csv
3. Original matching rows from version 2: matching_flows_version2.csv

Sample of matching flows:
                                  flow_key      Src IP_v1     Dst IP_v1  \
0  184.98.36.245_44662_192.168.3.29_9000_6  184.98.36.245  192.168.3.29   
1  184.98.36.245_44663_192.168.3.29_9001_6  184.98.36.245  192.168.3.29   
2  184.98.36.245_44664_192.168.3.29_9002_6  184.98.36.245  192.168.3.29   
3    184.98.36.245_44665_192.168.3.29_80_6  184.98.36.245  192.168.3.29   
4  184.98.36.245_44688_192.168.3.29_9001_6 