In [23]:
from scapy.all import rdpcap
import pandas as pd

input_pcap_path = '../../data_processed/vc_200/alexa/total3/10/10.pcap'  # Replace with your actual path

# Load the whole capture into memory with scapy
packets = rdpcap(input_pcap_path)

# One entry per packet: its capture timestamp and its length as reported by len()
times = list(map(lambda pkt: pkt.time, packets))
sizes = list(map(len, packets))

# Re-base the timestamps so that the first packet sits at t = 0
times = [stamp - times[0] for stamp in times]

# Two-column table: relative time and packet size
df = pd.DataFrame(dict(Time=times, Size=sizes))

# Show the table
print(df)


          Time  Size
0     0.000000   462
1     0.000653  1438
2     0.000913  1438
3     0.001166  1438
4     0.001418  1438
..         ...   ...
629  26.582104   158
630  26.582524   206
631  26.693961   158
632  28.291272   142
633  28.346655   158

[634 rows x 2 columns]


In [26]:
import pandas as pd

# Assuming 'df' is the DataFrame from the previous step, with a 'Time' column
# (seconds relative to the first packet) and a 'Size' column.

# Width of each resampling bin (0.2 seconds in this example).
# 'ms' is the milliseconds alias; the older 'L' spelling is deprecated in pandas >= 2.2.
interval = '200ms'

# Build the resampled frame as a NEW object instead of modifying 'df' in place.
# Mutating 'df' (replacing 'Time', then moving it into the index) made this cell
# fail with KeyError: 'Time' when run a second time.
resampled_df = (
    df
    # Cast to float first, then to timedelta so that the column can be resampled
    .assign(Time=pd.to_timedelta(df['Time'].astype(float), unit='s'))
    .set_index('Time')
    # Sum up the sizes that fall into each interval
    .resample(interval)
    .sum()
    # Turn the Time index back into a regular column
    .reset_index()
)

# Convert the bin labels back from timedelta to seconds
resampled_df['Time'] = resampled_df['Time'].dt.total_seconds()

# Specify the output CSV file path
output_csv_path = 'output.csv'  # Replace with your desired output CSV file path

# Save to CSV
resampled_df.to_csv(output_csv_path, index=False)

print(f"Resampled data saved to {output_csv_path}")

Resampled data saved to output.csv


In [27]:
from scapy.all import rdpcap
import pandas as pd

def pcap_to_resampled_csv(input_pcap_path, output_csv_path, resample_interval='200ms'):
    """Read a pcap file, sum packet sizes per time bin, and save the result as CSV.

    Packet timestamps are shifted so that the first packet is at 0, the packet
    sizes (``len(pkt)``) are summed inside consecutive bins of width
    ``resample_interval``, and the result is written as a CSV with the columns
    ``Time`` (bin label, in seconds) and ``Size`` (summed sizes), without the
    index column.

    Args:
        input_pcap_path: Path of the pcap file, passed to scapy's ``rdpcap``.
        output_csv_path: Path of the CSV file to write.
        resample_interval: pandas frequency string giving the bin width.
            Defaults to ``'200ms'``; this replaces the former ``'200L'``
            default because the ``'L'`` alias is deprecated in pandas >= 2.2.
            Both spell the same 200 millisecond interval.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    # Read packets from the pcap file
    packets = rdpcap(input_pcap_path)

    # Extract packet times and sizes
    times = [pkt.time for pkt in packets]
    sizes = [len(pkt) for pkt in packets]

    # Normalize the time to start from 0 by subtracting the first timestamp
    times = [t - times[0] for t in times]

    # Create a DataFrame from the normalized times and sizes
    df = pd.DataFrame({'Time': times, 'Size': sizes})

    # Cast to float before converting to timedelta, which resample() needs
    df['Time'] = pd.to_timedelta(df['Time'].astype(float), unit='s')

    # Index by time, sum the sizes per interval, then restore 'Time' as a column
    resampled_df = (
        df.set_index('Time')
        .resample(resample_interval)
        .sum()
        .reset_index()
    )

    # Express the bin labels in seconds again
    resampled_df['Time'] = resampled_df['Time'].dt.total_seconds()

    # Save to CSV
    resampled_df.to_csv(output_csv_path, index=False)

    print(f"Resampled data saved to {output_csv_path}")

# Example usage: 200 ms bins. The interval is spelled with the 'ms' alias because
# the equivalent 'L' alias is deprecated in pandas >= 2.2.
pcap_to_resampled_csv('../../data_processed/vc_200/alexa/total3/10/10.pcap', 'output.csv', '200ms')


Resampled data saved to output.csv


In [None]:
import os
import pandas as pd
from scapy.all import rdpcap
from concurrent.futures import ThreadPoolExecutor

def resample_pcap(input_pcap_path, output_csv_path, resample_interval='200ms'):
    """Resample one pcap file into per-interval summed packet sizes and write a CSV.

    Timestamps are shifted so the first packet is at 0, packet sizes
    (``len(pkt)``) are summed inside consecutive bins of width
    ``resample_interval``, and the result is written with the columns ``Time``
    (bin label, in seconds) and ``Size`` (summed sizes), without the index.
    The directory part of ``output_csv_path`` is created if it does not exist.

    Args:
        input_pcap_path: Path of the pcap file, passed to scapy's ``rdpcap``.
        output_csv_path: Path of the CSV file to write.
        resample_interval: pandas frequency string giving the bin width.
            Defaults to ``'200ms'``; this replaces the former ``'200L'``
            default because the ``'L'`` alias is deprecated in pandas >= 2.2.

    Returns:
        None.
    """
    packets = rdpcap(input_pcap_path)
    times = [pkt.time for pkt in packets]
    sizes = [len(pkt) for pkt in packets]
    # Re-base so that the first packet is at t = 0
    times = [t - times[0] for t in times]

    df = pd.DataFrame({'Time': times, 'Size': sizes})
    # Cast to float before converting to timedelta, which resample() needs
    df['Time'] = pd.to_timedelta(df['Time'].astype(float), unit='s')

    resampled_df = (
        df.set_index('Time')
        .resample(resample_interval)
        .sum()
        .reset_index()
    )
    resampled_df['Time'] = resampled_df['Time'].dt.total_seconds()

    # dirname() is '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    resampled_df.to_csv(output_csv_path, index=False)

def process_subfolder(root, subfolder, output_root, resample_interval):
    """Resample every ``.pcap`` file of one subfolder into a mirrored output folder.

    Each ``<root>/<subfolder>/<name>.pcap`` is handed to ``resample_pcap`` with
    the target ``<output_root>/<subfolder>/<name>.csv``.

    Files whose stem (the text before the first dot) is an integer are handled
    first, in numeric order; any other ``.pcap`` files follow in alphabetical
    order instead of aborting the whole run with ``ValueError``.

    Args:
        root: Folder that contains ``subfolder``.
        subfolder: Name of the folder to process, relative to ``root``.
        output_root: Folder under which the mirrored ``subfolder`` is written.
        resample_interval: Bin width, forwarded unchanged to ``resample_pcap``.

    Returns:
        None.
    """
    def numeric_first(file_name):
        # (0, n) sorts before (1, name), so the two groups never compare an
        # int against a str.
        try:
            return (0, int(file_name.split('.')[0]))
        except ValueError:
            return (1, file_name)

    input_folder = os.path.join(root, subfolder)
    output_folder = os.path.join(output_root, subfolder)
    pcap_files = sorted(
        (f for f in os.listdir(input_folder) if f.endswith('.pcap')),
        key=numeric_first,
    )

    for file_name in pcap_files:
        input_pcap_path = os.path.join(input_folder, file_name)
        # Swap only the final extension; str.replace() would also rewrite any
        # earlier '.pcap' inside the name.
        csv_name = os.path.splitext(file_name)[0] + '.csv'
        output_csv_path = os.path.join(output_folder, csv_name)
        resample_pcap(input_pcap_path, output_csv_path, resample_interval)

def parallel_resample(root_folder, output_root_folder, resample_interval='200ms'):
    """Resample the pcaps of every subfolder of ``root_folder`` using a thread pool.

    One ``process_subfolder`` task is submitted per immediate subdirectory of
    ``root_folder``. The call returns after all tasks have finished; an
    exception raised inside a task is re-raised here by ``future.result()``.

    Subdirectories with integer names are submitted first, in numeric order;
    other directories (for example ``.ipynb_checkpoints``) follow in
    alphabetical order instead of aborting the run with ``ValueError``.

    Args:
        root_folder: Folder whose immediate subdirectories hold the pcap files.
        output_root_folder: Folder under which the subdirectory layout is
            mirrored for the CSV output.
        resample_interval: pandas frequency string giving the bin width.
            Defaults to ``'200ms'``; this replaces the former ``'200L'``
            default because the ``'L'`` alias is deprecated in pandas >= 2.2.

    Returns:
        None.
    """
    def numeric_first(folder_name):
        # (0, n) sorts before (1, name), so the two groups never compare an
        # int against a str.
        try:
            return (0, int(folder_name))
        except ValueError:
            return (1, folder_name)

    subfolders = sorted(
        (f for f in os.listdir(root_folder) if os.path.isdir(os.path.join(root_folder, f))),
        key=numeric_first,
    )
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_subfolder, root_folder, subfolder, output_root_folder, resample_interval)
            for subfolder in subfolders
        ]
        for future in futures:
            # result() blocks until the task is done and surfaces its exception
            future.result()

# Example usage: resample every subfolder of total3/ with the default interval
parallel_resample(
    root_folder='../../data_processed/vc_200/alexa/total3/',
    output_root_folder='../../data_processed/vc_200/alexa/resampled_02s',
)
