In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import gc

In [15]:
# NOTE(review): `%reset -f` clears the interactive namespace without asking for
# confirmation, and that includes the names imported in the first cell. Re-run
# the import cell before executing the cells below.
%reset -f

In [18]:
# Run a garbage-collection pass; the number shown as the cell output is the
# value returned by gc.collect().
gc.collect()

480

In [3]:
# Read the packet trace CSV into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
input_file = '../datasets/mawi_packet_trace_original.csv'
packet_trace = pd.read_csv(input_file)

In [24]:
# Number of rows recorded for each distinct timestamp value, most frequent first.
packet_trace['timestamp'].value_counts()

timestamp
1688187656    131519
1688187664    124474
1688187665    122160
1688187680    120930
1688187604    118928
               ...  
1688187651     89117
1688187623     89055
1688187650     86556
1688187600     85774
1688187692     79855
Name: count, Length: 93, dtype: int64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
packet_trace.describe()

Unnamed: 0,timestamp,src_port,dst_port,protocol,pkt_size
count,9441776.0,9441776.0,9441776.0,9441776.0,9441776.0
mean,1688188000.0,17936.62,25121.01,8.800011,558.5487
std,27.01577,23002.41,24759.2,8.236224,955.9149
min,1688188000.0,0.0,0.0,1.0,54.0
25%,1688188000.0,266.0,443.0,6.0,58.0
50%,1688188000.0,3478.0,13206.0,6.0,66.0
75%,1688188000.0,41631.0,52529.0,6.0,1051.0
max,1688188000.0,65535.0,65535.0,132.0,20494.0


In [5]:
# Column names, dtypes and approximate memory usage of the loaded trace.
packet_trace.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9441776 entries, 0 to 9441775
Data columns (total 7 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   timestamp  int64 
 1   src_ip     object
 2   dst_ip     object
 3   src_port   int64 
 4   dst_port   int64 
 5   protocol   int64 
 6   pkt_size   int64 
dtypes: int64(5), object(2)
memory usage: 504.2+ MB


In [20]:
def get_duration(start_time, end_time):
    """Describe the time elapsed between two epoch timestamps.

    Parameters
    ----------
    start_time, end_time : int or float
        Timestamps expressed in seconds since the epoch.

    Returns
    -------
    str
        ``'<n> mins'`` (whole minutes, rounded down) when at least 60 seconds
        elapsed, otherwise ``'<n> seconds'``.
    """
    # Work on the raw difference in seconds. Going through naive local
    # datetimes and ``timedelta.seconds`` would drop whole days (that attribute
    # only holds the 0..86399 remainder) and could be skewed by a DST change.
    elapsed_seconds = int(end_time - start_time)

    if elapsed_seconds >= 60:
        return f'{elapsed_seconds // 60} mins'

    return f'{elapsed_seconds} seconds'

In [8]:
# Trace duration from the earliest to the latest timestamp, so the result does
# not depend on the rows being sorted by time.
period = get_duration(packet_trace['timestamp'].min(), packet_trace['timestamp'].max())

# nunique(dropna=False) gives the same count as len(Series.unique()).
unique_src_ips = packet_trace['src_ip'].nunique(dropna=False)
unique_dst_ips = packet_trace['dst_ip'].nunique(dropna=False)

unique_src_ports = packet_trace['src_port'].nunique(dropna=False)
unique_dst_ports = packet_trace['dst_port'].nunique(dropna=False)

unique_protocols = packet_trace['protocol'].nunique(dropna=False)

print(f'Total Time Period: {period}')
print(f'Unique Source IPs: {unique_src_ips}, Unique Destination IPs: {unique_dst_ips}')
print(f'Unique Source Ports: {unique_src_ports}, Unique Destination Ports: {unique_dst_ports}')
print(f'Unique Protocols: {unique_protocols}')

Total Time Period: 1 mins
Unique Source IPs: 122332, Unique Destination IPs: 351872
Unique Source Ports: 65479, Unique Destination Ports: 64919
Unique Protocols: 9


In [93]:
# Count of missing values in each column.
packet_trace.isnull().sum()

timestamp    0
src_ip       0
dst_ip       0
src_port     0
dst_port     0
protocol     0
pkt_size     0
dtype: int64

In [4]:
def get_flows(data, pkt_size_col = 1, elephant_flow_threshold = 100000):
    """Aggregate a packet trace into one feature row per flow.

    Only rows whose ``protocol`` is 6 or 17 (TCP and UDP in the IANA protocol
    numbering) are used. A flow is identified by the tuple
    ``(src_ip, dst_ip, src_port, dst_port, protocol)``; rows are processed in
    the order in which they appear in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Packet records with the columns ``timestamp``, ``src_ip``, ``dst_ip``,
        ``src_port``, ``dst_port``, ``protocol`` and ``pkt_size``. The index
        values are only used for the progress message and must support
        ``index % 100000``.
    pkt_size_col : int, default 1
        Number of leading packets of each flow to describe individually. When
        greater than 1, the result also contains ``pkt_size_2`` ..
        ``pkt_size_<N>`` (0 when the flow has fewer packets) and the
        ``<N>_pkt_size``, ``<N>_pkt_max_iat``, ``<N>_pkt_mean_iat`` and
        ``<N>_pkt_duration`` columns computed over those first N packets.
    elephant_flow_threshold : int, default 100000
        A flow whose accumulated ``flow_size`` exceeds this value gets
        ``elephant = 1``; all other flows get 0.

    Returns
    -------
    pandas.DataFrame
        One row per flow, indexed by the flow key, with the columns
        ``start_time``, ``end_time``, the five key fields, ``pkt_size_1``, the
        optional first-N-packet columns, ``flow_size``, ``flow_pkt_count``,
        ``flow_max_iat``, ``flow_mean_iat``, ``flow_duration`` and
        ``elephant``. An empty DataFrame is returned when no row qualifies.
    """
    track_first_pkts = pkt_size_col > 1

    # Names of the "first N packets" feature columns (N = pkt_size_col).
    first_size_key = f'{pkt_size_col}_pkt_size'
    first_max_iat_key = f'{pkt_size_col}_pkt_max_iat'
    first_mean_iat_key = f'{pkt_size_col}_pkt_mean_iat'
    first_duration_key = f'{pkt_size_col}_pkt_duration'

    # Placeholders for packets 2..N; they stay 0 for flows shorter than N.
    empty_pkt_sizes = {f'pkt_size_{i}': 0 for i in range(2, pkt_size_col + 1)}

    # Flow key -> feature dict, in order of first appearance.
    flows = {}

    # Walk the needed columns in lockstep; this avoids building a Series for
    # every row as DataFrame.iterrows() does.
    packets = zip(
        data.index,
        data['timestamp'],
        data['src_ip'],
        data['dst_ip'],
        data['src_port'],
        data['dst_port'],
        data['protocol'],
        data['pkt_size'],
    )

    for index, timestamp, src_ip, dst_ip, src_port, dst_port, protocol, pkt_size in packets:
        if protocol == 6 or protocol == 17:
            flow_key = (src_ip, dst_ip, src_port, dst_port, protocol)
            flow = flows.get(flow_key)

            if flow is None:
                # First packet of a new flow.
                flow = {
                    'start_time': timestamp,
                    'end_time': timestamp,
                    'src_ip': src_ip,
                    'dst_ip': dst_ip,
                    'protocol': protocol,
                    'src_port': src_port,
                    'dst_port': dst_port,
                    'pkt_size_1': pkt_size,
                }
                if track_first_pkts:
                    flow.update(empty_pkt_sizes)
                    flow[first_size_key] = pkt_size
                    flow[first_max_iat_key] = 0
                    flow[first_mean_iat_key] = 0
                    flow[first_duration_key] = 0
                flow['flow_size'] = pkt_size
                flow['flow_pkt_count'] = 1
                flow['flow_max_iat'] = 0
                flow['flow_mean_iat'] = 0
                flow['flow_duration'] = 0
                # A single packet can already exceed the threshold.
                flow['elephant'] = int(pkt_size > elephant_flow_threshold)
                flows[flow_key] = flow
            else:
                pkt_count = flow['flow_pkt_count'] + 1
                iat = timestamp - flow['end_time']
                duration = timestamp - flow['start_time']
                # The inter-arrival times of a flow sum to its duration, so
                # their arithmetic mean is duration / number of gaps.
                mean_iat = duration / (pkt_count - 1)

                # Features restricted to the first N packets of the flow.
                if track_first_pkts and pkt_count <= pkt_size_col:
                    flow[f'pkt_size_{pkt_count}'] = pkt_size
                    flow[first_size_key] += pkt_size
                    if flow[first_max_iat_key] < iat:
                        flow[first_max_iat_key] = iat
                    flow[first_mean_iat_key] = mean_iat
                    flow[first_duration_key] = duration

                # Whole-flow features.
                flow['flow_size'] += pkt_size
                flow['flow_pkt_count'] = pkt_count
                if flow['flow_max_iat'] < iat:
                    flow['flow_max_iat'] = iat
                flow['flow_mean_iat'] = mean_iat
                flow['flow_duration'] = duration

                if flow['flow_size'] > elephant_flow_threshold:
                    flow['elephant'] = 1

                flow['end_time'] = timestamp

        # Progress message for every row whose index is a multiple of 100000,
        # whether or not the row was used.
        if index % 100000 == 0:
            print(f'{index} rows processed')

    # Build the frame with flows as rows directly, instead of creating one
    # column per flow and transposing afterwards.
    return pd.DataFrame.from_dict(flows, orient='index')

In [6]:
# Build the per-flow feature table from the packet trace; the second argument
# (7) is get_flows' pkt_size_col parameter.
flows = get_flows(packet_trace, 7)

# Save dataframe as a csv file without index
# NOTE(review): the file name says "4" while pkt_size_col is 7 — confirm the
# intended output name.
output_file = 'flows_4.csv'
flows.to_csv(output_file, index=False)
print('Done')

0 rows processed
100000 rows processed
200000 rows processed
300000 rows processed
400000 rows processed
500000 rows processed
600000 rows processed
700000 rows processed
800000 rows processed
900000 rows processed
1000000 rows processed
1100000 rows processed
1200000 rows processed
1300000 rows processed
1400000 rows processed
1500000 rows processed
1600000 rows processed
1700000 rows processed
1800000 rows processed
1900000 rows processed
2000000 rows processed
2100000 rows processed
2200000 rows processed
2300000 rows processed
2400000 rows processed
2500000 rows processed
2600000 rows processed
2700000 rows processed
2800000 rows processed
2900000 rows processed
3000000 rows processed
3100000 rows processed
3200000 rows processed
3300000 rows processed
3400000 rows processed
3500000 rows processed
3600000 rows processed
3700000 rows processed
3800000 rows processed
3900000 rows processed
4000000 rows processed
4100000 rows processed
4200000 rows processed
4300000 rows processed
440