In [13]:
import pandas as pd
import numpy as np
from datetime import datetime
import gc
import ipaddress

#import sys
#sys.path.append('../implementation')

#from functions import *

In [11]:
# Reset Notebook (If Overloaded)
# `%reset -f` clears every name in the interactive namespace without asking
# for confirmation. NOTE(review): this cell sits *below* the imports cell, so
# running the notebook top-to-bottom wipes `pd`, `np`, `gc`, ... and the
# imports cell must be re-run afterwards. Run this manually only when needed.
%reset -f

In [14]:
# Clear Garbage if memory overloaded
# Forces a full garbage-collection pass; the value displayed below the cell is
# the return value of gc.collect().
gc.collect()

484

In [19]:
# Set dataset directory
# Relative path; the input and output file paths in later cells are built
# from it.
d_dir = '../datasets'

In [20]:
# Read data file
# Loads the whole packet-trace CSV into memory as a DataFrame.
# Expected columns (used by the cells below): timestamp, src_ip, dst_ip,
# src_port, dst_port, protocol, pkt_size.
input_file = f'{d_dir}/mawi_packet_trace.csv'
packet_trace = pd.read_csv(input_file)

In [21]:
# Preview the first rows of the packet trace.
packet_trace.head()

Unnamed: 0,timestamp,src_ip,dst_ip,src_port,dst_port,protocol,pkt_size
0,1688187600,149.40.55.233,203.115.138.41,1999,54524,6,66
1,1688187600,133.188.56.201,91.212.164.184,0,0,1,70
2,1688187600,202.11.248.134,52.113.75.222,50005,3479,17,183
3,1688187600,45.100.248.16,202.249.92.162,44734,443,6,54
4,1688187600,38.88.114.91,163.37.23.124,43332,3128,6,54


In [32]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
packet_trace.describe()

Unnamed: 0,timestamp,src_port,dst_port,protocol,pkt_size
count,9441776.0,9441776.0,9441776.0,9441776.0,9441776.0
mean,1688188000.0,17936.62,25121.01,8.800011,558.5487
std,27.01577,23002.41,24759.2,8.236224,955.9149
min,1688188000.0,0.0,0.0,1.0,54.0
25%,1688188000.0,266.0,443.0,6.0,58.0
50%,1688188000.0,3478.0,13206.0,6.0,66.0
75%,1688188000.0,41631.0,52529.0,6.0,1051.0
max,1688188000.0,65535.0,65535.0,132.0,20494.0


In [33]:
# Column dtypes, row count and memory footprint of the packet trace.
packet_trace.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9441776 entries, 0 to 9441775
Data columns (total 7 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   timestamp  int64 
 1   src_ip     object
 2   dst_ip     object
 3   src_port   int64 
 4   dst_port   int64 
 5   protocol   int64 
 6   pkt_size   int64 
dtypes: int64(5), object(2)
memory usage: 504.2+ MB


In [34]:
def get_duration(start_time, end_time):
    """Format the time elapsed between two epoch timestamps as a short label.

    Parameters
    ----------
    start_time, end_time : int or float
        Timestamps in seconds since the epoch (numpy integer scalars work too).

    Returns
    -------
    str
        ``'<n> mins'`` (whole minutes, rounded down) when the span is at least
        60 seconds, otherwise ``'<n> seconds'``.

    Notes
    -----
    The span is computed directly from the numeric difference. Going through
    ``timedelta.seconds`` would silently discard whole days (it only holds the
    sub-day remainder), and subtracting naive local datetimes can be off by an
    hour across a DST change.
    """
    elapsed_seconds = int(end_time - start_time)

    if elapsed_seconds >= 60:
        return f'{elapsed_seconds // 60} mins'

    return f'{elapsed_seconds} seconds'

In [35]:
# Time covered by the trace, measured from the first row to the last row
# (this relies on the rows being in chronological order).
period = get_duration(
    packet_trace['timestamp'].iloc[0],
    packet_trace['timestamp'].iloc[-1],
)

# Cardinality of each addressing field; dropna=False keeps the count equal to
# the number of distinct entries including a possible NaN.
unique_src_ips = packet_trace['src_ip'].nunique(dropna=False)
unique_dst_ips = packet_trace['dst_ip'].nunique(dropna=False)

unique_src_ports = packet_trace['src_port'].nunique(dropna=False)
unique_dst_ports = packet_trace['dst_port'].nunique(dropna=False)

unique_protocols = packet_trace['protocol'].nunique(dropna=False)

print(f'Total Time Period: {period}')
print(f'Unique Source IPs: {unique_src_ips}, Unique Destination IPs: {unique_dst_ips}')
print(f'Unique Source Ports: {unique_src_ports}, Unique Destination Ports: {unique_dst_ports}')
print(f'Unique Protocols: {unique_protocols}')

Total Time Period: 1 mins
Unique Source IPs: 122332, Unique Destination IPs: 351872
Unique Source Ports: 65479, Unique Destination Ports: 64919
Unique Protocols: 9


In [36]:
# Number of missing values per column.
packet_trace.isnull().sum()

timestamp    0
src_ip       0
dst_ip       0
src_port     0
dst_port     0
protocol     0
pkt_size     0
dtype: int64

In [81]:
def _encode_ip(ip):
    """Return the integer form of an IPv4/IPv6 address given as a string."""
    return int(ipaddress.ip_address(ip))


def get_flows(data, min_timeout=1, max_timeout=None):
    """Aggregate a packet trace into unidirectional TCP/UDP flows.

    Packets are grouped by the 5-tuple
    ``(src_ip, dst_ip, src_port, dst_port, protocol)``. Only protocol 6 (TCP)
    and 17 (UDP) packets are considered; everything else is skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Packet trace with the columns ``timestamp``, ``src_ip``, ``dst_ip``,
        ``src_port``, ``dst_port``, ``protocol`` and ``pkt_size``. Rows are
        processed in the order given, so inter-arrival times are only
        meaningful when the rows are sorted by ``timestamp``.
    min_timeout : number, default 1
        Lower bound applied to the idle timeout of a flow.
    max_timeout : number or None, default None
        Upper bound applied to the idle timeout. ``None`` means "no upper
        bound" (previously ``None`` raised ``TypeError`` on comparison).

    Returns
    -------
    pandas.DataFrame
        One row per flow, indexed by the 5-tuple (as a MultiIndex), with the
        columns ``flow_key``, ``start_time``, ``end_time``, ``source_ip``,
        ``destination_ip``, ``source_ip_int``, ``destination_ip_int``,
        ``protocol``, ``source_port``, ``destination_port``,
        ``first_pkt_size``, ``max_iat``, ``last_packet_time``,
        ``flow_duration``, ``packet_count`` and ``idle_timeout``.
        An empty DataFrame is returned when no TCP/UDP packet is present.

    Notes
    -----
    * ``idle_timeout`` is ``max_iat`` clamped to
      ``[min_timeout, max_timeout]``; it is only computed once a flow has a
      second packet, so single-packet flows keep ``idle_timeout == 0``.
    * ``last_packet_time`` is initialised to 0 and never updated; it is kept
      only so that downstream cells that drop this column keep working.
    * Progress is printed for every row whose index label is a multiple of
      100000.
    """
    # Create a dictionary to store flow information
    flows = {}

    # Iterate column-wise instead of DataFrame.iterrows(): this avoids
    # building a Series per packet, which dominates the runtime on large traces.
    packets = zip(
        data.index,
        data['timestamp'],
        data['src_ip'],
        data['dst_ip'],
        data['src_port'],
        data['dst_port'],
        data['protocol'],
        data['pkt_size'],
    )

    for index, timestamp, src_ip, dst_ip, src_port, dst_port, protocol, pkt_size in packets:
        if protocol == 6 or protocol == 17:
            # Unique key for the flow based on the packet attributes
            flow_key = (src_ip, dst_ip, src_port, dst_port, protocol)
            flow = flows.get(flow_key)

            if flow is None:
                # First packet of a new flow
                flows[flow_key] = {
                    'flow_key': flow_key,
                    'start_time': timestamp,
                    'end_time': timestamp,
                    'source_ip': src_ip,
                    'destination_ip': dst_ip,
                    'source_ip_int': _encode_ip(src_ip),
                    'destination_ip_int': _encode_ip(dst_ip),
                    'protocol': protocol,
                    'source_port': src_port,
                    'destination_port': dst_port,
                    'first_pkt_size': pkt_size,
                    'max_iat': 0,
                    'last_packet_time': 0,
                    'flow_duration': 0,
                    'packet_count': 1,
                    'idle_timeout': 0,
                }
            else:
                flow['packet_count'] += 1

                # maximum inter arrival time (gap to the previous packet of the flow)
                inter_arrival = timestamp - flow['end_time']
                if flow['max_iat'] < inter_arrival:
                    flow['max_iat'] = inter_arrival

                # idle timeout: max_iat clamped to [min_timeout, max_timeout]
                if flow['max_iat'] < min_timeout:
                    flow['idle_timeout'] = min_timeout
                elif max_timeout is not None and flow['max_iat'] > max_timeout:
                    flow['idle_timeout'] = max_timeout
                else:
                    flow['idle_timeout'] = flow['max_iat']

                # flow duration
                flow['flow_duration'] = timestamp - flow['start_time']

                flow['end_time'] = timestamp

        if index % 100000 == 0:
            print(f'{index} rows processed')

    if not flows:
        return pd.DataFrame()

    # Build the frame from per-flow records (one row per flow) rather than
    # creating one column per flow and transposing, which is very slow for
    # many flows and leaves every column with object dtype.
    data_features = pd.DataFrame(
        list(flows.values()),
        index=pd.MultiIndex.from_tuples(list(flows.keys())),
    )

    return data_features

In [47]:
# Extract flows
# Bounds applied to each flow's idle timeout (same unit as the trace
# timestamps).
min_timeout = 1
max_timeout = 11

# Aggregate the packet trace into flows. The result is bound to `flows`
# because every following cell reads that name; `get_flows` is the extraction
# function defined above and `packet_trace` is the frame loaded earlier.
flows = get_flows(packet_trace, min_timeout, max_timeout)

0 rows processed
100000 rows processed
200000 rows processed
300000 rows processed
400000 rows processed
500000 rows processed
600000 rows processed
700000 rows processed
800000 rows processed
900000 rows processed
1000000 rows processed
1100000 rows processed
1200000 rows processed
1300000 rows processed
1400000 rows processed
1500000 rows processed
1600000 rows processed
1700000 rows processed
1800000 rows processed
1900000 rows processed
2000000 rows processed
2100000 rows processed
2200000 rows processed
2300000 rows processed
2400000 rows processed
2500000 rows processed
2600000 rows processed
2700000 rows processed
2800000 rows processed
2900000 rows processed
3000000 rows processed
3100000 rows processed
3200000 rows processed
3300000 rows processed
3400000 rows processed
3500000 rows processed
3600000 rows processed
3700000 rows processed
3800000 rows processed
3900000 rows processed
4000000 rows processed
4100000 rows processed
4200000 rows processed
4300000 rows processed
440

In [70]:
# Drop the redundant 'flow_key' column (the 5-tuple is already available
# through the individual address/port/protocol columns).
flows = flows.drop(columns='flow_key')

In [71]:
# Drop the 'last_packet_time' column.
flows = flows.drop(columns='last_packet_time')

In [63]:
def get_flow_class(flow_duration, pkt_count):
    """Assign a flow to one of three classes from its duration and size.

    Returns
    -------
    int
        3 when the flow lasts more than 11 time units and has more than
        10 packets; 1 when it lasts at most 2 time units and has at most
        2 packets; 2 for everything in between.
    """
    # Long-lived flow with many packets
    if flow_duration > 11 and pkt_count > 10:
        return 3

    # Short flow with one or two packets
    if flow_duration <= 2 and pkt_count <= 2:
        return 1

    # Everything in between
    return 2

In [73]:
# Label every flow with its class: get_flow_class is applied element-wise to
# the (flow_duration, packet_count) pairs.
flows['flow_class'] = np.vectorize(get_flow_class)(
    flows['flow_duration'],
    flows['packet_count'],
)
flows

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,start_time,end_time,source_ip,destination_ip,source_ip_int,destination_ip_int,protocol,source_port,destination_port,first_pkt_size,max_iat,flow_duration,packet_count,packet_rate,flow_class
149.40.55.233,203.115.138.41,1999,54524,6,1688187600,1688187685,149.40.55.233,203.115.138.41,2502440937,3413346857,6,1999,54524,66,19,85,12,0,3
202.11.248.134,52.113.75.222,50005,3479,17,1688187600,1688187692,202.11.248.134,52.113.75.222,3389782150,879840222,17,50005,3479,183,1,92,4697,0,3
45.100.248.16,202.249.92.162,44734,443,6,1688187600,1688187606,45.100.248.16,202.249.92.162,761591824,3405339810,6,44734,443,54,4,6,4,0,2
38.88.114.91,163.37.23.124,43332,3128,6,1688187600,1688187600,38.88.114.91,163.37.23.124,643330651,2737117052,6,43332,3128,54,0,0,1,0,1
163.37.117.124,38.186.128.63,61554,443,6,1688187600,1688187600,163.37.117.124,38.186.128.63,2737141116,649756735,6,61554,443,85,0,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45.197.187.47,163.37.128.119,48152,3128,6,1688187692,1688187692,45.197.187.47,163.37.128.119,767933231,2737143927,6,48152,3128,58,0,0,1,0,1
167.248.189.103,163.37.93.253,7528,80,6,1688187692,1688187692,167.248.189.103,163.37.93.253,2818096487,2737135101,6,7528,80,58,0,0,1,0,1
202.249.93.185,46.25.179.131,80,12256,6,1688187692,1688187692,202.249.93.185,46.25.179.131,3405340089,773436291,6,80,12256,66,0,0,2,0,1
146.64.20.95,150.161.104.40,44843,2375,6,1688187692,1688187692,146.64.20.95,150.161.104.40,2453673055,2527160360,6,44843,2375,54,0,0,1,0,1


In [96]:
output_file = f'{d_dir}/mawi_flows.csv'

# Export the flow table to a CSV file. index=False leaves the row index out of
# the file.
flows.to_csv(output_file, index=False)

# Status message (spelling corrected from 'Expoted').
print('Data Exported.')

Data Expoted.
