In [2]:
import numpy as np
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import pandas as pd
from scapy.all import *
from scapy.layers.l2 import Ether
from scapy.layers.inet import IP, UDP, TCP
from scapy.contrib.igmp import IGMP
from scapy.utils import PcapWriter

import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import csv

import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import os
from datetime import datetime

## Progress bar
from tqdm.notebook import tqdm
from tqdm import trange
from time import sleep

## Parsing pcap file 

In [3]:
pcap_fp = '../../../Trace/Mawi/202104140000.pcap'
time_interval = 20 # seconds
###### Get the timestamp of the first packet in the pcap file
def get_pkt_ts(pcap_fp):
    pkt = rdpcap(pcap_fp, 1) # read 1st  pkt
    return float(pkt[0].time)
############################################
pcap_name = os.path.basename(pcap_fp).split('.')[0]
ts = get_pkt_ts(pcap_fp)
ts_str = datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')

print('Pcap file: {} 1st pkt timestamp:'.format(pcap_name), ts_str)

Pcap file: 202104140000 1st pkt timestamp: 2021-04-13 23:00:00
 2021-04-13 23:00:00


In [12]:
sip_dict = Counter()
dip_dict = Counter()

start_time = None 

def write_csv(pcap_name , current_time, sip_dict, dip_dict):
    global time_interval
    ### define csv file header
    fieldnames = ['IP', 'Flow len','Flow size']
    folder = ['Src', 'Dst']
    for i in folder:
        os.makedirs('./{}/{}/Obs_{}s'.format(pcap_name, i, time_interval), exist_ok=True)
    
    ### Save sip dict to csv file
    with open('./{}/{}/Obs_{}s/{}s.csv'.format(pcap_name, folder[0], time_interval,current_time), 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        for key, value in sip_dict.items():
            writer.writerow({'IP': key, 'Flow len': value['flow_len'], 'Flow size': value['flow_size']})
    
    ## Save dip dict to csv file
    with open ('./{}/{}/Obs_{}s/{}s.csv'.format(pcap_name, folder[1], time_interval,current_time), 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        for key, value in dip_dict.items():
            writer.writerow({'IP': key, 'Flow len': value['flow_len'], 'Flow size': value['flow_size']})
            
def extract_pkt_info(packet):
    global start_time
    global time_interval
    if IP in packet:
        pkt_time = float(packet.time)
        ip_src = packet[IP].src
        ip_dst = packet[IP].dst
        ### ip_dict data structure ###
        # {ip_src: {'flow_len': 1, 'flow_size': packet[IP].len}}
        
        ### Check if exceed the observation time ###
        if (pkt_time - start_time) < time_interval :
            if ip_src in sip_dict :
                ### IP src exists in the dict ###
                ### Update the flow length and size ###
                sip_dict[ip_src]['flow_len'] += 1               
                sip_dict[ip_src]['flow_size'] += packet[IP].len
            else:
                ## IP src does not exist in the dict ###
                ## Add the IP src to the dict ###
                sip_dict[ip_src] = {'flow_len': 1, 'flow_size': packet[IP].len}

            if ip_dst in dip_dict :
                dip_dict[ip_dst]['flow_len'] += 1
                dip_dict[ip_dst]['flow_size'] += packet[IP].len
            else: 
                dip_dict[ip_dst] = {'flow_len': 1, 'flow_size': packet[IP].len}
        else:
            print("Observation Interval: from {} to {}".format(start_time, pkt_time))
            current_time = float(packet.time)
            
            write_csv(pcap_name ,current_time, sip_dict, dip_dict)
            start_time = current_time
            dip_dict.clear() , sip_dict.clear()
    return sip_dict, dip_dict, start_time

def pkt_handler(packet):
    extract_pkt_info(packet)
            
start_time = get_pkt_ts(pcap_fp)

### Using offline mode to read pcap file ###
### wihout storing packets in memory ###
sniff(offline=pcap_fp, prn=lambda x : pkt_handler(x), store=0)

<Sniffed: TCP:0 UDP:0 ICMP:0 Other:0>

## Save Csv data

In [142]:
### Get top 200 flow len and size #####
### Reason of merely using 200 since the data is heavy tail distribution ####
### Contain lot of flows with small size and length which take heavy burden for ploting ####
def ori_get_flow_size(data):
    len_df =  pd.DataFrame(Counter(data['flow len']).most_common(200), columns=['Flow len', 'Count'])
    size_df = pd.DataFrame(Counter(data['flow size']).most_common(200), columns=['Flow size', 'Count'])
    return len_df , size_df
# def get_flow_size(data):
#     len_df =  pd.DataFrame(Counter(data['flow len']).most_common(), columns=['Flow len', 'Count'])
#     size_df = pd.DataFrame(Counter(data['flow size']).most_common(), columns=['Flow size', 'Count'])
#     return len_df , size_df


def get_flow_size(data):
    # sorted(Counter(data['flow len']).items() , key = lambda x: x[0])
    len_df = pd.DataFrame(
        sorted(Counter(data['flow len']).items() , key = lambda x: x[0]), columns=['Flow len', 'Count']
    )
    size_df = pd.DataFrame(
        sorted(Counter(data['flow size']).items() , key = lambda x: x[0]), columns=['Flow size', 'Count']
    )
    return len_df , size_df

### Get flow count and index ####

## example for flow size,
### index_df data structure ###
# | 1st obs time flow len | 2nd obs time flow len | 3rd obs time flow len | ...
# | 32                   | 1500                   | 1500                 | ...
# | 64                  | 1378                     | 1378                   | ...

### count_df data structure ###
# | 1st obs time count | 2nd obs time count | 3rd obs time count | ...
# | 4843               | 3136                  | 3198           | ...
# | 963               | 901                  | 867
def get_flow_count_index(df):
    count_df = pd.DataFrame()
    index_df = pd.DataFrame()    
    for i in range(len(df.columns) // 2):
        count_df = pd.concat([count_df, df.iloc[:, i*2+1]], axis=1)
        index_df = pd.concat([index_df, df.iloc[:, i*2]], axis=1)
    return count_df, index_df

flow_len_df , flow_size_df = pd.DataFrame(), pd.DataFrame()
fp = './{}/'.format(pcap_name)
for i, folder in enumerate(os.listdir(fp)): # src or dst
    if (folder.startswith('Src') or folder.startswith('Dst')) and os.path.isdir(fp+folder):
        for subfolder in os.listdir(fp+folder): # obs interval
            # if 'Obs' in subfolder:
                obs_itl = subfolder.split('_')[1]
                file_list = sorted(os.listdir(fp+folder+'/'+subfolder))
                print('Processing:', folder, subfolder)
                # print(file_list)
                for i in file_list:
                    data = pd.read_csv(fp+folder+'/'+subfolder+'/'+i ,
                                    header=None, usecols=[1,2] ,) # only load flow size and flow len 
                    data = data.rename(columns={1: 'flow len', 2: 'flow size'})
                    flow_len_df = pd.concat([flow_len_df, get_flow_size(data)[0]], axis=1)
                    flow_size_df = pd.concat([flow_size_df, get_flow_size(data)[1]], axis=1)
                    
                    flow_len_count_df , flow_len_index_df = get_flow_count_index(flow_len_df)
                    flow_size_count_df , flow_size_index_df = get_flow_count_index(flow_size_df)
                    
                    os.makedirs('./{}/{}/{}/Obs_{}'.format(pcap_name, '3d plot', folder,obs_itl), exist_ok=True)
                    flow_len_count_df.to_csv('./{}/{}/{}/Obs_{}/flow_len_count.csv'.format(pcap_name, '3d plot',folder, obs_itl))
                    flow_len_index_df.to_csv('./{}/{}/{}/Obs_{}/flow_len_index.csv'.format(pcap_name, '3d plot',folder, obs_itl))
                    flow_size_count_df.to_csv('./{}/{}/{}/Obs_{}/flow_size_count.csv'.format(pcap_name, '3d plot',folder, obs_itl))
                    flow_size_index_df.to_csv('./{}/{}/{}/Obs_{}/flow_size_index.csv'.format(pcap_name, '3d plot',folder, obs_itl))
                # break


Processing: Dst Obs_10s
 Dst Obs_10s
Processing: Dst Obs_20s
Processing: Src Obs_10s
Processing: Src Obs_20s


## Plot 

### Directly read csv file from subdirectory code 

In [119]:
flow_len_count_df = pd.read_csv('./202104140000/3d plot/Dst/Obs_20s/flow_len_count.csv').drop(columns='Unnamed: 0')
flow_len_index_df = pd.read_csv('./202104140000/3d plot/Dst/Obs_20s/flow_len_index.csv').drop(columns='Unnamed: 0')
flow_size_count_df = pd.read_csv('./202104140000/3d plot/Dst/Obs_20s/flow_size_count.csv').drop(columns='Unnamed: 0')
flow_size_index_df = pd.read_csv('./202104140000/3d plot/Dst/Obs_20s/flow_size_index.csv').drop(columns='Unnamed: 0')

### Plot 3d plot 

In [131]:
flow_len_count_df.tail()

Unnamed: 0,Count,Count.1,Count.2,Count.3,Count.4,Count.5,Count.6,Count.7,Count.8,Count.9,...,Count.97,Count.98,Count.99,Count.100,Count.101,Count.102,Count.103,Count.104,Count.105,Count.106
414,,,,,,,,,,,...,,,1.0,,,,,,,
415,,,,,,,,,,,...,,,1.0,,,,,,,
416,,,,,,,,,,,...,,,1.0,,,,,,,
417,,,,,,,,,,,...,,,1.0,,,,,,,
418,,,,,,,,,,,...,,,,,,,,,,


In [132]:
flow_len_index_df.tail()

Unnamed: 0,Flow len,Flow len.1,Flow len.2,Flow len.3,Flow len.4,Flow len.5,Flow len.6,Flow len.7,Flow len.8,Flow len.9,...,Flow len.97,Flow len.98,Flow len.99,Flow len.100,Flow len.101,Flow len.102,Flow len.103,Flow len.104,Flow len.105,Flow len.106
414,,,,,,,,,,,...,,,88167.0,,,,,,,
415,,,,,,,,,,,...,,,110198.0,,,,,,,
416,,,,,,,,,,,...,,,115710.0,,,,,,,
417,,,,,,,,,,,...,,,125296.0,,,,,,,
418,,,,,,,,,,,...,,,,,,,,,,


In [121]:
import matplotlib.colors as mcolors
########## 3D plot ##########
#############################
def fill_na_with_median(row):
    median_val = row.median()
    return row.fillna(median_val)
k = 5000
plot_df = flow_len_count_df.fillna(0)
# plot_df = flow_len_count_df
plot_df2 = flow_len_index_df.apply( lambda x: fill_na_with_median(x), axis=1)

#### Plot ####

def create_colorscale(start_rgb, end_rgb, n):
    return [[i/(n-1), mcolors.to_hex((start_rgb + i*(end_rgb - start_rgb)/(n-1)).clip(0,1))]
            for i in range(n)]
    
log_plot_df = np.log10(plot_df + 1 )
start_rgb = np.array([173, 216, 230])/255  # lightblue
end_rgb = np.array([0, 0, 139])/255        # darkblue
custom_colorscale= create_colorscale(start_rgb, end_rgb, 200)

fig = go.Figure(data=[go.Surface(y=plot_df2.values, z=log_plot_df.values, colorscale=custom_colorscale,
                                        colorbar=dict(
                                            title="Flow Count (log scale)",
                                            tickvals=[0, 1, 2, 3, 4, 5],
                                            ticktext=["1", "10", "100", "1K", "10K", "100K"],
                                            lenmode="fraction",
                                        len=0.75,
                                            xpad=40
                                        ))])
fig.update_layout(title='Flow count with flow size perspective', autosize=False,
                width=1000, height=1000,
                margin=dict(l=60, r=50, b=60, t=80),
                )
fig.update_layout(scene=dict(xaxis_title='x: Time Slot',
                                    yaxis_title='y: Flow {}'.format('123'),
                                    zaxis_title='z: Flow Count',
                                    zaxis = dict(
                                        tickvals=[0, 1, 2, 3, 4, 5],
                                        ticktext = ["10^0", "10^1", "10^2", "10^3", "10^4", "10^5"],
                                    )
                                    ))
        
        
fig.update_layout(scene=dict(
            yaxis=dict(range=[0,k]))
        )
fig.show() 

In [128]:
# plot_df2['Flow len.67']
plot_df2.index[plot_df2['Flow len.68'] == 1669]

Int64Index([263], dtype='int64')

In [130]:
plot_df['Count.68'][263]

1.0

In [117]:
log_plot_df['Count'][414].dtype

dtype('float64')

In [27]:
plot_df.values[67]

107

In [66]:
np.log10(1)

0.0