# Group 06 - ULg traffic characterization

## Initialisation

In [None]:
import pandas as pd
import numpy as np

from utils import *
from ipaddress import *

import time

# Wall-clock reference; the elapsed time is printed at the end of the cell.
start_time = time.time()

# Number of CSV rows loaded per chunk.
CHUNKSIZE = 100000

# Per-chunk means: the loop appends one value per chunk to each array.
ibyts, obyts, ipkts, opkts, durations = (np.empty(0) for _ in range(5))


def _seed_frame(key, counters):
    """Return a one-row frame whose cells are all -1.

    The placeholder row (key -1) exists only so that every column starts out
    with an integer dtype before real rows are concatenated onto the frame.
    """
    names = [key] + list(counters)
    return pd.DataFrame({name: [-1] for name in names}, index=[0])


# Bytes accumulated per source port / destination port.
port_traffic_sender = _seed_frame('sp', ('ibyt', 'obyt'))
port_traffic_receiver = _seed_frame('dp', ('ibyt', 'obyt'))

# Bytes accumulated per source address.
traffic_by_ip = _seed_frame('sa', ('ibyt', 'obyt'))

# Bytes and packets accumulated per /24 network, by source and by destination.
traffic_by_prefix_source = _seed_frame('source_netw', ('ibyt', 'ipkt'))
traffic_by_prefix_dest = _seed_frame('dest_netw', ('ibyt', 'ipkt'))


# Chunk counter and scratch frame.
i = 0
df_save = pd.DataFrame()
# Trace-wide byte and packet totals. The per-chunk means collected below cannot
# be combined into an exact trace-wide packet size on their own: chunks do not
# all carry the same number of packets, so averaging per-chunk ratios would
# weight them incorrectly.
total_ibyt = 0
total_obyt = 0
total_ipkt = 0
total_opkt = 0


def _slash24(address):
    """Return the /24 network that contains `address`, formatted as a string."""
    return str(ip_interface(address + '/24').network)


# Stream the trace chunk by chunk so the whole file never has to fit in memory.
chunks = pd.read_csv("netflow.csv", chunksize=CHUNKSIZE, iterator=True)
for i, df in enumerate(chunks, start=1):
    print("Chunk number {}".format(i))
    df = df.dropna()  # discard flows with any missing field

    # Per-chunk means (one value appended to each array per chunk).
    ibyts = np.append(ibyts, df['ibyt'].mean())
    obyts = np.append(obyts, df['obyt'].mean())
    ipkts = np.append(ipkts, df['ipkt'].mean())
    opkts = np.append(opkts, df['opkt'].mean())
    durations = np.append(durations, df['td'].mean())

    # Exact totals, used for the trace-wide average packet size.
    total_ibyt += df['ibyt'].sum()
    total_obyt += df['obyt'].sum()
    total_ipkt += df['ipkt'].sum()
    total_opkt += df['opkt'].sum()

    # Compute traffic by port: merge the chunk into the running per-port sums.
    port_traffic_sender = pd.concat([port_traffic_sender, df[['sp', 'ibyt', 'obyt']]])
    gb_sender = port_traffic_sender.groupby('sp')
    port_traffic_sender = gb_sender.sum().reset_index()

    port_traffic_receiver = pd.concat([port_traffic_receiver, df[['dp', 'ibyt', 'obyt']]])
    gb_receiver = port_traffic_receiver.groupby('dp')
    port_traffic_receiver = gb_receiver.sum().reset_index()

    # Compute traffic by /24 prefix. Each distinct address is parsed once per
    # chunk and then looked up, instead of being re-parsed on every row.
    df = df[df['sa'] != -1]
    networks = {addr: _slash24(addr) for addr in pd.concat([df['sa'], df['da']]).unique()}
    df['source_netw'] = df['sa'].map(networks)
    df['dest_netw'] = df['da'].map(networks)

    traffic_by_prefix_source = pd.concat([traffic_by_prefix_source, df[['source_netw', 'ibyt', 'ipkt']]])
    gb_netw_source = traffic_by_prefix_source.groupby('source_netw')
    traffic_by_prefix_source = gb_netw_source.sum().reset_index()

    traffic_by_prefix_dest = pd.concat([traffic_by_prefix_dest, df[['dest_netw', 'ibyt', 'ipkt']]])
    gb_netw_dest = traffic_by_prefix_dest.groupby('dest_netw')
    traffic_by_prefix_dest = gb_netw_dest.sum().reset_index()


# -- Post-processing

# Drop the placeholder row (port -1) and add the combined in+out byte count.
port_traffic_sender = port_traffic_sender[port_traffic_sender.sp != -1]
port_traffic_sender['bytes_tot'] = port_traffic_sender['ibyt'] + port_traffic_sender['obyt']

port_traffic_receiver = port_traffic_receiver[port_traffic_receiver.dp != -1]
port_traffic_receiver['bytes_tot'] = port_traffic_receiver['ibyt'] + port_traffic_receiver['obyt']


# ---- COMPUTATIONS

# ---- Average packet size

# Per-chunk packet sizes (chunks without packets yield NaN and are dropped).
# They stay available under their original names; the printed figure is the
# exact trace-wide value: total bytes divided by total packets.
ipkt_size = ibyts/ipkts
ipkt_size = ipkt_size[~np.isnan(ipkt_size)]
if total_ipkt > 0:
    print("Average packet size (input): {0:.2f}\n".format(total_ibyt / total_ipkt))

opkt_size = obyts/opkts
opkt_size = opkt_size[~np.isnan(opkt_size)]
if total_opkt > 0:
    print("Average packet size (output): {0:.2f}\n".format(total_obyt / total_opkt))


# ---- Share of the trace exchanged with the 92.106.195.0/24 address block

TARGET_PREFIX = '92.106.195.0/24'


def _share_of_prefix(traffic, key_col, value_col, prefix, total):
    """Return the fraction of `total` that `prefix` accounts for in `value_col`.

    `traffic` holds one row per network in `key_col`. The result is 0.0 when
    the prefix does not appear in `traffic` or when `total` is zero, rather
    than failing on an empty selection or dividing by zero.
    """
    if total == 0:
        return 0.0
    return traffic.loc[traffic[key_col] == prefix, value_col].sum() / total


# Both per-prefix frames were seeded with a placeholder row whose key and
# counters are all -1. Leave it out so it does not take one unit off each total.
sent_rows = traffic_by_prefix_source[traffic_by_prefix_source['source_netw'] != -1]
received_rows = traffic_by_prefix_dest[traffic_by_prefix_dest['dest_netw'] != -1]

total_traffic_source_pkts = sent_rows['ipkt'].sum()
total_traffic_source_byts = sent_rows['ibyt'].sum()
total_traffic_dest_pkts = received_rows['ipkt'].sum()
total_traffic_dest_byts = received_rows['ibyt'].sum()

# (direction, unit, rows, key column, value column, trace-wide total)
_reports = (
    ('sent by', 'pkts', sent_rows, 'source_netw', 'ipkt', total_traffic_source_pkts),
    ('sent by', 'bytes', sent_rows, 'source_netw', 'ibyt', total_traffic_source_byts),
    ('sent to', 'pkts', received_rows, 'dest_netw', 'ipkt', total_traffic_dest_pkts),
    ('sent to', 'bytes', received_rows, 'dest_netw', 'ibyt', total_traffic_dest_byts),
)
for direction, unit, rows, key_col, value_col, total in _reports:
    share = _share_of_prefix(rows, key_col, value_col, TARGET_PREFIX, total)
    print('Fraction of traffic {} {} (in {}): {:.3%}'.format(direction, TARGET_PREFIX, unit, share))




print("Execution time: {}".format(time.time() - start_time))

Chunk number 1
Chunk number 2
Chunk number 3
Chunk number 4
Chunk number 5

## Questions

### What is the average packet size, across all traffic in the trace (in and out)? Describe how you computed this number.