In [None]:
from EIMTC.extractor import Extractor
from EIMTC.plugins.protocol_header_fields import ProtocolHeaderFields
from EIMTC.plugins.stnn import STNN
from EIMTC.plugins.n_pkts_byte_freq import NPacketsByteFrequency
from EIMTC.plugins.n_bytes import NBytes
from EIMTC.plugins.pkt_rel_time import PacketRelativeTime
from EIMTC.plugins.clump_flows import Clump_Flow
from EIMTC.plugins.res_req_diff_time import ResReqDiffTime
import glob
from pathlib import Path

In [None]:
# Collect every capture file below ./data/pcaps.
# `**` only descends into nested sub-directories when recursive=True; without
# it the pattern silently degrades to a single `*` (exactly one directory level).
# sorted() makes the processing order reproducible across runs and platforms.
files = sorted(glob.glob('./data/pcaps/**/*.pcap', recursive=True))
files

In [None]:
# Plugins handed to the extractor, listed in the order they are registered.
feature_plugins = [
    ProtocolHeaderFields(n_packets=32),  # Lopez
    NBytes(n=784),  # wang
    STNN(n_packets=32),
    NPacketsByteFrequency(n_first_packets=6),
    PacketRelativeTime(),
    ResReqDiffTime(),
    Clump_Flow(),
]

# Extractor writing its output under ./data/ with the TLS option switched on.
ext = Extractor(
    output_dirpath='./data/',
    custom_plugin_package=feature_plugins,
    TLS=True,
)
ext

### Note:
The app/service labels were extracted via the SNI value of each TLS flow and later mapped into a specific domain/application name.

The extracted values are stored in the file2labels.csv file, which is compressed inside the file2labels.7z archive. Please extract the CSV file before executing the next cells.

In [None]:
import pandas as pd

# Table that maps each capture file name to its application label.
ddf = pd.read_csv('./file2labels.csv')

# Fail fast with a readable message if the CSV does not have the two columns
# that expand_application_label() reads; otherwise the problem only surfaces
# later as a bare KeyError in the middle of labelling.
assert {'filename', 'mapped_sni'} <= set(ddf.columns), \
    "file2labels.csv must contain the 'filename' and 'mapped_sni' columns"

In [None]:
def labels_from_name(name):
    """Derive the (OS, browser) labels encoded in a capture file name.

    The name is lower-cased and split on '_'. The first token is the OS code
    and the third token is the browser code, e.g.
    'L_cyber_ff_...' -> ('Linux', 'Firefox').

    name: file name (without directory) to parse.

    returns: tuple (os_label, browser_label). Each element is 'Unknown' when
    its token is missing or not recognised.
    """
    tokens = name.lower().split('_')
    # str.split always yields at least one token, so index 0 is safe.
    os_label = tokens[0]
    # Names with fewer than three tokens carry no browser field. Use an empty
    # code so the result is 'Unknown' instead of an IndexError.
    browser_label = tokens[2] if len(tokens) > 2 else ''

    return expand_os_label(os_label), expand_browser_label(browser_label)


def expand_os_label(os_label):
    """Translate a one-letter OS code into its full name.

    os_label: 'l', 'w' or 'd'.

    returns: 'Linux', 'Windows' or 'OSX'; 'Unknown' for any other code.
    """
    known_systems = {'l': 'Linux', 'w': 'Windows', 'd': 'OSX'}
    return known_systems.get(os_label, 'Unknown')


def expand_browser_label(browser_label):
    """Translate a browser code taken from a file name into its full name.

    browser_label: 'chrome', 'ff', 'firefox', 'ie' or 'safari'.

    returns: 'Chrome', 'Firefox', 'IExplorer' or 'Safari'; 'Unknown' for any
    other code.
    """
    known_browsers = {
        'chrome': 'Chrome',
        'ff': 'Firefox',
        'firefox': 'Firefox',
        'ie': 'IExplorer',
        'safari': 'Safari',
    }
    return known_browsers.get(browser_label, 'Unknown')

def expand_application_label(file_name, label_table=None):
    """Look up the application label recorded for a capture file.

    file_name: file name to search for in the table's 'filename' column.
    label_table: optional DataFrame with 'filename' and 'mapped_sni' columns.
        When omitted, the notebook-level `ddf` table is used.

    returns: the 'mapped_sni' value of the first matching row, or 'Unknown'
    when no row matches (same fallback as the OS/browser helpers, instead of
    an IndexError from `.iloc[0]`).
    """
    table = ddf if label_table is None else label_table
    matches = table.loc[table['filename'] == file_name, 'mapped_sni']
    if matches.empty:
        return 'Unknown'
    return matches.iloc[0]
    

# tests
# OS codes expand to full names; anything unrecognised becomes 'Unknown'.
assert all(
    expand_os_label(code) == full_name
    for code, full_name in [
        ('l', 'Linux'),
        ('w', 'Windows'),
        ('d', 'OSX'),
        ('wtf', 'Unknown'),
    ]
)

# Browser codes, including both spellings used for Firefox.
assert all(
    expand_browser_label(code) == full_name
    for code, full_name in [
        ('ff', 'Firefox'),
        ('firefox', 'Firefox'),
        ('wtf', 'Unknown'),
        ('ie', 'IExplorer'),
    ]
)

# Application labels are looked up in the file2labels table.
assert all(
    expand_application_label(file_name) == application
    for file_name, application in [
        ('L_cyber_ff_09-08__18_05_04.pcap.TCP_10-0-0-9_48478_216-58-210-14_443.pcap', 'google'),
        ('d_hi_safari_Ai_Se_Eu_Te_Pego_16_15_38_144p.pcap.TCP_10-0-0-11_49607_212-179-17-140_443.pcap', 'youtube'),
        ('d_hi_safari_Ai_Se_Eu_Te_Pego_16_18_43_240p.pcap.TCP_10-0-0-11_49659_212-179-17-140_443.pcap', 'youtube'),
    ]
)

# Whole-name parsing: (OS, browser) from the first and third '_' tokens.
assert all(
    labels_from_name(file_name) == os_and_browser
    for file_name, os_and_browser in [
        ('L_cyber_ff_09-08__18_05_04.pcap.TCP_10-0-0-9_48478_216-58-210-14_443.pcap', ('Linux', 'Firefox')),
        ('l_cyber_firefox', ('Linux', 'Firefox')),
        ('W_cybfdser_chrome', ('Windows', 'Chrome')),
        ('g_cybfdser_come', ('Unknown', 'Unknown')),
    ]
)


In [None]:
def custom_filepath_based_labelling(filepath):
    '''
    Build the labels of a capture file from its path.

    The OS and browser labels are parsed from the file name without its last
    extension; the application label is looked up under that name with
    '.pcap' appended.

    returns: dictionary of label's name/type as key and the label as value
    (keys: 'os', 'browser', 'application').
    '''
    stem = Path(filepath).stem
    os_label, browser_label = labels_from_name(stem)
    return {
        'os': os_label,
        'browser': browser_label,
        'application': expand_application_label(stem + '.pcap'),
    }

# Smoke test on one known capture path.
assert custom_filepath_based_labelling(
    'D:/DISTILLER/BOA2016/data/filtered_raw_dataset_temu2016/d_hi_safari_Ai_Se_Eu_Te_Pego_16_15_38_144p.pcap.TCP_10-0-0-11_49607_212-179-17-140_443.pcap'
) == {
    'os': 'OSX',
    'browser': 'Safari',
    'application': 'youtube',
}

In [None]:
# Run the configured extractor over every collected capture file, passing
# custom_filepath_based_labelling as the labelling method (presumably it is
# called once per file path to produce that file's labels; confirm against
# the EIMTC Extractor API).
ext.extract_many(files, labelling_method=custom_filepath_based_labelling)