In [22]:
import xml.etree.ElementTree as ET
import time
import pickle
import pandas as pd
import seaborn as sns
import glob
import multiprocessing
import os
import numpy as np

In [2]:
# --- Paths -----------------------------------------------------------------
# Directory that is globbed for the *.pdml inputs read by the cells below.
dir_path = "/data/thomas/Principals/FL/deployment/iotlab_data/pcap_features/pdml/"
# Directory that receives the per-file CSVs of packet features.
OUTPUT_DIR = "/data/thomas/Principals/FL/deployment/iotlab_data/pcap_features/features"

# --- Tunables --------------------------------------------------------------
# Address prefix: an ip.src / ip.dst value starting with this string is taken
# as the packet's device address.
IP_SUBNET = "192.168."
# Number of worker processes handed to multiprocessing.Pool.
number_of_cores = 40

In [3]:
# Sanity check: report how many PDML inputs are visible in dir_path before the
# expensive extraction cells run.
pdml_files = sorted(glob.glob(os.path.join(dir_path, '*.pdml')))
print(f"Found {len(pdml_files)} PDML file(s) in {dir_path}")

In [4]:
def recursive_lookup(element):
    """Return the ``name`` attribute of every nested ``<field>`` under ``element``.

    Only chains of direct ``<field>`` children are followed. Names are listed
    in document (pre-order) order, duplicates are kept, and a field without a
    ``name`` attribute contributes ``None``.
    """
    names = []
    # Explicit stack; children are pushed reversed so they pop in document order.
    pending = list(reversed(element.findall('field')))
    while pending:
        node = pending.pop()
        names.append(node.get('name'))
        pending.extend(reversed(node.findall('field')))
    return names

In [5]:
def extract_field_names(element):
    """Return ``(ip, field_names)`` for one ``<packet>`` element.

    ``field_names`` is the set of ``name`` attributes of every ``<field>``
    found under the packet's ``<proto>`` children, nested fields included.

    ``ip`` is the ``show`` value of a top-level ``ip.src`` / ``ip.dst`` field
    that starts with ``IP_SUBNET``. If several match, the last one in document
    order wins; if none match, ``ip`` stays at the integer sentinel ``0``.
    """
    device_ip = 0
    names = set()

    for proto in element.findall('proto'):
        for field in proto.findall('field'):
            name = field.get('name')
            names.add(name)
            if name in ('ip.src', 'ip.dst'):
                address = field.get('show')
                if address.startswith(IP_SUBNET):
                    device_ip = address
            names.update(recursive_lookup(field))

    return device_ip, names

In [6]:
def fields_per_ip_per_file(file):
    """Stream one PDML file and collect the field names seen per device IP.

    Parameters
    ----------
    file : str
        Path of the PDML (XML) file to read.

    Returns
    -------
    dict
        Maps each ``ip`` returned by ``extract_field_names`` to the set of
        field names found in that IP's packets. Packets for which no address
        matched are grouped under whatever sentinel ``extract_field_names``
        returns.
    """
    fields_by_ip = {}
    with open(file, 'rb') as f:
        context = ET.iterparse(f, events=('start', 'end'))
        _, root = next(context)  # first event is the start of the document root
        for event, elem in context:
            if event == 'end' and elem.tag == 'packet':
                ip, fields = extract_field_names(elem)
                # Grow the per-IP set in place rather than rebuilding it with
                # union() for every packet.
                fields_by_ip.setdefault(ip, set()).update(fields)
                # Drop already-processed packets from the tree so memory use
                # does not grow with the file.
                root.clear()
    return fields_by_ip
    

In [7]:
def combine_fields_different_files(dict_list):
    """Merge several ``{ip: set_of_fields}`` dicts into one.

    For an IP present in more than one dict, the result holds the union of all
    its field sets. Merging uses ``union`` (which builds a new set), so the
    input sets are not modified.
    """
    merged = {}
    for per_file in dict_list:
        for ip, fields in per_file.items():
            merged[ip] = merged[ip].union(fields) if ip in merged else fields
    return merged

In [8]:
def get_all_unique_fields(fields_per_ip_dict):
    """Return one new set with every field name found under any IP key."""
    return set().union(*fields_per_ip_dict.values())

In [None]:
with multiprocessing.Pool(processes=number_of_cores) as pool:
    # Queue one field-discovery task per PDML file.
    results = []
    for pdml_file in glob.glob(dir_path+'/*.pdml'):
        results.append(pool.apply_async(fields_per_ip_per_file, args=(pdml_file,)))

    # Block until every task has finished while the pool is still alive;
    # get() re-raises any exception that occurred in a worker.
    for result in results:
        result.get()

In [15]:
# Gather the per-file {ip: fields} dicts produced by the pool, merge them per
# IP, then flatten to the global set of field names.
per_file_fields = [result.get() for result in results]
all_fields_per_ip = combine_fields_different_files(per_file_fields)
all_unique_fields = get_all_unique_fields(all_fields_per_ip)

In [16]:
def recursive_lookup_features(element):
    """Map ``name`` -> ``show`` for every nested ``<field>`` under ``element``.

    Only chains of direct ``<field>`` children are followed, in document
    order. When a name occurs more than once, the value seen last wins.
    """
    collected = {}

    def _visit(node):
        # Record the field first, then descend, so deeper/later fields override.
        for field in node.findall('field'):
            collected[field.get('name')] = field.get('show')
            _visit(field)

    _visit(element)
    return collected

In [17]:
def extract_field_name_value(element):
    """Flatten one ``<packet>`` element into a ``{field_name: show_value}`` dict.

    Every ``<field>`` under the packet's ``<proto>`` children is recorded,
    nested fields included; a repeated name keeps the value seen last.

    The extra key ``'device_id'`` holds the ``show`` value of a top-level
    ``ip.src`` / ``ip.dst`` field that starts with ``IP_SUBNET``. If several
    match, the last one wins; if none match, it is the integer ``0``.
    """
    device_ip = 0
    features = {}

    for proto in element.findall('proto'):
        for field in proto.findall('field'):
            name = field.get('name')
            shown = field.get('show')
            features[name] = shown
            if name in ('ip.src', 'ip.dst') and shown.startswith(IP_SUBNET):
                device_ip = shown
            features.update(recursive_lookup_features(field))

    features['device_id'] = device_ip
    return features

In [19]:
def features_per_packet_per_file(unique_fields,file):
    """Write one CSV row per packet of a PDML file, with a fixed column layout.

    Parameters
    ----------
    unique_fields : iterable of str
        Column names of the CSV. A ``set``/``frozenset`` is sorted so the
        layout is the same for every file and every run; any other iterable
        is used in the order given.
    file : str
        Path of the PDML (XML) file to read.

    Returns
    -------
    bool
        Always ``True`` once the CSV has been written under ``OUTPUT_DIR``.

    Notes
    -----
    Columns missing from a file are written as empty (NaN) cells.
    NOTE(review): the ``'device_id'`` key added by ``extract_field_name_value``
    is only written if it is listed in ``unique_fields`` - confirm whether the
    caller should include it.
    """
    rows = []
    with open(file, 'rb') as f:
        context = ET.iterparse(f, events=('start', 'end'))
        _, root = next(context)  # first event is the start of the document root
        for event, elem in context:
            if event == 'end' and elem.tag == 'packet':
                rows.append(extract_field_name_value(elem))
                # Drop processed packets from the tree to bound memory use.
                root.clear()

    # A set has no stable order and is rejected as a column indexer by recent
    # pandas, so turn it into a sorted list (key=str tolerates non-string names).
    if isinstance(unique_fields, (set, frozenset)):
        columns = sorted(unique_fields, key=str)
    else:
        columns = list(unique_fields)

    # reindex selects and orders the columns and adds absent ones as NaN.
    df = pd.DataFrame(rows).reindex(columns=columns)

    # Swap only the extension; str.replace('pdml', 'csv') would also rewrite
    # any 'pdml' occurring elsewhere in the file name.
    stem = os.path.splitext(os.path.basename(file))[0]
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_filename = os.path.join(OUTPUT_DIR, f"{stem}.csv")
    df.to_csv(output_filename, index=False)
    return True

In [None]:
# Freeze the column order once so every worker writes the same CSV header.
# A sorted list is deterministic across runs and is accepted by DataFrame
# column selection, unlike the raw set.
ordered_fields = sorted(all_unique_fields, key=str)

with multiprocessing.Pool(processes=number_of_cores) as pool:
    results = [
        pool.apply_async(features_per_packet_per_file, args=(ordered_fields, pdml_file))
        for pdml_file in glob.glob(dir_path+'/*.pdml')
    ]

    # Block until every file is written; get() re-raises worker exceptions.
    for result in results:
        result.get()