In [89]:
import csv, datetime, getopt, glob, itertools, logging, os, sys, time
import helpers
import numpy as np
import pandas as pd
import pyshark

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit

In [90]:
# Global Variables
#
# Configuration shared by the BLE feature-extraction cells below: the known
# devices, how each one is recognised in a capture, and where output is written.

# Labels of devices recognised by their advertised device name
devices_devicenames = ['August1', 'August2', 'Door1', 'Door2', 'Energy1', 'Energy2', 'Kevo', 'Push', 'Room1', 'Room2', 'Weather']
# Labels of devices recognised by their public (static) advertising address
devices_publicaddrs = ['Home1', 'Home2']

# Advertised names of the devices above.
# NOTE(review): the order here does not line up with devices_devicenames, so the
# two lists must not be zipped together; use DEVICES_NAMES for the mapping.
id_devicenames = [['Kevo','Unikey'],
                'Eve Door 91B3',
                'Eve Door DC42',
                's',
                'Aug',
                'L402EL4',
                'Eve Energy 51C0',
                'Eve Energy 556E',
                'Eve Weather 943D',
                'Eve Room 8F24',
                'Eve Room 4A04']

# Every device label, alphabetically; one output csv is created per entry
BLE_DEVICES = sorted(devices_devicenames + devices_publicaddrs)

# Devices that can be identified using public (static) advertising addresses
DEVICES_PUBLICADDRS = {'ec:fe:7e:14:44:be' : 'Home1', 
                       'ec:fe:7e:14:44:a1' : 'Home2'}

# Devices that can be identified using device names.
# A value is either a single advertised name or a list of alternative names.
DEVICES_NAMES = {'August1': 'L402EL4',
                'August2': 'Aug',
                'Door1': 'Eve Door 91B3',
                'Door2': 'Eve Door DC42',
                'Energy1': 'Eve Energy 556E',
                'Energy2': 'Eve Energy 51C0',
                'Kevo': ['Kevo', 'Unikey'],
                'Push': 's',
                'Room1': 'Eve Room 4A04',
                'Room2': 'Eve Room 8F24',
                'Weather': 'Eve Weather 943D'}

# Just the reverse of DEVICES_NAMES (advertised name -> device label).
# Derived rather than typed out by hand so the two mappings cannot drift apart;
# a device with several names contributes one entry per name.
NAMES_DEVICES = {name: device
                 for device, names in DEVICES_NAMES.items()
                 for name in (names if isinstance(names, list) else [names])}

# Device label -> device category
DEVICE_TYPES = {'August1': 'lock',
                'August2': 'lock',
                'Door1': 'door',
                'Door2': 'door',
                'Energy1': 'plug',
                'Energy2': 'plug',
                'Home1': 'door',
                'Home2': 'door',
                'Kevo': 'lock',
                'Push': 'temp',
                'Room1': 'temp',
                'Room2': 'temp',
                'Weather': 'temp'}

# Device label -> which data split ('train' or 'test') the device belongs to
TRAINING_TEST = {'August1': 'train',
                 'August2': 'test',
                 'Door1': 'train',
                 'Door2': 'test',
                 'Energy1': 'train',
                 'Energy2': 'train',
                 'Home1': 'train',
                 'Home2': 'train',
                 'Kevo': 'train',
                 'Push': 'train',
                 'Room1': 'train',
                 'Room2': 'test',
                 'Weather': 'train'}

# Output directories (relative to the working directory) and capture location
SRC_DIR = './BLE_Source/'
DST_DIR = './BLE_Destination/'
PCAP_DIR = '/root/Documents/Thesis/BLE_PCAPS/'
# Progress is reported, and timings are normalised, per this many packets
TIMING_PKT_NUMBER = 25000

# Column headers of the per-device csv files; parse_packet writes rows in this order
FEATURES = ['DeviceName', 'AccessAddr', 'AdvertAddr', 'BLE_LL_Length', 'PDUType', 'TxAddr', 'CompanyID','ScanAddr',
           'RFChannel', 'PacketLength', 'Time']

# The name of the working directory is used as the tag in the timing-log file name.
# os.path.basename handles the platform's path separator, unlike slicing on '/'.
path_name = os.getcwd()
DATE = os.path.basename(path_name)
PROC_TIME = "ble_processing_time_" + DATE + ".csv"

In [91]:
def parse_packet(pkt, tgt_files_by_src):
    """
    Parses a given packet and, if it belongs to a known device, appends one
    csv row with the following features to that device's output file:
        (BLE LL)
        - device name
        - access address
        - advertising address
        - BLE LL packet length (bytes)
        - PDU type
        - Tx address type (public or random)
        - company id
        - scanning address (if SCAN_REQ pdu_type)
        
        (BLE RF)
        - rf channel (same as advertising channel: RF 0 = ADV 37, 12 = 38, 39 = 39)
        
        (Frame)
        - total frame length (bytes)
        - epoch time (seconds)      
        
    A packet is attributed to a device when its advertised device name is a key
    of NAMES_DEVICES; otherwise, when its advertising address is a key of
    DEVICES_PUBLICADDRS. Packets matching neither are skipped silently.
    
    If the packet lacks an attribute that is accessed here (for example it has
    no `btle` layer), the AttributeError is caught and the packet number is
    printed as ignored.
    
    Parameters
    ----------
    pkt: (Pyshark packet object) the packet from which features will be extracted
    tgt_files_by_src: (dictionary) a dictionary of open csv files.
        The keys are device labels (the values of NAMES_DEVICES and
        DEVICES_PUBLICADDRS), and the values are the open csv files.
        
    Returns
    -------
    none
    """
    
    try:
        btle = pkt.btle
        
        # The two fields that can identify a known device
        advAddr = btle.get_field_value('advertising_address')
        deviceName = btle.get_field_value('btcommon_eir_ad_entry_device_name')
        
        # A known device name takes precedence. The advertising address is still
        # consulted when the packet carries a name that is not known, so that a
        # public-address device is not dropped merely because it broadcasts a name.
        if deviceName is not None and deviceName in NAMES_DEVICES:
            id_key = NAMES_DEVICES[deviceName]
        elif advAddr in DEVICES_PUBLICADDRS:
            id_key = DEVICES_PUBLICADDRS[advAddr]
        else:
            return
        
        # Remaining BLE LL features
        accessAddr = btle.get_field_value('access_address')
        bleLength = btle.get_field_value('length')
        pduType = btle.get_field_value('advertising_header_pdu_type')
        txAddr = btle.get_field_value('advertising_header_randomized_tx')
        companyID = btle.get_field_value('btcommon_eir_ad_entry_company_id')
        scanAddr = btle.get_field_value('scanning_address')
        
        # BLE RF
        rfChannel = pkt.btle_rf.get_field_value('channel')
        
        # Frame
        pktLength = pkt.frame_info.get_field_value('len')
        epochTime = pkt.frame_info.get_field_value('time_epoch')
        
        # Output matches the order of FEATURES
        output = [deviceName, accessAddr, advAddr, bleLength,
                  pduType, txAddr, companyID, scanAddr,
                  rfChannel,
                  pktLength, epochTime]
        
        csv.writer(tgt_files_by_src[id_key]).writerow(output)
    
    except AttributeError:
        # Single-argument form prints the same text under Python 2 and Python 3
        print("ignored:  %s" % (pkt.number,))

In [98]:
def ble_extract_packet_features(filename = os.path.join(PCAP_DIR, 'master.cap'), create_master=True):
    """
    Unit that extracts wanted features out of packets in a BLE packet capture file.
    The feature extractor focuses on features derived from packet information;
    each packet is handed to parse_packet, which writes one csv row per packet
    that belongs to a known device.
    Produces one csv file for each device in BLE_DEVICES (see Global Variables),
    and a timing log named by PROC_TIME in the working directory.
    
    Parameters
    ----------
    filename: (string) the path of the packet capture file to process
    create_master: (bool) if True, first run `mergecap` to merge every *.pcap
        file in PCAP_DIR into PCAP_DIR/master.cap. A failed merge is reported
        and processing continues with `filename` regardless.
    
    Output
    ------
    Source directory: (filesystem) SRC_DIR receives one csv file per device in
        BLE_DEVICES, each starting with the FEATURES header row
    Timing log: (filesystem) PROC_TIME is overwritten with a header row, the
        per-packet processing totals, and the start/finish timestamps
    
    Returns
    -------
    none
    """
    
    # Start the timing log afresh with its column headers
    with open(PROC_TIME, 'w') as pt_file:
        csv.writer(pt_file).writerow(["Unit", "Total Packets Processed", "Total Process Time", "Average Process Time"])

    # Initialize counters
    pkt_count = 0
    total_time_processing = 0
    total_time_start = time.time()

    # Open csv file for each device, keyed by device label
    tgt_files_by_src = {}
    
    # Combine all pcaps in directory in one master pcap
    if create_master:
        # The wildcard is expanded by the shell that os.system starts.
        # PCAP_DIR is interpolated unquoted, so it must not contain whitespace.
        merge_cmd = 'mergecap %s -w %s' % (os.path.join(PCAP_DIR, '*.pcap'),
                                           os.path.join(PCAP_DIR, 'master.cap'))
        if os.system(merge_cmd) != 0:
            print('Could not make master capture file')

    # Initialize capture file 
    cap = pyshark.FileCapture(filename, only_summaries=False)

    # Initialize output folders
    # NOTE(review): presumably this (re)creates SRC_DIR/DST_DIR - confirm in helpers
    helpers.init_dirs('ble')
    
    try:
        # Open output files for each BLE device
        for device in BLE_DEVICES:
            tgt_files_by_src[device] = open(os.path.join(SRC_DIR, device + ".csv"), 'a')
            
            # Initialize with column headers
            csv.writer(tgt_files_by_src[device]).writerow(FEATURES)
        
        # Go through each packet in capture, and store pertinent packets to csv files
        for pkt in cap:
            
            if pkt_count % TIMING_PKT_NUMBER == 0:
                print("Working packet # %s ..." % pkt_count)
            pkt_count += 1

            # Time only the feature extraction, not the reading of the capture
            time_start_singlepacket = time.time()
            parse_packet(pkt, tgt_files_by_src)
            total_time_processing += time.time() - time_start_singlepacket

        total_time_elapsed = time.time() - total_time_start
    finally:
        # Close files, including when opening or parsing fails part-way through
        for open_file in tgt_files_by_src.values():
            open_file.close()
        
    # Calculate time variables
    final_time = time.time()
    if pkt_count:
        normalized_total_time = (TIMING_PKT_NUMBER * total_time_elapsed) / pkt_count
        normalized_processing_time = (TIMING_PKT_NUMBER * total_time_processing) / pkt_count
    else:
        # An empty capture has nothing to normalise over
        normalized_total_time = 0.0
        normalized_processing_time = 0.0

    # Print time variables (single-argument form prints the same text under Python 2 and 3)
    print("Total number of packets processed:  %s" % pkt_count)
    print("Total data processing time:  %s" % total_time_elapsed)
    print("Normalized total processing time per 25k packets:  %s" % normalized_total_time)
    print("Total capture file processing time:  %s" % total_time_processing)
    print("Normalized capture file processing time:  %s" % normalized_processing_time)

    # Print out time metrics to csv
    with open(PROC_TIME, 'a') as pt_file:
        pt_writer = csv.writer(pt_file)
        pt_writer.writerow(["Packet capture iteration", pkt_count, total_time_processing, normalized_processing_time])
        pt_writer.writerow(["Component start and finish time", total_time_start, final_time, final_time-total_time_start])

In [99]:
# Process one capture straight from PCAP_DIR; no merged master capture is built.
ble_extract_packet_features(filename=os.path.join(PCAP_DIR, 'home1home2-15min.pcap'), create_master=False)

Old ./BLE_Source deleted
Old ./BLE_Destination deleted
Working packet # 0 ...
Working packet # 25000 ...
Working packet # 50000 ...
Working packet # 75000 ...
Working packet # 100000 ...
Working packet # 125000 ...
Working packet # 150000 ...
Working packet # 175000 ...
Working packet # 200000 ...
Total number of packets processed:  200418
Total data processing time:  246.268482924
Normalized total processing time per 25k packets:  30.7193569095
Total capture file processing time:  48.9026987553
Normalized capture file processing time:  6.10008816015
