In [51]:
import csv, datetime, getopt, glob, itertools, logging, os, sys, time
import helpers
import numpy as np
import pandas as pd
import pyshark

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit

In [52]:
# Global Variables

devices_devicenames = ['August1', 'August2', 'Door1', 'Door2', 'Energy1', 
                       'Energy2', 'Kevo', 'Push', 'Room1', 'Room2', 'Weather']
devices_publicaddrs = ['Home1', 'Home2']

id_devicenames = [['Kevo','Unikey'],
                'Eve Door 91B3',
                'Eve Door DC42',
                's',
                'Aug',
                'L402EL4',
                'Eve Energy 51C0',
                'Eve Energy 556E',
                'Eve Weather 943D',
                'Eve Room 8F24',
                'Eve Room 4A04']

BLE_DEVICES = sorted(devices_devicenames + devices_publicaddrs)

# Devices that can be identified using public (static) advertising addresses
DEVICES_PUBLICADDRS = {'ec:fe:7e:14:44:be' : 'Home1', 
                       'ec:fe:7e:14:44:a1' : 'Home2'}

# Devices that can be identified using device names
DEVICES_NAMES = {'August1': 'L402EL4',
                'August2': 'Aug',
                'Door1': 'Eve Door 91B3',
                'Door2': 'Eve Door DC42',
                'Energy1': 'Eve Energy 556E',
                'Energy2': 'Eve Energy 51C0',
                'Kevo': ['Kevo', 'Unikey'],
                'Push': 's',
                'Room1': 'Eve Room 4A04',
                'Room2': 'Eve Room 8F24',
                'Weather': 'Eve Weather 943D'}

# Just the reverse of DEVICES_NAMES
NAMES_DEVICES = {'Aug': 'August2',
                 'Eve Door 91B3': 'Door1',
                 'Eve Door DC42': 'Door2',
                 'Eve Energy 51C0': 'Energy2',
                 'Eve Energy 556E': 'Energy1',
                 'Eve Room 4A04': 'Room1',
                 'Eve Room 8F24': 'Room2',
                 'Eve Weather 943D': 'Weather',
                 'Kevo': 'Kevo',
                 'L402EL4': 'August1',
                 'Unikey': 'Kevo',
                 's': 'Push'}

DEVICE_TYPE = {'August1': 'lock',
                'August2': 'lock',
                'Door1': 'door',
                'Door2': 'door',
                'Energy1': 'plug',
                'Energy2': 'plug',
                'Home1': 'door',
                'Home2': 'door',
                'Kevo': 'lock',
                'Push': 'temp',
                'Room1': 'temp',
                'Room2': 'temp',
                'Weather': 'temp'}

TRAINING_TEST = {'August1': 'train',
                 'August2': 'test',
                 'Door1': 'train',
                 'Door2': 'test',
                 'Energy1': 'train',
                 'Energy2': 'train',
                 'Home1': 'train',
                 'Home2': 'train',
                 'Kevo': 'train',
                 'Push': 'train',
                 'Room1': 'train',
                 'Room2': 'test',
                 'Weather': 'train'}

PDU_TYPES = {0: 'ADV_IND',
             1: 'ADV_DIRECT_IND',
             2: 'ADV_NONCONN_IND',
             3: 'SCAN_REQ',
             4: 'SCAN_RSP',
             5: 'CONNECT_REQ',
             6: 'ADV_SCAN_IND'}

SRC_DIR = './BLE_Source/'
DST_DIR = './BLE_Destination/'
PCAP_DIR = '/root/Documents/Thesis/BLE_PCAPS/'
TIMING_PKT_NUMBER = 25000

FEATURES = ['Name', 'DeviceName', 'AccessAddr', 'AdvertAddr', 'BLE_LL_Length', 
            'PDUTypeNum', 'TxAddr', 'CompanyID','ScanAddr',
            'RFChannel', 'PacketLength', 'Time']

path_name = os.getcwd()
DATE = path_name[path_name.rindex('/')+1:]
PROC_TIME = "ble_processing_time_" + DATE + ".csv"

In [53]:
def parse_packet(pkt, tgt_files_by_src):
    """
    Parses a given packet and extracts the following features:
        (BLE LL)
        - device name
        - access address
        - advertising address
        - BLE LL packet length (bytes)
        - PDU type
        - Tx address type (public or random)
        - company id
        - scanning address (if SCAN_REQ pdu_type)
        
        (BLE RF)
        - rf channel (same as advertising channel: RF 0 = ADV 37, 12 = 38, 39 = 39)
        
        (Frame)
        - total frame length (bytes)
        - epoch time (seconds)      
        
    The features of the packet are written out to a csv row, which is
    in turn written out to a csv file in the given dictionaries.
    
    Parameters
    ----------
    pkt: (Pyshark packet object) the packet from which features will be extracted
    tgt_files_by_src: (dictionary) a dictionary of open csv files.
        The keys are device source addresses, and the values are the open csv files.
    tgt_files_by_dst: (dictionary) a dictionary of open csv files.
        The keys are device destination addresses, and the values are the open csv files.
    """
    
    public_addrs = DEVICES_PUBLICADDRS.keys()
    known_names = NAMES_DEVICES.keys()
    
    try:        
        # Find devices with known advertising addresses or device_names
        advAddr = pkt.btle.get_field_value('advertising_address')        
        name = pkt.btle.get_field_value('btcommon_eir_ad_entry_device_name')
        
        # Assign an identifier based on whether a known advAddr or name was found
        identifier, identifier_type = (advAddr,'advAddr') if name == None else (name,'name')       
        
        if (identifier in public_addrs) or (identifier in known_names):
                       
            # BLE LL features
            deviceName = pkt.btle.get_field_value('btcommon_eir_ad_entry_device_name')
            accessAddr = pkt.btle.get_field_value('access_address')
            advAddr = pkt.btle.get_field_value('advertising_address')
            bleLength = pkt.btle.get_field_value('length')
            pduType = pkt.btle.get_field_value('advertising_header_pdu_type')
            txAddr = pkt.btle.get_field_value('advertising_header_randomized_tx')
            companyID = pkt.btle.get_field_value('btcommon_eir_ad_entry_company_id')
            scanAddr = pkt.btle.get_field_value('scanning_address')
            
            # BLE RF
            rfChannel = pkt.btle_rf.get_field_value('channel')
            
            # Bluetooth
            pktLength = pkt.frame_info.get_field_value('len')
            epochTime = pkt.frame_info.get_field_value('time_epoch')       
            
            # Name as used in thesis document
            name = DEVICES_PUBLICADDRS[identifier] if identifier_type == 'advAddr' else NAMES_DEVICES[identifier]
                        
            # Output matches the order of FEATURES
            output = [name, deviceName, accessAddr, advAddr, bleLength, pduType, txAddr, companyID, scanAddr,
                      rfChannel,
                      pktLength, epochTime]
            
            # Write features to csv           
            csv.writer(tgt_files_by_src[name]).writerow(output)
           
    
    except AttributeError:
        print "ignored: ", pkt.number            

In [65]:
def ble_extract_packet_features(filename = os.path.join(PCAP_DIR, 'master.pcap'), create_master=True):
    """
    Unit that extracts wanted features out of packets in a packet capture file.
    The feature_extractor focuses on features derived from packet information. 
    Secondary features are processed by the make_dataframe function.
    Produces two csv files for each device in WIFI_DEVICES (see Global Variables).
    One file is for all packets where the device is the source; the other is where the device is the destination.
    
    Parameters
    ----------
    filename: (string) the absolute path of the packet capture file
    
    Output
    ------
    Source directory: (filesystem) creates a directory containing csv files for each device 
        where it is the source of the packet
    Destination directory: (filesystem) creates a directory containing csv files for each device 
        where it is the destination of the packet
    
    Returns
    -------
    none
    """
    
    # Prepare writers
    pt_file = open(PROC_TIME, 'w')
    csv.writer(pt_file).writerow(["Unit", "Total Packets Processed", "Total Process Time", "Average Process Time"])
    pt_file.close()

    # Initialize counters
    pkt_count = 0
    total_time_processing = 0
    total_time_start = time.time()

    # Initialize dicts for each device
    tgt_files_by_src = {}
    
    # Combine all pcaps in directory in one master pcap
    if (create_master):
        try:
            if os.path.exists("/root/Documents/Thesis/BLE_PCAPS/master.pcap"):
                os.remove("/root/Documents/Thesis/BLE_PCAPS/master.pcap")
                
            ret = os.system('mergecap /root/Documents/Thesis/BLE_PCAPS/*.pcap -w 
                            /root/Documents/Thesis/BLE_PCAPS/master.pcap')
            if ret != 0:
                raise OSError
        except OSError:
            print 'Could not make master capture file'

    # Initialize capture file 
    cap = pyshark.FileCapture(filename, only_summaries=False)

    # Get time of first packet
    prev_pkt_time = cap[0].frame_info.time_epoch

    # Initialize output folders
    helpers.init_dirs('ble')
    
    # Open output files for each Wi-Fi device
    for device in BLE_DEVICES:
        tgt_files_by_src[device] = open(SRC_DIR + device + ".csv", 'a')
        
        # Initialize with column headers
        csv.writer(tgt_files_by_src[device]).writerow(FEATURES)
    
    # Go through each packet in capture, and store pertinent packets to csv files
    for pkt in cap:
        
        if pkt_count % TIMING_PKT_NUMBER == 0:
            print "Working packet #", pkt_count, "..."
        pkt_count += 1

        time_start_singlepacket = time.time()
        parse_packet(pkt, tgt_files_by_src)
        total_time_processing += time.time() - time_start_singlepacket

    total_time_elapsed = time.time() - total_time_start
    
    # Close files
    for open_file in tgt_files_by_src.values():
        open_file.close()
        
    # Calculate time variables
    final_time = time.time()
    normalized_total_time = (TIMING_PKT_NUMBER * total_time_elapsed) / pkt_count
    normalized_processing_time = (TIMING_PKT_NUMBER * total_time_processing) / pkt_count

    # Print time variables
    print "Total number of packets processed: ", pkt_count
    print "Total data processing time: ", total_time_elapsed
    print "Normalized total processing time per 25k packets: ", normalized_total_time
    print "Total capture file processing time: ", total_time_processing
    print "Normalized capture file processing time: ", normalized_processing_time

    # Print out time metrics to csv
    pt_file = open(PROC_TIME, 'a')
    csv.writer(pt_file).writerow(["Packet capture iteration", pkt_count, 
                                  total_time_processing, normalized_processing_time])
    csv.writer(pt_file).writerow(["Component start and finish time", total_time_start, 
                                  final_time, final_time-total_time_start])
    pt_file.close()

In [55]:
def count_assoc_pkts(df, device):
    """
    Gets the count of packets of a given device that are sent within a second of each other (associated packets)
    
    Parameters
    ----------
    df: (dataframe) the dataframe containing the packet information
    device: (string) the name of the device for which the assoc_pkt count will be calculated
    
    Output
    ------
    None
    
    Returns
    -------
    assoc_count: (pandas series) contains the assoc_packet count for each packet. 
                Uses the index of the packet from the dataframe
    """
        
    ASSOC_PKT_THRESHOLD = 1 # the threshold in seconds within which a packet will be considered an assoc_pkt

    # Extract time values of all packets belonging to a certain device
    df_device = df[df["Name"]==device]
    pkt_time_values = np.array(df_device["Time"].values)
    
    assoc_pkt_counts = []
    
    # Iterate through each packet of the device
    for pkt_index in range(0,len(df_device)):  

        # Create an array of size=len(pkt_time_values) that contains the time value of packet X
        pkt_time = np.full((len(pkt_time_values),),df_device.iloc[pkt_index]["Time"])

        # Calculate the time difference between packet X and all other packets
        diff = np.abs(np.subtract(pkt_time, pkt_time_values))

        # Calculate the count of packets that would be considered an assoc_pkt based on ASSOC_PKT_THRESHOLD
        assoc_pkts = (diff <= ASSOC_PKT_THRESHOLD).sum()
        assoc_pkt_counts.append(assoc_pkts)
        
    
    assoc_count = pd.Series(assoc_pkt_counts, index=df_device.index)
    return assoc_count

In [56]:
def make_dataframe(path='/root/Documents/Thesis/Code/BLE_Source'):
    """
    Unit that takes all the csv files produced by the feature_extractor unit 
    and puts them into a pandas dataframe.
    Returns a clean dataframe with all good data

    Parameters
    ----------
    path: (filesystem) the absolute path of the folder containing the csv files

    Output
    ------
    none

    Returns
    -------
    dataframe: (pandas dataframe) a useful data structure for machine learning
    counts: (pandas series) packet counts for each device 
    """
    
    # Search the path for csv files
    all_csvs = glob.glob(os.path.join(path, "*.csv"))

    # Collect all csvs in one dataframe
    df_from_each_file = (pd.read_csv(f) for f in all_csvs)
    df = pd.concat(df_from_each_file, ignore_index=True, sort=False)

    # Add device type of each packet
    df["DeviceType"] = df["Name"].map(DEVICE_TYPE)
        
    # Add whether device is a training or test device
    df["Set"] = df["Name"].map(TRAINING_TEST)
    
    # One-hot encode device type (response variable)
    deviceType_series = pd.get_dummies(df["DeviceType"])
    df = pd.concat([df, deviceType_series], axis=1)
    
    # TODO: One-hot encode company ID 
    
    # TODO: One-hot encode access address
    
    # TODO: One-hot encode adv address
    
    # TODO: One-hot encode scanning address
    
    # One-hot encode PDU_type
    df["PDUType"] = df["PDUTypeNum"].map(PDU_TYPES)
    pduType_series = pd.get_dummies(df["PDUType"])
    df = pd.concat([df, pduType_series], axis=1)
    
    # Get number of associated packets for each packet
    list_assoc_pkts = []
#     for device in list(df["Name"].unique()):
    for device in BLE_DEVICES:
        assoc_pkts = count_assoc_pkts(df, device)
        list_assoc_pkts.append(assoc_pkts)
    df["Assoc_Packets"] = pd.concat(list_assoc_pkts)
    
    # Fill NaNs with 0
    df["CompanyID"] = df["CompanyID"].fillna(0)
    df["ScanAddr"] = df["ScanAddr"].fillna(0)
    
    # Count packets for each device
    device_counts = df["Name"].value_counts()
    print device_counts
        
    return df


In [57]:
def random_forest_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    randomforest = RandomForestClassifier(random_state=0, n_jobs=2)
    rf_model = randomforest.fit(X_train, y_train)

    preds = rf_model.predict(X_test)
    score = rf_model.score(X_test, y_test)
    
#     print_confusion_matrix()
    
    time_elapsed = time.time() - time_start
    return {'Score' : score, 'Time' : time_elapsed}

In [58]:
def k_neighbors_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=2)
    knn_model = knn.fit(X_train, y_train)
    
    preds = knn_model.predict(X_test)
    score = knn_model.score(X_test, y_test)
    
#     print_confusion_matrix()
    
    time_elapsed = time.time() - time_start
    return {'Score' : score, 'Time' : time_elapsed}

In [59]:
def lda_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    lda = LinearDiscriminantAnalysis()
    lda_model = lda.fit(X_train, y_train)
    
    preds = lda_model.predict(X_test)
    score = lda_model.score(X_test, y_test)
    
#     print_confusion_matrix()

    time_elapsed = time.time() - time_start
    return {'Score' : score, 'Time' : time_elapsed}

In [60]:
def one_vs_all_classify(df, features_list, y_list):
    time_start = time.time()
    
    # Divide df by train and test devices
    df_test = df[df["Set"]=="test"]
    df_train = df[df["Set"]=="train"]
    
    # Train using chosen features
    X_train = df_train[features_list]
    X_test = df_test[features_list]

    for device_type in y_list:
        # Set one device type as y
        y_train = df_train[device_type]
        y_test = df_test[device_type]

        time_start_clf = time.time()

        rf_clf = random_forest_classifier(X_train, y_train, X_test, y_test)
        knn_clf = k_neighbors_classifier(X_train, y_train, X_test, y_test)
        lda_clf = lda_classifier(X_train, y_train, X_test, y_test)

        time_elapsed_clf = time.time() - time_start_clf

        print "Device Type:", device_type
        print "Random Forest Score:", rf_clf['Score'], "Time: ", rf_clf['Time']
        print "KNN Score:", knn_clf['Score'], "Time: ", knn_clf['Time']
        print "LDA Score:", lda_clf['Score'], "Time: ", lda_clf['Time']
        print "Total time (classifiers):", time_elapsed_clf
        print ""
    
    print "Total time (one vs all_classify):", time.time() - time_start
    print ""

In [61]:
def one_vs_one_classify(df, features_list, y_list):
    time_start = time.time()
    
    # Get possible combinations for one vs one
    combinations = [combination for combination in itertools.combinations(y_list, 2)]

    for device_pair in combinations:
        # Only use data with the two device types needed for one vs one classification
        pos_device_type = device_pair[0]
        neg_device_type = device_pair[1]
        df_1v1 = df[(df["DeviceType"]==pos_device_type) | (df["DeviceType"]==neg_device_type)]

        # Separate df into train and test sets
        df_train = df_1v1[df_1v1["Set"]=="train"]
        df_test = df_1v1[df_1v1["Set"]=="test"]
        X_train = df_train[features_list]
        X_test = df_test[features_list]
        y_train = df_train[pos_device_type]
        y_test = df_test[pos_device_type]
        
        time_start_clf = time.time()

        rf_clf = random_forest_classifier(X_train, y_train, X_test, y_test)
        knn_clf = k_neighbors_classifier(X_train, y_train, X_test, y_test)
        lda_clf = lda_classifier(X_train, y_train, X_test, y_test)

        time_elapsed_clf = time.time() - time_start_clf

        print "Device Pair:", device_pair
        print "Random Forest Score:", rf_clf['Score'], "Time: ", rf_clf['Time']
        print "KNN Score:", knn_clf['Score'], "Time: ", knn_clf['Time']
        print "LDA Score:", lda_clf['Score'], "Time: ", lda_clf['Time']
        print "Total time (classifiers):", time_elapsed_clf
        print ""
    
    print "Total time (one vs one_classify):", time.time() - time_start
    print ""

In [66]:
# ble_extract_packet_features(filename='/root/Documents/Thesis/BLE_PCAPS/home1home2-15min.pcap', create_master=False)
ble_extract_packet_features()

Old ./BLE_Source deleted
Old ./BLE_Destination deleted
Working packet # 0 ...
Working packet # 25000 ...
Working packet # 50000 ...
Working packet # 75000 ...
Working packet # 100000 ...
Working packet # 125000 ...
Working packet # 150000 ...
Working packet # 175000 ...
Working packet # 200000 ...
Working packet # 225000 ...
Working packet # 250000 ...
Working packet # 275000 ...
Working packet # 300000 ...
Working packet # 325000 ...
Working packet # 350000 ...
Working packet # 375000 ...
Working packet # 400000 ...
Working packet # 425000 ...
Working packet # 450000 ...
Working packet # 475000 ...
Working packet # 500000 ...
Working packet # 525000 ...
Working packet # 550000 ...
Working packet # 575000 ...
Working packet # 600000 ...
Working packet # 625000 ...
Working packet # 650000 ...
Working packet # 675000 ...
Working packet # 700000 ...
Working packet # 725000 ...
Working packet # 750000 ...
Working packet # 775000 ...
Working packet # 800000 ...
Working packet # 825000 ...
W

In [67]:
time_start = time.time()
df = make_dataframe()
print "Time for dataframe:", time.time() - time_start

August2    224739
Energy1     79039
Energy2     71741
Home1       58810
Home2       58614
Push        32761
Kevo        21107
August1     17314
Weather      8643
Room2        8133
Room1        7728
Door1        7374
Door2        4154
Name: Name, dtype: int64
Time for dataframe: 537.221572161


In [69]:
# Run One vs All  and One vs One classification strategies
features_list = [
#     'AccessAddr', 'AdvertAddr', 'ScanAddr',
    'BLE_LL_Length', 'TxAddr', 'CompanyID',
#     'RFChannel',
    'PacketLength', 'Time', 'Assoc_Packets',
    'ADV_DIRECT_IND', 'ADV_IND', 'ADV_NONCONN_IND', 
    'ADV_SCAN_IND', 'CONNECT_REQ', 'SCAN_REQ', 'SCAN_RSP']

y_list = ["door", "lock", "plug", "temp"]

time_start = time.time()

print "One vs all"
one_vs_all_classify(df, features_list, y_list)

print "One vs one"
one_vs_one_classify(df, features_list, y_list)

print "Total time (one vs one & one vs all classification):", time.time() - time_start

One vs all
Device Type: door
Random Forest Score: 0.03878899361251508 Time:  1.25428891182
KNN Score: 0.9743994329735979 Time:  103.754995108
LDA Score: 0.03437597563136534 Time:  0.340323925018
Total time (classifiers): 105.349692822

Device Type: lock
Random Forest Score: 0.2745184072633382 Time:  1.04208707809
KNN Score: 0.9978441183667615 Time:  103.307614088
LDA Score: 0.05184663285884249 Time:  0.339652061462
Total time (classifiers): 104.689419031

Device Type: plug
Random Forest Score: 0.9999746863213318 Time:  0.951542139053
KNN Score: 0.9977470825985335 Time:  103.337182999
LDA Score: 0.0950655202382861 Time:  0.349932193756
Total time (classifiers): 104.638740063

Device Type: temp
Random Forest Score: 0.9786225983647363 Time:  1.04567193985
KNN Score: 0.9750491507260807 Time:  103.189161062
LDA Score: 0.9656915275117498 Time:  0.332150936127
Total time (classifiers): 104.567066908

Total time (one vs all_classify): 419.3985641

One vs one
Device Pair: ('door', 'lock')
Rando

In [70]:
# Run One vs All  and One vs One classification strategies
features_list = [
#     'AccessAddr', 'AdvertAddr', 'ScanAddr',
    'BLE_LL_Length', 'TxAddr', 'CompanyID',
#     'RFChannel',
    'PacketLength', 'Time', 'Assoc_Packets',
    'ADV_DIRECT_IND', 'ADV_IND', 'ADV_NONCONN_IND', 
    'ADV_SCAN_IND', 'CONNECT_REQ', 'SCAN_REQ', 'SCAN_RSP']

y_list = ["door", "lock", "temp"]

time_start = time.time()

print "One vs all"
one_vs_all_classify(df, features_list, y_list)

print "One vs one"
one_vs_one_classify(df, features_list, y_list)

print "Total time (one vs one & one vs all classification):", time.time() - time_start

One vs all
Device Type: door
Random Forest Score: 0.03878899361251508 Time:  1.13068795204
KNN Score: 0.9743994329735979 Time:  103.12498188
LDA Score: 0.03437597563136534 Time:  0.338510036469
Total time (classifiers): 104.594233036

Device Type: lock
Random Forest Score: 0.2745184072633382 Time:  0.894592046738
KNN Score: 0.9978441183667615 Time:  102.940897226
LDA Score: 0.05184663285884249 Time:  0.336610794067
Total time (classifiers): 104.172151089

Device Type: temp
Random Forest Score: 0.9786225983647363 Time:  1.03509902954
KNN Score: 0.9750491507260807 Time:  103.008431911
LDA Score: 0.9656915275117498 Time:  0.334670066833
Total time (classifiers): 104.37825489

Total time (one vs all_classify): 313.296242952

One vs one
Device Pair: ('door', 'lock')
Random Forest Score: 0.9975490731477153 Time:  0.4849858284
KNN Score: 0.9975578108548536 Time:  22.4209668636
LDA Score: 0.018143848872617337 Time:  0.160078048706
Total time (classifiers): 23.0660808086

Device Pair: ('door', 