In [15]:
import pyshark
import glob, os
import numpy as np
import pandas as pd
import itertools
import time
from collections import Counter

In [16]:
# Global Variables

# Labels of the devices that are recognised through their advertised device name
devices_devicenames = [
    'August1', 'August2',
    'Door1', 'Door2',
    'Energy1', 'Energy2',
    'Kevo',
    'Push',
    'Room1', 'Room2',
    'Weather',
]

# Labels of the devices that are recognised through a public advertising address
devices_publicaddrs = ['Home1', 'Home2']

# Advertised name strings; the first entry is a nested list because that
# device is listed under two different names
id_devicenames = [
    ['Kevo', 'Unikey'],
    'Eve Door 91B3',
    'Eve Door DC42',
    's',
    'Aug',
    'L402EL4',
    'Eve Energy 51C0',
    'Eve Energy 556E',
    'Eve Weather 943D',
    'Eve Room 8F24',
    'Eve Room 4A04',
]

# Every device label (name-identified + address-identified), in sorted order
BLE_DEVICES = list(devices_devicenames)
BLE_DEVICES.extend(devices_publicaddrs)
BLE_DEVICES.sort()

# Devices that can be identified using public (static) advertising addresses
# (advertising address -> device label)
DEVICES_PUBLICADDRS = {'ec:fe:7e:14:44:be' : 'Home1',
                       'ec:fe:7e:14:44:a1' : 'Home2'}

# Devices that can be identified using device names
# (device label -> advertised name, or a list of names when a device uses several)
DEVICES_NAMES = {'August1': 'L402EL4',
                'August2': 'Aug',
                'Door1': 'Eve Door 91B3',
                'Door2': 'Eve Door DC42',
                'Energy1': 'Eve Energy 556E',
                'Energy2': 'Eve Energy 51C0',
                'Kevo': ['Kevo', 'Unikey'],
                'Push': 's',
                'Room1': 'Eve Room 4A04',
                'Room2': 'Eve Room 8F24',
                'Weather': 'Eve Weather 943D'}

# Reverse of DEVICES_NAMES (advertised name -> device label).
# Derived rather than typed out by hand so the two tables cannot drift apart;
# a device with a list of names contributes one entry per name.
NAMES_DEVICES = {
    advertised_name: device
    for device, names in DEVICES_NAMES.items()
    for advertised_name in (names if isinstance(names, list) else [names])
}

# Device label -> device type (mapped onto each packet's "Name" column
# to produce the "DeviceType" column)
DEVICE_TYPE = dict(
    August1='lock',
    August2='lock',
    Door1='door',
    Door2='door',
    Energy1='plug',
    Energy2='plug',
    Home1='door',
    Home2='door',
    Kevo='lock',
    Push='temp',
    Room1='temp',
    Room2='temp',
    Weather='temp',
)

# Device label -> which data set ('train' or 'test') the device belongs to
TRAINING_TEST = dict(
    August1='train',
    August2='test',
    Door1='train',
    Door2='test',
    Energy1='train',
    Energy2='train',
    Home1='train',
    Home2='train',
    Kevo='train',
    Push='train',
    Room1='train',
    Room2='test',
    Weather='train',
)

# PDU type number -> PDU type name; the position in the list is the number
PDU_TYPES = dict(enumerate([
    'ADV_IND',
    'ADV_DIRECT_IND',
    'ADV_NONCONN_IND',
    'SCAN_REQ',
    'SCAN_RSP',
    'CONNECT_REQ',
    'ADV_SCAN_IND',
]))

# Folder locations used by the notebook.
# NOTE(review): PCAP_DIR is an absolute, machine-specific path - confirm it
# exists before running on another machine.
SRC_DIR = './BLE_Source/'
DST_DIR = './BLE_Destination/'
PCAP_DIR = '/root/Documents/Thesis/BLE_PCAPS/'
# NOTE(review): presumably the number of packets used for timing measurements - confirm
TIMING_PKT_NUMBER = 25000

# Feature names; several of them ("Name", "PDUTypeNum", "CompanyID", "Time")
# are read as dataframe columns by make_dataframe / count_assoc_pkts
FEATURES = ['Name', 'DeviceName', 'AccessAddr', 'AdvertAddr', 'BLE_LL_Length', 'PDUTypeNum', 'TxAddr', 'CompanyID','ScanAddr',
           'RFChannel', 'PacketLength', 'Time']

# DATE is the last component of the current working directory
# (presumably the notebook is run from a folder named after the date - confirm).
# os.path.basename is used instead of slicing at the last '/', which raised
# ValueError whenever the path contained no '/' (e.g. Windows-style paths).
path_name = os.getcwd()
DATE = os.path.basename(path_name)
PROC_TIME = "ble_processing_time_" + DATE + ".csv"

In [17]:
def count_assoc_pkts(df, device, threshold=1):
    """
    Gets, for every packet of a given device, the count of that device's packets whose
    "Time" value lies within `threshold` seconds of it (associated packets).
    The count includes the packet itself, so a packet with no neighbours gets 1.

    Parameters
    ----------
    df: (dataframe) the dataframe containing the packet information;
        must have the columns "Name" and "Time"
    device: (string) the name of the device for which the assoc_pkt count will be calculated
    threshold: (number) the time window in seconds (inclusive) within which another
        packet is considered an assoc_pkt. Defaults to 1.

    Output
    ------
    None

    Returns
    -------
    assoc_count: (pandas series) contains the assoc_packet count for each packet.
                Uses the index of the packet from the dataframe.
                Empty (dtype int64) when the device has no packets in df.
    """

    # Extract time values of all packets belonging to a certain device
    df_device = df[df["Name"] == device]
    pkt_time_values = df_device["Time"].values

    # Compare each packet's time against the whole array at once. Iterating the
    # array directly avoids a per-packet dataframe row lookup, and broadcasting
    # the scalar avoids allocating a filled array on every iteration.
    assoc_pkt_counts = [
        int((np.abs(pkt_time_values - pkt_time) <= threshold).sum())
        for pkt_time in pkt_time_values
    ]

    # Explicit dtype keeps the result an integer series even when it is empty
    assoc_count = pd.Series(assoc_pkt_counts, index=df_device.index, dtype="int64")
    return assoc_count

In [18]:
def make_dataframe(path='/root/Documents/Thesis/Code/BLE_Source'):
    """
    Unit that takes all the csv files produced by the feature_extractor unit and puts them into a pandas dataframe.

    On top of the csv columns the dataframe gets:
    "DeviceType" and "Set" (looked up from "Name"), "PDUType" (looked up from "PDUTypeNum"),
    one-hot columns for DeviceType and PDUType, and "Assoc_Packets".
    Missing "CompanyID" values are replaced by 0.

    Parameters
    ----------
    path: (filesystem) the absolute path of the folder containing the csv files

    Output
    ------
    Prints the packet count for each device

    Returns
    -------
    dataframe: (pandas dataframe) a useful data structure for machine learning

    Raises
    ------
    ValueError: if the folder contains no csv files
    """

    # Search the path for csv files. glob returns them in arbitrary order, so
    # sort to make the row order of the result reproducible between runs.
    all_csvs = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_csvs:
        raise ValueError("No csv files found in %r" % (path,))

    # Collect all csvs in one dataframe
    df_from_each_file = (pd.read_csv(f) for f in all_csvs)
    df = pd.concat(df_from_each_file, ignore_index=True, sort=False)

    # Add device type of each packet
    df["DeviceType"] = df["Name"].map(DEVICE_TYPE)

    # Add whether device is a training or test device
    df["Set"] = df["Name"].map(TRAINING_TEST)

    # One-hot encode device type (response variable)
    deviceType_series = pd.get_dummies(df["DeviceType"])
    df = pd.concat([df, deviceType_series], axis=1)

    # TODO: One-hot encode company ID

    # One-hot encode PDU_type
    df["PDUType"] = df["PDUTypeNum"].map(PDU_TYPES)
    pduType_series = pd.get_dummies(df["PDUType"])
    df = pd.concat([df, pduType_series], axis=1)

    # Get number of associated packets for each packet.
    # Only the devices listed in BLE_DEVICES are counted; rows whose "Name" is
    # not in that list end up with NaN in "Assoc_Packets".
    list_assoc_pkts = [count_assoc_pkts(df, device) for device in BLE_DEVICES]
    df["Assoc_Packets"] = pd.concat(list_assoc_pkts)

    # Fill NaNs with 0 (fillna returns a new series, so it must be called and assigned back)
    df["CompanyID"] = df["CompanyID"].fillna(0)

    # Count packets for each device
    device_counts = df["Name"].value_counts()
    # Parenthesised form prints the same under Python 2 and also runs on Python 3
    print(device_counts)

    return df


In [4]:
# Open the Home1/Home2 capture from the shared pcap folder (PCAP_DIR is set in
# the global-variables cell) instead of repeating the absolute path here.
cap = pyshark.FileCapture(os.path.join(PCAP_DIR, 'home1home2-15min.pcap'))

# PDU type number of a scan response, as a string (4 is 'SCAN_RSP' in PDU_TYPES)
SCAN_RSP = '4'
# NOTE(review): presumably the advertising address of the device named 's' - confirm
s_adv_addr = 'd8:10:ed:43:60:ac'

# DEVICES_PUBLICADDRS is defined once in the global-variables cell; the
# identical copy that used to be repeated here was dropped.

In [7]:
# Build the packet dataframe from the csv files under make_dataframe's default path.
# The call also prints the per-device packet counts shown in the output below.
df = make_dataframe()

August2    10466
Energy1     4396
Energy2     3997
Home1       3099
Home2       3095
August1     1819
Push        1529
Kevo        1230
Door1        520
Door2        453
Weather      414
Room1        342
Room2        272
Name: Name, dtype: int64


In [14]:
# Replace missing CompanyID values with 0 (safe to re-run: filling again changes nothing),
# then tally how many packets carry each company ID.
df["CompanyID"] = df["CompanyID"].fillna(0)
df["CompanyID"].value_counts()


0.0        13755
465.0      12219
133.0       4068
46966.0     1207
51062.0      208
46967.0      112
449.0         16
209.0         16
8657.0        11
2513.0         4
8837.0         2
33233.0        2
149.0          2
49014.0        2
389.0          1
8401.0         1
467.0          1
193.0          1
977.0          1
18053.0        1
9041.0         1
9429.0         1
Name: CompanyID, dtype: int64