In [1]:
import csv, datetime, getopt, glob, itertools, logging, os, sys, time
import helpers
import numpy as np
import pandas as pd
import pyshark

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Global Variables
ROUTER = '78:d2:94:4d:ab:3e'
WIFI_DEVICES = ['ec:1a:59:e4:fd:41', 'ec:1a:59:e4:fa:09',
                'ec:1a:59:e5:02:0d', '14:91:82:24:dd:35',
                '60:38:e0:ee:7c:e5', '14:91:82:cd:df:3d',
                'b4:75:0e:0d:94:65', 'b4:75:0e:0d:33:d5',
                '94:10:3e:2b:7a:55', '30:8c:fb:3a:1a:ad',
                'd0:73:d5:26:b8:4c', 'd0:73:d5:26:c9:27',
                'ac:84:c6:97:7c:cc', 'b0:4e:26:c5:2a:41',
                '70:4f:57:f9:e1:b8', ROUTER]

DEVICE_NAME = {'ec:1a:59:e4:fd:41' : 'Netcam1', 
               'ec:1a:59:e4:fa:09' : 'Netcam2',
               'ec:1a:59:e5:02:0d' : 'Netcam3',
               '14:91:82:24:dd:35' : 'Insight',
               '60:38:e0:ee:7c:e5' : 'Mini',
               '14:91:82:cd:df:3d' : 'Switch1',
               'b4:75:0e:0d:94:65' : 'Switch2',
               'b4:75:0e:0d:33:d5' : 'Switch3',
               '94:10:3e:2b:7a:55' : 'Switch4',
               '30:8c:fb:3a:1a:ad' : 'Dropcam',
               'd0:73:d5:26:b8:4c' : 'Lifx1', 
               'd0:73:d5:26:c9:27' : 'Lifx2',
               'ac:84:c6:97:7c:cc' : 'Kasa', 
               'b0:4e:26:c5:2a:41' : 'TpBulb',
               '70:4f:57:f9:e1:b8' : 'TpPlug',
                ROUTER : 'Router'}

DEVICE_TYPE = {'ec:1a:59:e4:fd:41' : 'camera',
               'ec:1a:59:e4:fa:09' : 'camera',
               'ec:1a:59:e5:02:0d' : 'camera',
               '14:91:82:24:dd:35' : 'plug',
               '60:38:e0:ee:7c:e5' : 'plug',
               '14:91:82:cd:df:3d' : 'plug',
               'b4:75:0e:0d:94:65' : 'plug',
               'b4:75:0e:0d:33:d5' : 'plug',
               '94:10:3e:2b:7a:55' : 'plug',
               '30:8c:fb:3a:1a:ad' : 'camera',
               'd0:73:d5:26:b8:4c' : 'bulb', 
               'd0:73:d5:26:c9:27' : 'bulb',
               'ac:84:c6:97:7c:cc' : 'camera', 
               'b0:4e:26:c5:2a:41' : 'bulb',
               '70:4f:57:f9:e1:b8' : 'plug',
                ROUTER : 'router'}

TRAINING_TEST = {'ec:1a:59:e4:fd:41' : 'train', 
                 'ec:1a:59:e4:fa:09' : 'train',
                 'ec:1a:59:e5:02:0d' : 'test',
                 '14:91:82:24:dd:35' : 'train',
                 '60:38:e0:ee:7c:e5' : 'train',
                 '14:91:82:cd:df:3d' : 'train',
                 'b4:75:0e:0d:94:65' : 'train',
                 'b4:75:0e:0d:33:d5' : 'train',
                 '94:10:3e:2b:7a:55' : 'test',
                 '30:8c:fb:3a:1a:ad' : 'train',
                 'd0:73:d5:26:b8:4c' : 'train', 
                 'd0:73:d5:26:c9:27' : 'test',
                 'ac:84:c6:97:7c:cc' : 'test', 
                 'b0:4e:26:c5:2a:41' : 'train',
                 '70:4f:57:f9:e1:b8' : 'test'}

DATA_PKT_SUBTYPES = {32 : 'Data',
                     40 : 'QoS_Data',
                     44 : 'QoS_Null'}

FEATURES = ["Time", "PacketLength", "Duration", "SourceAddr", "DestAddr", "SubtypeNum"]
SRC_DIR = './Source/'
DST_DIR = './Destination/'
PCAP_DIR = '/root/Documents/Thesis/PCAPS'
TIMING_PKT_NUMBER = 25000
DATA_FRAME_TYPE = '2'

path_name = os.getcwd()
DATE = path_name[path_name.rindex('/')+1:]
PROC_TIME = "wifi_processing_time_" + DATE + ".csv"

In [3]:
def parse_packet(pkt, tgt_files_by_src):
    """
    Parses a given packet and extracts the following features:
        - destination MAC address
        - source MAC address
        - time of transmission
        - packet length
        
    The features of the packet are written out to a csv row, which is
    in turn written out to a csv file in the given dictionaries.
    
    This code is heavily based on code written by Capt Steven Beyer.
    
    Parameters
    ----------
    pkt: (Pyshark packet object) the packet from which features will be extracted
    tgt_files_by_src: (dictionary) a dictionary of open csv files.
        The keys are device source addresses, and the values are the open csv files.
    tgt_files_by_dst: (dictionary) a dictionary of open csv files.
        The keys are device destination addresses, and the values are the open csv files.
    """
    try:
        pkt_dst = pkt.wlan.da
        pkt_src = pkt.wlan.sa
        
#         if (pkt_src in WIFI_DEVICES) and (pkt_dst in WIFI_DEVICES):
        if (pkt_src in WIFI_DEVICES):
            # Extract features
            pkt_time = pkt.frame_info.time_epoch
            pkt_len = pkt.length
            pkt_duration = pkt.wlan.duration
            pkt_subtype_num = pkt.wlan.fc_type_subtype
            
            # Output matches FEATURES
            output = [pkt_time, pkt_len, pkt_duration, pkt_src, pkt_dst, pkt_subtype_num]
            
            csv.writer(tgt_files_by_src[pkt_src]).writerow(output)            
    
    except AttributeError:
        print "ignored: ", pkt.number            

In [4]:
def get_mac_vendors():
    """
    Uses the macvendors.co API to lookup the vendors of Wi-Fi devices.
    Requires internet access.
    
    Parameters
    ----------
    None
    
    Output
    ------
    None
    
    Returns
    -------
    device_vendors (dict): keys(str) = WIFI_DEVICES MAC addresses, values(str) = vendor names
    """
    import json, requests

    # Get JSON response from API
    vendors_json = []
    for addr in WIFI_DEVICES:
        response = requests.get('http://macvendors.co/api/' + addr).text
        vendors_json.append(response)

    # Extracting company from API response
    vendors = []
    for vendor_json in vendors_json:
        response = json.loads(vendor_json)
        company = str(response['result']['company']).split(' ',1)[0].capitalize()
        vendors.append(company)

    # Put device MAC addresses and vendors into dictionary
    device_vendors = dict(zip(WIFI_DEVICES, vendors))
    
    return device_vendors

In [5]:
def count_assoc_pkts(df, device):
    """
    Gets the count of packets of a given device that are sent within a second of each other (associated packets)
    
    Parameters
    ----------
    df: (dataframe) the dataframe containing the packet information
    device: (string) the name of the device for which the assoc_pkt count will be calculated
    
    Output
    ------
    None
    
    Returns
    -------
    assoc_count: (pandas series) contains the assoc_packet count for each packet. 
                Uses the index of the packet from the dataframe
    """
        
    ASSOC_PKT_THRESHOLD = 1 # the threshold in seconds within which a packet will be considered an assoc_pkt

    # Extract time values of all packets belonging to a certain device
    df_device = df[df["Name"]==device]
    pkt_time_values = np.array(df_device["Time"].values)
    
    assoc_pkt_counts = []
    
    # Iterate through each packet of the device
    for pkt_index in range(0,len(df_device)):  

        # Create an array of size=len(pkt_time_values) that contains the time value of packet X
        pkt_time = np.full((len(pkt_time_values),),df_device.iloc[pkt_index]["Time"])

        # Calculate the time difference between packet X and all other packets
        diff = np.abs(np.subtract(pkt_time, pkt_time_values))

        # Calculate the count of packets that would be considered an assoc_pkt based on ASSOC_PKT_THRESHOLD
        assoc_pkts = (diff <= ASSOC_PKT_THRESHOLD).sum()
        assoc_pkt_counts.append(assoc_pkts)
        
    
    assoc_count = pd.Series(assoc_pkt_counts, index=df_device.index)
    return assoc_count

In [18]:
def wifi_extract_packet_features(filename = os.path.join(PCAP_DIR, 'master.cap'), create_master=True):
    """
    Unit that extracts wanted features out of packets in a packet capture file.
    The feature_extractor focuses on features derived from packet information. 
    Secondary features are processed by the make_dataframe function.
    Produces two csv files for each device in WIFI_DEVICES (see Global Variables).
    One file is for all packets where the device is the source; the other is where the device is the destination.
    
    Parameters
    ----------
    filename: (string) the absolute path of the packet capture file
    
    Output
    ------
    Source directory: (filesystem) creates a directory containing csv files for each device 
        where it is the source of the packet
    Destination directory: (filesystem) creates a directory containing csv files for each device 
        where it is the destination of the packet
    
    Returns
    -------
    none
    """
    
    # Prepare writers
    pt_file = open(PROC_TIME, 'w')
    csv.writer(pt_file).writerow(["Unit", "Total Packets Processed", "Total Process Time", "Average Process Time"])
    pt_file.close()

    # Initialize counters
    pkt_count = 0
    total_time_processing = 0
    total_time_start = time.time()

    # Initialize dicts for each device
    tgt_files_by_src = {}
    
    # Combine all pcaps in directory in one master pcap
    if (create_master):
        try:
            if os.path.exists("/root/Documents/Thesis/PCAPS/master.cap"):
                os.remove("/root/Documents/Thesis/PCAPS/master.cap")
                
            ret = os.system('mergecap /root/Documents/Thesis/PCAPS/wifi* -w /root/Documents/Thesis/PCAPS/master.cap')
            if ret != 0:
                raise OSError
        except OSError:
            print 'Could not make master capture file'

    # Initialize capture file 
    cap = pyshark.FileCapture(filename, only_summaries=False)

    # Get time of first packet
    prev_pkt_time = cap[0].frame_info.time_epoch

    # Initialize output folders
    helpers.init_dirs('wifi')
    
    # Open output files for each Wi-Fi device
    for device in WIFI_DEVICES:
        tgt_files_by_src[device] = open(SRC_DIR + device.replace(':', '.') + ".csv", 'a')
        
        # Initialize with column headers
        csv.writer(tgt_files_by_src[device]).writerow(FEATURES)
    
    # Go through each packet in capture, and store pertinent packets to csv files
    for pkt in cap:
        if pkt_count % TIMING_PKT_NUMBER == 0:
            print "Working packet #", pkt_count, "..."
        pkt_count += 1

        time_start_singlepacket = time.time()
        if pkt.wlan.fc_type == DATA_FRAME_TYPE:
            parse_packet(pkt, tgt_files_by_src)
            total_time_processing += time.time() - time_start_singlepacket

    total_time_elapsed = time.time() - total_time_start
    
    # Close files
    for open_file in tgt_files_by_src.values():
        open_file.close()
        
    # Rename files to device names for readability
    helpers.rename_csv_files(DEVICE_NAME)
        
    # Calculate time variables
    final_time = time.time()
    normalized_total_time = (TIMING_PKT_NUMBER * total_time_elapsed) / pkt_count
    normalized_processing_time = (TIMING_PKT_NUMBER * total_time_processing) / pkt_count

    # Print time variables
    print "Total number of packets processed: ", pkt_count
    print "Total data processing time: ", total_time_elapsed
    print "Normalized total processing time per 25k packets: ", normalized_total_time
    print "Total capture file processing time: ", total_time_processing
    print "Normalized capture file processing time: ", normalized_processing_time

    # Print out time metrics to csv
    pt_file = open(PROC_TIME, 'a')
    csv.writer(pt_file).writerow(["Packet capture iteration", pkt_count, total_time_processing, normalized_processing_time])
    csv.writer(pt_file).writerow(["Component start and finish time", total_time_start, final_time, final_time-total_time_start])
    pt_file.close()

In [7]:
def make_dataframe(path='/root/Documents/Thesis/Code/Source'):
    """
    Unit that takes all the csv files produced by the feature_extractor unit and puts them into a pandas dataframe.
    Returns a clean dataframe with all good data

    Parameters
    ----------
    path: (filesystem) the absolute path of the folder containing the csv files

    Output
    ------
    none

    Returns
    -------
    dataframe: (pandas dataframe) a useful data structure for machine learning
    counts: (pandas series) packet counts for each device 
    """
    
    # Search the path for csv files
#     path='/root/Documents/Thesis/Code/Source'
    all_csvs = glob.glob(os.path.join(path, "*.csv"))

    # Collect all csvs in one dataframe
    df_from_each_file = (pd.read_csv(f) for f in all_csvs)
    df = pd.concat(df_from_each_file, ignore_index=True, sort=False)

    # Add device type, device ID of each packet
    df["DeviceType"] = df["SourceAddr"].map(DEVICE_TYPE)
    df["Name"] = df["SourceAddr"].map(DEVICE_NAME)
    
    # Add whether device is a training or test device
    df["Set"] = df["SourceAddr"].map(TRAINING_TEST)
    
    # One-hot encode device type (response variable)
    deviceType_series = pd.get_dummies(df["DeviceType"])
    df = pd.concat([df, deviceType_series], axis=1)
    
    # One-hot encode MAC vendors
    df["Vendor"] = df["SourceAddr"].map(get_mac_vendors())
    vendor_series = pd.get_dummies(df["Vendor"])
    df = pd.concat([df, vendor_series], axis=1)

    # One-hot encode packet subtype
    df["Subtype"] = df["SubtypeNum"].map(DATA_PKT_SUBTYPES)
    subtype_series = pd.get_dummies(df["Subtype"])
    df = pd.concat([df, subtype_series], axis=1)   
    
    # Get number of associated packets for each packet
    list_assoc_pkts = []
    
#     for device in list(df["Name"].unique()):
    for device in DEVICE_NAME.values():
        assoc_pkts = count_assoc_pkts(df, device)
        list_assoc_pkts.append(assoc_pkts)
    df["Assoc_Packets"] = pd.concat(list_assoc_pkts)
    
    # Count packets for each device
    device_counts = df["Name"].value_counts()
    print device_counts
        
    return df


In [8]:
def random_forest_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    randomforest = RandomForestClassifier(random_state=0, n_jobs=2)
    rf_model = randomforest.fit(X_train, y_train)

    preds = rf_model.predict(X_test)
    score = rf_model.score(X_test, y_test)
    
#     print_confusion_matrix()
    
    time_elapsed = time.time() - time_start
    return {'Score' : score, 'Time' : time_elapsed}

In [9]:
def k_neighbors_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=2)
    knn_model = knn.fit(X_train, y_train)
    
    preds = knn_model.predict(X_test)
    score = knn_model.score(X_test, y_test)
    
#     print_confusion_matrix()
    
    time_elapsed = time.time() - time_start
    return {'Score' : score, 'Time' : time_elapsed}

In [10]:
def lda_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    lda = LinearDiscriminantAnalysis()
    lda_model = lda.fit(X_train, y_train)
    
    preds = lda_model.predict(X_test)
    score = lda_model.score(X_test, y_test)
    
#     print_confusion_matrix()

    time_elapsed = time.time() - time_start
    return {'Score' : score, 'Time' : time_elapsed}

In [11]:
def one_vs_all_classify(df, features_list, y_list):
    time_start = time.time()
    
    # Divide df by train and test devices
    df_test = df[df["Set"]=="test"]
    df_train = df[df["Set"]=="train"]
    
    # Train using chosen features
    X_train = df_train[features_list]
    X_test = df_test[features_list]

    for device_type in y_list:
        # Set one device type as y
        y_train = df_train[device_type]
        y_test = df_test[device_type]

        time_start_clf = time.time()

        rf_clf = random_forest_classifier(X_train, y_train, X_test, y_test)
        knn_clf = k_neighbors_classifier(X_train, y_train, X_test, y_test)
        lda_clf = lda_classifier(X_train, y_train, X_test, y_test)

        time_elapsed_clf = time.time() - time_start_clf

        print "Device Type:", device_type
        print "Random Forest Score:", rf_clf['Score'], "Time: ", rf_clf['Time']
        print "KNN Score:", knn_clf['Score'], "Time: ", knn_clf['Time']
        print "LDA Score:", lda_clf['Score'], "Time: ", lda_clf['Time']
        print "Total time (classifiers):", time_elapsed_clf
        print ""
    
    print "Total time (one vs all_classify):", time.time() - time_start
    print ""

In [12]:
def one_vs_one_classify(df, features_list, y_list):
    time_start = time.time()
    
    # Get possible combinations for one vs one
    combinations = [combination for combination in itertools.combinations(y_list, 2)]

    for device_pair in combinations:
        # Only use data with the two device types needed for one vs one classification
        pos_device_type = device_pair[0]
        neg_device_type = device_pair[1]
        df_1v1 = df[(df["DeviceType"]==pos_device_type) | (df["DeviceType"]==neg_device_type)]

        # Separate df into train and test sets
        df_train = df_1v1[df_1v1["Set"]=="train"]
        df_test = df_1v1[df_1v1["Set"]=="test"]
        X_train = df_train[features_list]
        X_test = df_test[features_list]
        y_train = df_train[pos_device_type]
        y_test = df_test[pos_device_type]
        
        time_start_clf = time.time()

        rf_clf = random_forest_classifier(X_train, y_train, X_test, y_test)
        knn_clf = k_neighbors_classifier(X_train, y_train, X_test, y_test)
        lda_clf = lda_classifier(X_train, y_train, X_test, y_test)

        time_elapsed_clf = time.time() - time_start_clf

        print "Device Pair:", device_pair
        print "Random Forest Score:", rf_clf['Score'], "Time: ", rf_clf['Time']
        print "KNN Score:", knn_clf['Score'], "Time: ", knn_clf['Time']
        print "LDA Score:", lda_clf['Score'], "Time: ", lda_clf['Time']
        print "Total time (classifiers):", time_elapsed_clf
        print ""
    
    print "Total time (one vs one_classify):", time.time() - time_start
    print ""

In [13]:
def print_confusion_matrix():
    return "print_confusion_matrix goes here"

In [20]:
# Main 
pcap_path = os.path.join(PCAP_DIR, 'master.cap')
# feature_extractor(pcap_path)
wifi_extract_packet_features(create_master=True)

Old ./Source deleted
Old ./Destination deleted
Total number of packets processed:  1375941
Total data processing time:  1651.73450613
Normalized total processing time per 25k packets:  30.0109980393
Total capture file processing time:  159.951148272
Normalized capture file processing time:  2.90621378881


In [14]:
df = make_dataframe()

# Limit to two device types
# df = df[(df["DeviceType"]!="bulb") & (df["DeviceType"]!="router")]

# Take out packets from router
df = df[df["DeviceType"]!="router"]

Mini       104280
Router     103593
Dropcam     64568
Kasa        23753
Netcam3      4867
Netcam1      4446
Netcam2      4407
Switch2      3046
Switch1      2668
Switch3      2634
Insight      2556
Switch4      2206
Lifx2         627
TpPlug        587
Lifx1         540
TpBulb        202
Name: Name, dtype: int64


In [28]:
# Run One vs All  and One vs One classification strategies
features_list = [
        # Packet info
        "PacketLength", "Duration", 
        
        # Vendor 
         "Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link",
    
        # 802.11 Data subtype
        "Data", "QoS_Data", "QoS_Null",

        # Associated Packets
        "Assoc_Packets"]

y_list = ["camera", "bulb", "plug"]

time_start = time.time()

print "One vs all"
one_vs_all_classify(df, features_list, y_list)

print "One vs one"
one_vs_one_classify(df, features_list, y_list)

print "Total time (one vs one & one vs all classification):", time.time() - time_start

One vs all
Device Type: camera
Random Forest Score: 0.2422596754057428 Time:  0.523423194885
KNN Score: 0.2833645443196005 Time:  96.3279111385
LDA Score: 0.16086142322097377 Time:  0.143582105637
Total time (classifiers): 96.9950249195

Device Type: bulb
Random Forest Score: 0.9948813982521848 Time:  0.538688182831
KNN Score: 0.9814294631710362 Time:  96.1647100449
LDA Score: 1.0 Time:  0.13214802742
Total time (classifiers): 96.8356180191

Device Type: plug
Random Forest Score: 0.9485018726591761 Time:  0.669028997421
KNN Score: 0.26523096129837703 Time:  96.5410609245
LDA Score: 0.8838639200998751 Time:  0.131011009216
Total time (classifiers): 97.3411831856

Total time (one vs all_classify): 291.240507126

One vs one
Device Pair: ('camera', 'bulb')
Random Forest Score: 0.748452832769173 Time:  0.431248188019
KNN Score: 0.9996580845898725 Time:  9.26896500587
LDA Score: 1.0 Time:  0.0487349033356
Total time (classifiers): 9.74901199341

Device Pair: ('camera', 'plug')
Random Forest 