In [101]:
import csv, datetime, getopt, glob, logging, os, sys, time
import helpers
import numpy as np
import pandas as pd
import pyshark

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit

In [102]:
# Global Variables
ROUTER = '78:d2:94:4d:ab:3e'
WIFI_DEVICES = ['ec:1a:59:e4:fd:41', 'ec:1a:59:e4:fa:09',
                'ec:1a:59:e5:02:0d', '14:91:82:24:dd:35',
                '60:38:e0:ee:7c:e5', '14:91:82:cd:df:3d',
                'b4:75:0e:0d:94:65', 'b4:75:0e:0d:33:d5',
                '94:10:3e:2b:7a:55', '30:8c:fb:3a:1a:ad',
                'd0:73:d5:26:b8:4c', 'd0:73:d5:26:c9:27',
                'ac:84:c6:97:7c:cc', 'b0:4e:26:c5:2a:41',
                '70:4f:57:f9:e1:b8', ROUTER]

DEVICE_TYPE = {'ec:1a:59:e4:fd:41' : 'camera',
               'ec:1a:59:e4:fa:09' : 'camera',
               'ec:1a:59:e5:02:0d' : 'camera',
               '14:91:82:24:dd:35' : 'plug',
               '60:38:e0:ee:7c:e5' : 'plug',
               '14:91:82:cd:df:3d' : 'plug',
               'b4:75:0e:0d:94:65' : 'plug',
               'b4:75:0e:0d:33:d5' : 'plug',
               '94:10:3e:2b:7a:55' : 'plug',
               '30:8c:fb:3a:1a:ad' : 'camera',
               'd0:73:d5:26:b8:4c' : 'bulb', 
               'd0:73:d5:26:c9:27' : 'bulb',
               'ac:84:c6:97:7c:cc' : 'camera', 
               'b0:4e:26:c5:2a:41' : 'bulb',
               '70:4f:57:f9:e1:b8' : 'plug',
                ROUTER : 'router'}

DEVICE_NAME = {'ec:1a:59:e4:fd:41' : 'Netcam1', 
               'ec:1a:59:e4:fa:09' : 'Netcam2',
               'ec:1a:59:e5:02:0d' : 'Netcam3',
               '14:91:82:24:dd:35' : 'Insight',
               '60:38:e0:ee:7c:e5' : 'Mini',
               '14:91:82:cd:df:3d' : 'Switch1',
               'b4:75:0e:0d:94:65' : 'Switch2',
               'b4:75:0e:0d:33:d5' : 'Switch3',
               '94:10:3e:2b:7a:55' : 'Switch4',
               '30:8c:fb:3a:1a:ad' : 'Dropcam',
               'd0:73:d5:26:b8:4c' : 'Lifx1', 
               'd0:73:d5:26:c9:27' : 'Lifx2',
               'ac:84:c6:97:7c:cc' : 'Kasa', 
               'b0:4e:26:c5:2a:41' : 'TpBulb',
               '70:4f:57:f9:e1:b8' : 'TpPlug',
                ROUTER : 'Router'}

TRAINING_TEST = {'ec:1a:59:e4:fd:41' : 'train', 
                 'ec:1a:59:e4:fa:09' : 'train',
                 'ec:1a:59:e5:02:0d' : 'test',
                 '14:91:82:24:dd:35' : 'train',
                 '60:38:e0:ee:7c:e5' : 'train',
                 '14:91:82:cd:df:3d' : 'train',
                 'b4:75:0e:0d:94:65' : 'train',
                 'b4:75:0e:0d:33:d5' : 'train',
                 '94:10:3e:2b:7a:55' : 'test',
                 '30:8c:fb:3a:1a:ad' : 'train',
                 'd0:73:d5:26:b8:4c' : 'train', 
                 'd0:73:d5:26:c9:27' : 'test',
                 'ac:84:c6:97:7c:cc' : 'test', 
                 'b0:4e:26:c5:2a:41' : 'train',
                 '70:4f:57:f9:e1:b8' : 'test',
}

FEATURES = ["Time", "PacketLength", "Duration", "SourceAddr", "DestAddr", "SubtypeNum"]
SRC_DIR = './Source/'
DST_DIR = './Destination/'
PCAP_DIR = '/root/Documents/Thesis/PCAPS'
TIMING_PKT_NUMBER = 25000
DATA_FRAME_TYPE = '2'

path_name = os.getcwd()
DATE = path_name[path_name.rindex('/')+1:]
PROC_TIME = "wifi_processing_time_" + DATE + ".csv"

In [103]:
def parse_packet(pkt, tgt_files_by_src, tgt_files_by_dst):
    """
    Parses a given packet and extracts the following features:
        - destination MAC address
        - source MAC address
        - time of transmission
        - packet length
        
    The features of the packet are written out to a csv row, which is
    in turn written out to a csv file in the given dictionaries.
    
    This code is heavily based on code written by Capt Steven Beyer.
    
    Parameters
    ----------
    pkt: (Pyshark packet object) the packet from which features will be extracted
    tgt_files_by_src: (dictionary) a dictionary of open csv files.
        The keys are device source addresses, and the values are the open csv files.
    tgt_files_by_dst: (dictionary) a dictionary of open csv files.
        The keys are device destination addresses, and the values are the open csv files.
    """
    try:
        pkt_dst = pkt.wlan.da
        pkt_src = pkt.wlan.sa
        
#         if (pkt_src in WIFI_DEVICES) and (pkt_dst in WIFI_DEVICES):
        if (pkt_src in WIFI_DEVICES):
            # Extract features
            pkt_time = pkt.frame_info.time_epoch
            pkt_len = pkt.length
            pkt_duration = pkt.wlan.duration
            pkt_subtype_num = pkt.wlan.fc_type_subtype
            
            # Output matches FEATURES
            output = [pkt_time, pkt_len, pkt_duration, pkt_src, pkt_dst, pkt_subtype_num]
            
            csv.writer(tgt_files_by_src[pkt_src]).writerow(output)
#             csv.writer(tgt_files_by_dst[pkt_dst]).writerow(output)
            
    
    except AttributeError:
        print "ignored: ", pkt.number            

In [104]:
def get_mac_vendors():
    """
    Uses the macvendors.co API to lookup the vendors of Wi-Fi devices.
    Requires internet access.
    
    Parameters
    ----------
    None
    
    Output
    ------
    None
    
    Returns
    -------
    device_vendors (dict): keys(str) = WIFI_DEVICES MAC addresses, values(str) = vendor names
    """
    import json, requests

    # Get JSON response from API
    vendors_json = []
    for addr in WIFI_DEVICES:
        response = requests.get('http://macvendors.co/api/' + addr).text
        vendors_json.append(response)

    # Extracting company from API response
    vendors = []
    for vendor_json in vendors_json:
        response = json.loads(vendor_json)
        company = str(response['result']['company']).split(' ',1)[0].capitalize()
        vendors.append(company)

    # Put device MAC addresses and vendors into dictionary
    device_vendors = dict(zip(WIFI_DEVICES, vendors))
    
    return device_vendors

In [105]:
def feature_extractor(filename = os.path.join(PCAP_DIR, 'master.cap'), create_master=True):
    """
    Unit that extracts wanted features out of packets in a packet capture file.
    The feature_extractor focuses on features derived from packet information. 
    Secondary features will be processed by the make_dataframe function.
    Produces two csv files for each device in WIFI_DEVICES (see Global Variables).
    One file is for all packets where the device is the source; the other is where the device is the destination.
    
    Parameters
    ----------
    filename: (string) the absolute path of the packet capture file
    
    Output
    ------
    Source directory: (filesystem) creates a directory containing csv files for each device 
        where it is the source of the packet
    Destination directory: (filesystem) creates a directory containing csv files for each device 
        where it is the destination of the packet
    
    Returns
    -------
    none
    """
    
    # Prepare writers
    pt_file = open(PROC_TIME, 'w')
    csv.writer(pt_file).writerow(["Unit", "Total Packets Processed", "Total Process Time", "Average Process Time"])
    pt_file.close()

    # Initialize counters
    pkt_count = 0
    total_time_processing = 0
    total_time_start = time.time()

    # Initialize dicts for each device
    tgt_files_by_src = {}
    tgt_files_by_dst = {}
    
    # Combine all pcaps in directory in one master pcap
    if (create_master):
        try:
            ret = os.system('mergecap /root/Documents/Thesis/PCAPS/wifi* -w /root/Documents/Thesis/PCAPS/master.cap')
            if ret != 0:
                raise OSError
        except OSError:
            print 'Could not make master capture file'

    # Initialize capture file 
    cap = pyshark.FileCapture(filename, only_summaries=False)

    # Get time of first packet
    prev_pkt_time = cap[0].frame_info.time_epoch

    # Initialize output folders
    helpers.init_dirs()
    
    # Open output files for each Wi-Fi device
    for device in WIFI_DEVICES:
        tgt_files_by_src[device] = open(SRC_DIR + device.replace(':', '.') + ".csv", 'a')
        tgt_files_by_dst[device] = open(DST_DIR + device.replace(':', '.') + ".csv", 'a')

        
        # Initialize with column headers
        csv.writer(tgt_files_by_src[device]).writerow(FEATURES)
        csv.writer(tgt_files_by_dst[device]).writerow(FEATURES)
    
    # Go through each packet in capture, and store pertinent packets to csv files
    for pkt in cap:
        pkt_count += 1

        time_start = time.time()
        if pkt.wlan.fc_type == DATA_FRAME_TYPE:
            parse_packet(pkt, tgt_files_by_src, tgt_files_by_dst)
            total_time_processing += time.time() - time_start

    total_time = time.time() - total_time_start
    
    # Close files
    for open_file in tgt_files_by_src.values():
        open_file.close()

    for open_file in tgt_files_by_dst.values():
        open_file.close()
        
    # Rename files to device names for readability
    helpers.rename_csv_files(DEVICE_NAME)
        
    # Calculate time variables
    final_time = time.time()
    normalized_total_time = (TIMING_PKT_NUMBER * total_time) / pkt_count
    normalized_processing_time = (TIMING_PKT_NUMBER * total_time_processing) / pkt_count

    # Print time variables
    print "Total number of packets processed: ", pkt_count
    print "Total data processing time: ", total_time
    print "Normalized total processing time per 25k packets: ", normalized_total_time
    print "Total capture file processing time: ", total_time_processing
    print "Normalized capture file processing time: ", normalized_processing_time

    # Print out time metrics to csv
    pt_file = open(PROC_TIME, 'a')
    csv.writer(pt_file).writerow(["Packet capture iteration", pkt_count, total_time_processing, normalized_processing_time])
    csv.writer(pt_file).writerow(["Component start and finish time", total_time_start, final_time, final_time-total_time_start])
    pt_file.close()

In [106]:
def make_dataframe(path='/root/Documents/Thesis/Code/Source'):
    """
    Unit that takes all the csv files produced by the feature_extractor unit and puts them into a pandas dataframe.
    Returns a clean dataframe with all good data

    Parameters
    ----------
    path: (filesystem) the absolute path of the folder containing the csv files

    Output
    ------
    none

    Returns
    -------
    dataframe: (pandas dataframe) a useful data structure for machine learning
    counts: (pandas series) packet counts for each device 
    """

    
    DATA_PKT_SUBTYPES = {32 : 'Data',
                         40 : 'QoS_Data',
                         44 : 'QoS_Null'}
    
    # Search the path for csv files
#     path='/root/Documents/Thesis/Code/Source'
    all_csvs = glob.glob(os.path.join(path, "*.csv"))

    # Collect all csvs in one dataframe
    df_from_each_file = (pd.read_csv(f) for f in all_csvs)
    df = pd.concat(df_from_each_file, ignore_index=True, sort=False)

    # Add device type, device ID of each packet
    df["DeviceType"] = df["SourceAddr"].map(DEVICE_TYPE)
    df["Name"] = df["SourceAddr"].map(DEVICE_NAME)
    
    # Add whether device is a training or test device
    df["Set"] = df["SourceAddr"].map(TRAINING_TEST)
    
    # Add MAC vendors as one-hot encoding
    df["Vendor"] = df["SourceAddr"].map(get_mac_vendors())
    vendor_series = pd.get_dummies(df["Vendor"])
    df = pd.concat([df, vendor_series], axis=1)

    # One-hot encode packet subtype
    df["Subtype"] = df["SubtypeNum"].map(DATA_PKT_SUBTYPES)
    subtype_series = pd.get_dummies(df["Subtype"])
    df = pd.concat([df, subtype_series], axis=1)
    
    # Set addresses as categorical data
    
    
    # Count packets for each device
    device_counts = df["Name"].value_counts()
    print device_counts
        
    return df


In [107]:
def random_forest_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    randomforest = RandomForestClassifier(random_state=0, n_jobs=2)
    rf_model = randomforest.fit(X_train, y_train)

    preds = rf_model.predict(X_test)
    score = rf_model.score(X_test, y_test)
    
    time_elapsed = time.time() - time_start
    return score, time_elapsed

In [108]:
def k_neighbors_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=2)
    knn_model = knn.fit(X_train, y_train)
    
    preds = knn_model.predict(X_test)
    score = knn_model.score(X_test, y_test)
    
    time_elapsed = time.time() - time_start
    return score, time_elapsed

In [109]:
def lda_classifier(X_train, y_train, X_test, y_test):
    time_start = time.time()
    
    lda = LinearDiscriminantAnalysis()
    lda_model = lda.fit(X_train, y_train)
    
    preds = lda_model.predict(X_test)
    score = lda_model.score(X_test, y_test)
    
    time_elapsed = time.time() - time_start
    return score, time_elapsed

In [70]:
# Main 
pcap_path = os.path.join(PCAP_DIR, 'master.cap')
# feature_extractor(pcap_path)
feature_extractor(create_master=False)
# df = make_dataframe()

Old ./Source deleted
Old ./Destination deleted
Total number of packets processed:  1011608
Total data processing time:  1125.12533116
Normalized total processing time per 25k packets:  27.8053685608
Total capture file processing time:  96.6678524017
Normalized capture file processing time:  2.38896520198


I'm just testing some code here

In [110]:
df = make_dataframe()

# Limit to two device types
# df = df[(df["DeviceType"]!="bulb") & (df["DeviceType"]!="router")]

# Take out packets from router
df = df[df["DeviceType"]!="router"]


Mini       67881
Router     65187
Dropcam    42359
Kasa       15465
Netcam3     3596
Netcam2     2946
Netcam1     2912
Switch2     2111
Switch1     1763
Insight     1701
Switch3     1677
Switch4     1432
TpPlug       419
Lifx2        203
Lifx1        164
TpBulb        19
Name: Name, dtype: int64


In [95]:
df.columns

In [114]:
# Divide into training and test sets

"""
# Use StratifiedShuffleSplit to create train and test sets
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
X = df[[
        # Packet info
        "PacketLength", "Duration", 
        
        # Vendor 
        "Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link",
    
        # 802.11 Data subtype
        "Data", "QoS_Data", "QoS_Null"]]
y = df["DeviceType"]

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
"""

# Divide df by train and test devices
df_test = df[df["Set"]=="test"]
df_train = df[df["Set"]=="train"]
X_train = df_train[[
        # Packet info
        "PacketLength", "Duration", 
        
        # Vendor 
        "Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link",
    
        # 802.11 Data subtype
        "Data", "QoS_Data", "QoS_Null"]]

y_train = df_train["DeviceType"]

X_test = df_test[[
        # Packet info
        "PacketLength", "Duration", 
        
        # Vendor 
        "Belkin", "Dropcam", "Lifi", "Netgear", "Tp-link",
    
        # 802.11 Data subtype
        "Data", "QoS_Data", "QoS_Null"]]

y_test = df_test["DeviceType"]



# print X_train.shape, X_test.shape, y_train.shape, y_test.shape
# print y_train.value_counts()

(123533, 10) (21115, 10) (123533,) (21115,)
plug      75133
camera    48217
bulb        183
Name: DeviceType, dtype: int64


In [112]:
# Run classifiers
rf_clf = random_forest_classifier(X_train, y_train, X_test, y_test)
knn_clf = k_neighbors_classifier(X_train, y_train, X_test, y_test)
lda_clf = lda_classifier(X_train, y_train, X_test, y_test)

print "Random Forest Score:", rf_clf[0], "Time: ", rf_clf[1]
print "KNN Score:", knn_clf[0], "Time: ", knn_clf[1]
print "LDA Score:", lda_clf[0], "Time: ", lda_clf[1]

Random Forest Score: 0.1396163864551267 Time:  0.545241117477
KNN Score: 0.6162443760359934 Time:  40.8953938484
LDA Score: 0.8699976320151551 Time:  0.307951927185
