In [26]:
import helpers
import pandas as pd
import pyshark
import numpy as np
import csv, datetime, getopt, glob, logging, os, sys, time

In [7]:
# Global Variables
ROUTER = '78:d2:94:4d:ab:3e'
WIFI_DEVICES = ['ec:1a:59:e4:fd:41', 'ec:1a:59:e4:fa:09',
                'ec:1a:59:e5:02:0d', '14:91:82:24:dd:34',
                '60:38:e0:ee:7c:e5', '14:91:82:cd:df:3d',
                'b4:75:0e:0d:94:65', 'b4:75:0e:0d:33:d5',
                '94:10:3e:2b:7a:55', '30:8c:fb:3a:1a:ad',
                'd0:73:d5:26:b8:4c', 'd0:73:d5:26:c9:27',
                'ac:84:c6:97:7c:cc', 'b0:4e:26:c5:2a:41',
                '70:4f:57:f9:e1:b8', ROUTER]

DEVICE_TYPE = {'ec:1a:59:e4:fd:41' : 'camera',
               'ec:1a:59:e4:fa:09' : 'camera',
               'ec:1a:59:e5:02:0d' : 'camera',
               '14:91:82:24:dd:34' : 'plug',
               '60:38:e0:ee:7c:e5' : 'plug',
               '14:91:82:cd:df:3d' : 'plug',
               'b4:75:0e:0d:94:65' : 'plug',
               'b4:75:0e:0d:33:d5' : 'plug',
               '94:10:3e:2b:7a:55' : 'plug',
               '30:8c:fb:3a:1a:ad' : 'camera',
               'd0:73:d5:26:b8:4c' : 'bulb', 
               'd0:73:d5:26:c9:27' : 'bulb',
               'ac:84:c6:97:7c:cc' : 'camera', 
               'b0:4e:26:c5:2a:41' : 'bulb',
               '70:4f:57:f9:e1:b8' : 'plug'}

SRC_DIR = './Source/'
DST_DIR = './Destination/'
TIMING_PKT_NUMBER = 25000
path_name = os.getcwd()
DATE = path_name[path_name.rindex('/')+1:]
PROC_TIME = "wifi_processing_time_" + DATE + ".csv"

In [11]:
def parse_packet(pkt, tgt_files_by_src, tgt_files_by_dst):
    """
    Parses a given packet and extracts the following features:
        - destination MAC address
        - source MAC address
        - time of transmission
        - packet length
    The features of the packet are written out to a csv row, which is
    in turn written out to a csv file in the given dictionaries.
    
    This code is heavily based on code written by Capt Steven Beyer.
    
    Parameters
    ----------
    pkt: (Pyshark packet object) the packet from which features will be extracted
    tgt_files_by_src: (dictionary) a dictionary of open csv files.
        The keys are device source addresses, and the values are the open csv files.
    tgt_files_by_dst: (dictionary) a dictionary of open csv files.
        The keys are device destination addresses, and the values are the open csv files.
    """
    try:
        pkt_dst = pkt.wlan.da
        pkt_src = pkt.wlan.sa
        
        if (pkt_src in WIFI_DEVICES) and (pkt_dst in WIFI_DEVICES):
            # Extract features
            pkt_time = pkt.frame_info.time_epoch
            pkt_len = pkt.length
            
            output = [pkt_time, pkt_len, pkt_src, pkt_dst]
            
            csv.writer(tgt_files_by_src[pkt_src]).writerow(output)
            csv.writer(tgt_files_by_dst[pkt_dst]).writerow(output)
    
    except AttributeError:
        print "ignored: ", pkt.number            

In [13]:
def feature_extractor(filename):
    """
    Unit that extracts wanted features out of packets in a packet capture file.
    Produces two csv files for each device in WIFI_DEVICES (see Global Variables).
    One file is for all packets where the device is the source; the other is where the device is the destination.
    
    Parameters
    ----------
    filename: (string) the absolute path of the packet capture file
    
    Output
    ------
    Source directory: (filesystem) creates a directory containing csv files for each device 
        where it is the source of the packet
    Destination directory: (filesystem) creates a directory containing csv files for each device 
        where it is the destination of the packet
    
    Returns
    -------
    none
    """
    
    # Prepare writers
    pt_file = open(PROC_TIME, 'w')
    csv.writer(pt_file).writerow(["Unit", "Total Packets Processed", "Total Process Time", "Average Process Time"])
    pt_file.close()

    # Initialize counters
    pkt_count = 0
    total_time_processing = 0
    total_time_start = time.time()

    # Initialize dicts for each device
    tgt_files_by_src = {}
    tgt_files_by_dst = {}
    
    # Initialize capture file 
    cap = pyshark.FileCapture(filename, only_summaries=False)

    # Get time of first packet
    prev_pkt_time = cap[0].frame_info.time_epoch

    # Initialize output folders
    helpers.init_dirs()
    
    # Open output files for each Wi-Fi device
    for device in WIFI_DEVICES:
        tgt_files_by_src[device] = open(SRC_DIR + device.replace(':', '.') + ".csv", 'a')
        tgt_files_by_dst[device] = open(DST_DIR + device.replace(':', '.') + ".csv", 'a')
    
    # Go through each packet in capture, and store pertinent packets to csv files
    for pkt in cap:
        pkt_count += 1

        time_start = time.time()
        if pkt.highest_layer == 'DATA':
            parse_packet(pkt, tgt_files_by_src, tgt_files_by_dst)
            total_time_processing += time.time() - time_start

    total_time = time.time() - total_time_start
    
    # Close files
    for open_file in tgt_files_by_src.values():
        open_file.close()

    for open_file in tgt_files_by_dst.values():
        open_file.close()
        
    # Calculate time variables
    final_time = time.time()
    normalized_total_time = (TIMING_PKT_NUMBER * total_time) / pkt_count
    normalized_processing_time = (TIMING_PKT_NUMBER * total_time_processing) / pkt_count

    # Print time variables
    print "Total number of packets processed: ", pkt_count
    print "Total data processing time: ", total_time
    print "Normalized total processing time per 25k packets: ", normalized_total_time
    print "Total capture file processing time: ", total_time_processing
    print "Normalized capture file processing time: ", normalized_processing_time

    # Print out time metrics to csv
    pt_file = open(PROC_TIME, 'a')
    csv.writer(pt_file).writerow(["Packet capture iteration", pkt_count, total_time_processing, normalized_processing_time])
    csv.writer(pt_file).writerow(["Component start and finish time", total_time_start, final_time, final_time-total_time_start])
    pt_file.close()

In [37]:
# def dataframe_maker(path='/root/Documents/Thesis/Code/Source'):
"""
Unit that takes all the csv files produced by the feature_extractor unit and puts them into a pandas dataframe.

Parameters
----------
path: (filesystem) the absolute path of the folder containing the csv files

Output
------
none

Returns
-------
dataframe: (pandas dataframe) a useful data structure for machine learning
"""

# Search the path for csv files
path='/root/Documents/Thesis/Code/Source'
all_csvs = glob.glob(os.path.join(path, "*.csv"))

df_from_each_file = (pd.read_csv(f) for f in all_csvs)
collected_df = pd.concat(df_from_each_file, ignore_index=True, sort=False)

collected_df.shape

(2377, 27)

In [21]:
# Main 
# feature_extractor('/root/Documents/Thesis/PCAPS/wifi-01.cap')
# dataframe_maker()

/root/Documents/Thesis/Code/Source
