In [0]:
## This file handles the processing of Joy files. It retrieves flow metadata
## as well as per-packet timing, direction, and size data.
## DP Network 2019

# Imports
from __future__ import absolute_import, division, print_function
import numpy as np
import os
import re
import tensorflow as tf
import time
import sys
import pickle
import scipy
import pandas as pd
import math
import json
import gzip

In [0]:
## Validate that the addresses in the flows are
## IPv4 addresses
def validate_ip(s):
    """Return True when ``s`` is a dotted-quad IPv4 address string.

    The string must consist of exactly four '.'-separated parts, each made
    only of the ASCII digits 0-9 and with a value in the range 0..255.

    Args:
        s: Candidate address. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True for a valid address, False otherwise. Never raises.
    """
    if not isinstance(s, str):
        return False
    octets = s.split('.')
    if len(octets) != 4:
        return False
    for octet in octets:
        ## str.isdigit() also accepts characters such as superscript digits
        ## that int() cannot convert, so test against ASCII digits explicitly.
        if not octet or any(ch not in '0123456789' for ch in octet):
            return False
        if int(octet) > 255:
            return False
    return True

In [0]:
class DataParser:
    """Parse one gzip-compressed Joy output file into per-flow records.

    Each line of the file is expected to hold one JSON document. JSON objects
    that contain a 'version' key are skipped; every other JSON object is kept
    in ``self.flows`` in file order.
    """

    ## Flag letter -> slot inside each group of four counters (PSH, ACK, SYN, FIN)
    _FLAG_SLOTS = (('P', 0), ('A', 1), ('S', 2), ('F', 3))

    def __init__(self, json_file, compact=1):
        """Read ``json_file`` and collect its flow records.

        Loading is best-effort: lines that are not valid JSON are skipped, and
        a read error (for example a corrupt or non-gzip file) ends the load
        while keeping whatever was parsed so far. A missing file still raises.

        Args:
            json_file: Path of a gzip-compressed file with one JSON document
                per line.
            compact: Stored as ``self.compact``; not used by the methods of
                this class.
        """
        self.flows = []
        self.compact = compact
        with gzip.open(json_file, 'r') as fp:
            try:
                for line in fp:
                    try:
                        record = json.loads(line)
                    except ValueError:
                        ## Malformed or undecodable line: skip it
                        continue
                    ## Only JSON objects can be flow records; the methods below
                    ## index them by key.
                    if isinstance(record, dict) and 'version' not in record:
                        self.flows.append(record)
            except Exception:
                ## Deliberate best-effort: keep the flows parsed before the
                ## read failed, but let KeyboardInterrupt/SystemExit through.
                return

    @staticmethod
    def _has_valid_endpoints(flow):
        """Return True when both 'sa' and 'da' are present and pass validate_ip."""
        sa = flow.get('sa')
        da = flow.get('da')
        if not isinstance(sa, str) or not isinstance(da, str):
            return False
        return bool(validate_ip(sa)) and bool(validate_ip(da))

    @staticmethod
    def _flow_key(flow):
        """Build the flowset key: both addresses without dots, then sp, dp and pr."""
        return (flow['sa'].replace('.', '') + flow['da'].replace('.', '')
                + str(flow.get('sp')) + str(flow.get('dp')) + str(flow.get('pr')))

    @classmethod
    def _count_tcp_flags(cls, ppi):
        """Count PSH/ACK/SYN/FIN flags over the 'ppi' entries of one flow.

        Returns a list of twelve counters: [psh, ack, syn, fin] over all
        entries, then the same four for entries with dir '>' and the same
        four for every other direction.
        """
        counts = [0] * 12
        for entry in ppi:
            group = 4 if entry['dir'] == '>' else 8
            for letter, slot in cls._FLAG_SLOTS:
                if letter in entry['flags']:
                    counts[slot] += 1
                    counts[group + slot] += 1
        return counts

    ## Get timing information for individual flows
    def getFlowSetMetadataTimings(self, max_packets=64):
        """Collect per-packet time, direction and size for every flowset.

        Flows without packets or without two valid IPv4 endpoints are skipped.
        Flows that share a key are merged into the same entry.

        Args:
            max_packets: Number of leading packets kept per key (default 64).
                Because the stored lists are cut after every flow, later flows
                of a key contribute only while fewer than ``max_packets``
                packets are stored.

        Returns:
            None when no flows were loaded, otherwise a dict mapping each key
            to ``[times, directions, pkt_sizes]``:
              * times: 'time_start' (0 when absent) plus the running sum of
                the packets' 'ipt' values divided by 1000 (presumably
                milliseconds to seconds - confirm against the Joy format).
              * directions: 1 for packets with dir '>' and -1 otherwise.
              * pkt_sizes: the packets' 'b' values as floats.
        """
        if self.flows == []:
            return None
        data = dict()
        ## Go through each flowset
        for flow in self.flows:
            packets = flow.get('packets') or []
            if len(packets) == 0:
                continue
            if not self._has_valid_endpoints(flow):
                continue
            key = self._flow_key(flow)
            time_start = float(flow['time_start']) if 'time_start' in flow else 0
            ## Store in the same place if the key was already seen
            if key in data:
                times, directions, pkt_sizes = data[key]
            else:
                times, directions, pkt_sizes = [], [], []
            elapsed = 0
            for packet in packets:
                ## Each inter-packet gap is added exactly once, so a packet's
                ## time is the flow start plus the gaps up to and including it.
                elapsed += float(packet['ipt']) / 1000
                times.append(time_start + elapsed)
                pkt_sizes.append(float(packet['b']))
                directions.append(1 if packet['dir'] == '>' else -1)
            ## Only keep the leading packets
            data[key] = [times[:max_packets], directions[:max_packets],
                         pkt_sizes[:max_packets]]
        return data

    ## Get flow statistics for the joy data
    def getFlowSetMetadata(self):
        """Build one metadata record per flow, grouped by flowset key.

        Flows without packets or without two valid IPv4 endpoints are skipped.

        Returns:
            None when no flows were loaded or none qualified, otherwise a dict
            mapping each key to a list of records. Each record is a list:
              [0] dp (0 when null), [1] sp (0 when null),
              [2] num_pkts_in, [3] num_pkts_out, [4] bytes_in, [5] bytes_out,
              [6] time_end (missing values in 2..6 become 0),
              [7] ipt of every packet, [8] ipt of '>' packets,
              [9] ipt of the other packets,
              [10] b of every packet, [11] b of '>' packets,
              [12] b of the other packets,
              [13] pr (0 when absent),
              [14..17] psh/ack/syn/fin counts over all 'ppi' entries,
              [18..21] the same for '>' entries,
              [22..25] the same for the other entries,
              [26] list of DNS 'qn' names,
              [27] per-packet direction (1 for '>', -1 otherwise),
              [28] per-packet size signed by direction,
              [29] time_start - appended only when the flow has that key.
        """
        if self.flows == []:
            return None
        data = dict()
        for flow in self.flows:
            packets = flow.get('packets') or []
            if len(packets) == 0:
                continue
            if not self._has_valid_endpoints(flow):
                continue
            key = self._flow_key(flow)

            dp = flow.get('dp')
            sp = flow.get('sp')
            record = [
                int(dp) if dp is not None else 0,   # destination port (0: ICMP/etc.)
                int(sp) if sp is not None else 0,   # source port (0: ICMP/etc.)
                flow.get('num_pkts_in', 0),         # inbound packets
                flow.get('num_pkts_out', 0),        # outbound packets
                flow.get('bytes_in', 0),            # inbound bytes
                flow.get('bytes_out', 0),           # outbound bytes
                float(flow['time_end']) if 'time_end' in flow else 0,
            ]

            ## Per-packet timing and size, overall and split by direction
            times, for_times, bac_times = [], [], []
            pkt_sizes, for_pkt_size, bac_pkt_size = [], [], []
            directions, pkt_dir_sizes = [], []
            for packet in packets:
                pkt_timing = packet['ipt']
                pkt_bytes = packet['b']
                times.append(pkt_timing)
                pkt_sizes.append(pkt_bytes)
                if packet['dir'] == '>':
                    for_times.append(pkt_timing)
                    for_pkt_size.append(pkt_bytes)
                    directions.append(1)
                    pkt_dir_sizes.append(int(pkt_bytes))
                else:
                    bac_times.append(pkt_timing)
                    bac_pkt_size.append(pkt_bytes)
                    directions.append(-1)
                    pkt_dir_sizes.append(-1 * int(pkt_bytes))
            record.extend([times, for_times, bac_times,
                           pkt_sizes, for_pkt_size, bac_pkt_size])

            record.append(flow.get('pr', 0))  # protocol

            ## PPI data is used for the flag information; a missing or empty
            ## list yields twelve zero counters.
            record.extend(self._count_tcp_flags(flow.get('ppi') or []))

            ## DNS query names
            record.append([entry['qn'] for entry in flow.get('dns', [])
                           if 'qn' in entry])
            record.append(directions)
            record.append(pkt_dir_sizes)
            if 'time_start' in flow:
                record.append(float(flow['time_start']))

            data.setdefault(key, []).append(record)
        if len(data) == 0:
            return None
        return data

In [0]:
import scipy.stats
## Get individual stats for a given list 
def getStats(datax, storage):
    """Append summary statistics of ``datax`` to ``storage`` in place.

    The values are converted to float first. Five numbers are appended, in
    this order: sum, minimum, maximum, mean, variance (``np.var``).
    An empty input appends five zeros so that every call contributes the same
    number of features.

    Args:
        datax: Iterable of values convertible with ``float``.
        storage: List that receives the five statistics.

    Returns:
        None. ``storage`` is modified in place.
    """
    values = [float(item) for item in datax]
    if values:
        storage.extend([
            sum(values),
            min(values),
            max(values),
            np.average(values),
            np.var(values),
        ])
    else:
        ## Same width as the non-empty case
        storage.extend([0] * 5)

In [0]:
def extractFlowSetFeaturesLSTM(flowSetMetadata):
    """Turn per-flow metadata records into per-flow feature lists.

    Args:
        flowSetMetadata: dict mapping a flowset key to a list of flow records
            as produced by ``DataParser.getFlowSetMetadata``:
              flow[0] dp, flow[1] sp, flow[2] num_pkts_in, flow[3] num_pkts_out,
              flow[4] bytes_in, flow[5] bytes_out, flow[6] time_end,
              flow[7] bidirectional times, flow[8] forward times,
              flow[9] backward times, flow[10] packet sizes,
              flow[11] forward packet sizes, flow[12] backward packet sizes,
              flow[13] protocol, flow[14..17] psh/ack/syn/fin counts,
              flow[18..21] forward counts, flow[22..25] backward counts,
              flow[26] DNS names.
            Records with at least 31 entries are treated as carrying two
            address fields at flow[27] and flow[28]. Shorter records, which is
            what ``DataParser.getFlowSetMetadata`` in this notebook emits,
            carry no address fields.

    Returns:
        dict mapping each key to a flat list holding, for every flow in order,
        four consecutive entries: categorical features ``[dp, sp, protocol]``,
        continuous features, address features (empty when the record has
        none), and ``[dns_names]``.

        The continuous features are, for backward, forward and bidirectional
        traffic in that order: packet count, byte count, ``getStats`` of the
        times, ``getStats`` of the packet sizes, and the psh/ack/syn/fin
        counts. The last entry is the time_end difference to the previous flow
        of the same key (0 for the first flow).
    """
    flowFeatureSet = dict()

    ## For each flowset get statistics
    for key in flowSetMetadata:
        flowFeatureSet[key] = []
        previous_time = None
        for flow in flowSetMetadata[key]:
            flowFeatureSetCont = []

            ## EXTRACT FEATURE SET FOR BACKWARD DIRECTION ##
            flowFeatureSetCont.append(flow[2])      ## BACK NUMBER OF PKTS
            flowFeatureSetCont.append(flow[4])      ## BACK BYTES IN
            getStats(flow[9], flowFeatureSetCont)   ## BAC TIME
            getStats(flow[12], flowFeatureSetCont)  ## BAC PKT SIZES
            flowFeatureSetCont.extend(flow[22:26])  ## PSH, ACK, SYN, FIN COUNTS

            ## EXTRACT FEATURE SET FOR FORWARD DIRECTION ##
            flowFeatureSetCont.append(flow[3])      ## FORW NUMBER OF PKTS
            flowFeatureSetCont.append(flow[5])      ## FORW BYTES OUT
            getStats(flow[8], flowFeatureSetCont)   ## FORW TIME
            getStats(flow[11], flowFeatureSetCont)  ## FORW PKT SIZES
            flowFeatureSetCont.extend(flow[18:22])  ## PSH, ACK, SYN, FIN COUNTS

            ## EXTRACT FEATURE SET FOR BIDIRECTION ##
            flowFeatureSetCont.append(flow[3] + flow[2])  ## BID NUMBER OF PKTS
            flowFeatureSetCont.append(flow[5] + flow[4])  ## BID BYTES
            getStats(flow[7], flowFeatureSetCont)   ## BID TIME
            getStats(flow[10], flowFeatureSetCont)  ## BID PKT SIZES
            flowFeatureSetCont.extend(flow[14:18])  ## PSH, ACK, SYN, FIN COUNTS

            ## Time since the previous flow of this flowset ended
            if previous_time is None:
                flowFeatureSetCont.append(0)
            else:
                flowFeatureSetCont.append(flow[6] - previous_time)
            previous_time = flow[6]

            ## Port and protocol feature set
            flowFeatureSetCat = [flow[0], flow[1], flow[13]]

            ## Address information is only present in the longer record layout.
            ## NOTE(review): in the shorter layout flow[27] and flow[28] hold
            ## packet directions and signed sizes, so they are not used here.
            if len(flow) >= 31:
                flowFeatureSetBin = [flow[27], flow[28]]
            else:
                flowFeatureSetBin = []

            flow_dns = [flow[26]]

            flowFeatureSet[key].append(flowFeatureSetCat)
            flowFeatureSet[key].append(flowFeatureSetCont)
            flowFeatureSet[key].append(flowFeatureSetBin)
            flowFeatureSet[key].append(flow_dns)
    return flowFeatureSet
  
  

In [0]:
import numpy as np
from scipy import stats
def extractFlowSetFeaturesCNN(flowSetMetadata):
    """Aggregate the flow records of every flowset into one feature entry.

    Args:
        flowSetMetadata: dict mapping a flowset key to a list of flow records
            as produced by ``DataParser.getFlowSetMetadata``:
              flow[0] dp, flow[1] sp, flow[2] num_pkts_in, flow[3] num_pkts_out,
              flow[4] bytes_in, flow[5] bytes_out, flow[6] time_end,
              flow[7] bidirectional times, flow[8] forward times,
              flow[9] backward times, flow[10] packet sizes,
              flow[11] forward packet sizes, flow[12] backward packet sizes,
              flow[13] protocol, flow[14..17] psh/ack/syn/fin counts,
              flow[18..21] forward counts, flow[22..25] backward counts,
              flow[26] DNS names.
            The tail of a record comes in two layouts, told apart by length:
              * fewer than 31 entries (what the parser in this notebook
                emits): flow[27] directions, flow[28] signed packet sizes,
                optional flow[29] time_start;
              * 31 entries or more: flow[27] and flow[28] address fields,
                flow[29] directions, flow[30] signed packet sizes, optional
                flow[31] time_start.

    Returns:
        dict mapping each key to a list of seven entries:
          [0] categorical: ``[protocols, mode of source ports, mode of
              destination ports]``,
          [1] continuous features (see the ``getStats`` calls below for the
              order),
          [2] ``[subnet1, subnet2]`` address fields (empty lists when the
              records carry none),
          [3] concatenated bidirectional packet times,
          [4] concatenated packet directions,
          [5] concatenated signed packet sizes,
          [6] list of non-empty DNS name lists, or ``[""]`` when there is none.
    """
    flowFeatureSet = dict()

    ## For each flowset get statistics
    for key in flowSetMetadata:
        ## One value per flow
        flow_dest_ports = []
        flow_src_ports = []
        protocols = []
        bac_number_of_pkts = []
        bac_size = []
        for_number_of_pkts = []
        for_size = []
        bi_number_of_pkts = []
        bi_size = []
        intertime = []
        duration = []
        ## flag_counts[i] collects flow[14 + i] of every flow
        flag_counts = [[] for _ in range(12)]

        ## Per-packet series, concatenated over the flows of the flowset
        pkt_times = []
        for_time = []
        bac_time = []
        pkt_sizes = []
        for_pkt_sizes = []
        bac_pkt_sizes = []
        directions = []
        pkt_dir_sizes = []

        dns = []
        subnet1 = []
        subnet2 = []
        previous_time = 0

        for flow in flowSetMetadata[key]:
            flow_dest_ports.append(flow[0])
            flow_src_ports.append(flow[1])
            protocols.append(flow[13])

            ## BACKWARD ##
            bac_number_of_pkts.append(flow[2])
            bac_size.append(flow[4])
            bac_time.extend(flow[9])
            bac_pkt_sizes.extend(flow[12])

            ## FORWARD ##
            for_number_of_pkts.append(flow[3])
            for_size.append(flow[5])
            for_time.extend(flow[8])
            for_pkt_sizes.extend(flow[11])

            ## BIDIRECTION ##
            bi_number_of_pkts.append(flow[2] + flow[3])
            bi_size.append(flow[4] + flow[5])
            pkt_times.extend(flow[7])
            pkt_sizes.extend(flow[10])

            ## psh/ack/syn/fin counts: bidirectional, forward, backward
            for slot in range(12):
                flag_counts[slot].append(flow[14 + slot])

            ## INTER FLOW TIME ##
            if previous_time != 0:
                intertime.append(flow[6] - previous_time)
            previous_time = flow[6]

            if flow[26] != []:
                dns.append(flow[26])

            ## Locate the tail fields for the layout of this record
            if len(flow) >= 31:
                subnet1.append(flow[27])
                subnet2.append(flow[28])
                tail = 29
            else:
                tail = 27
            directions.extend(flow[tail])
            pkt_dir_sizes.extend(flow[tail + 1])

            ## time_start is only present when the flow reported one; without
            ## it no duration can be derived for this flow.
            if len(flow) > tail + 2:
                duration.append(flow[6] - flow[tail + 2])
        if dns == []:
            dns.append("")

        ## Most common port and how often it occurs
        dest_port_mode = stats.mode(flow_dest_ports, axis=None)
        src_port_mode = stats.mode(flow_src_ports, axis=None)

        flowFeatureSetCont = []

        ## EXTRACT FEATURE SET FOR BACKWARD DIRECTION ##
        getStats(bac_number_of_pkts, flowFeatureSetCont)
        getStats(bac_size, flowFeatureSetCont)
        getStats(bac_time, flowFeatureSetCont)
        flowFeatureSetCont.append(dest_port_mode[1].astype(int))

        ## EXTRACT FEATURE SET FOR FORWARD DIRECTION ##
        getStats(for_number_of_pkts, flowFeatureSetCont)
        getStats(for_size, flowFeatureSetCont)
        getStats(for_time, flowFeatureSetCont)
        flowFeatureSetCont.append(src_port_mode[1].astype(float))

        ## EXTRACT FEATURE SET FOR BIDIRECTION ##
        getStats(bi_number_of_pkts, flowFeatureSetCont)
        getStats(bi_size, flowFeatureSetCont)
        getStats(pkt_times, flowFeatureSetCont)      ## packet interarrival times
        getStats(intertime, flowFeatureSetCont)      ## time between flows
        getStats(duration, flowFeatureSetCont)       ## flow durations
        getStats(pkt_sizes, flowFeatureSetCont)
        getStats(for_pkt_sizes, flowFeatureSetCont)
        getStats(bac_pkt_sizes, flowFeatureSetCont)

        ## FLAG COUNTS: bidirectional, then forward, then backward,
        ## each in the order PSH, ACK, SYN, FIN
        for per_flow_counts in flag_counts:
            getStats(per_flow_counts, flowFeatureSetCont)

        ## Protocol and port values
        flowFeatureSetCat = [protocols, src_port_mode[0], dest_port_mode[0]]

        ## Address information
        flowFeatureSetBin = [subnet1, subnet2]

        ## Append all values for the given key
        flowFeatureSet[key] = [
            flowFeatureSetCat,
            flowFeatureSetCont,
            flowFeatureSetBin,
            pkt_times,
            directions,
            pkt_dir_sizes,
            dns,
        ]
    return flowFeatureSet

In [0]:
import os
## Collect the paths of the malware and benign '.json' files.
## Each entry is (directory to list, prefix for the stored path, target list).
malware_files = []
benign_files = []
for listing_dir, path_prefix, collected in (
        ('./datasets/malware_json/', './datasets/malware_json/', malware_files),
        ('./datasets/benign_json', './datasets/benign_json/', benign_files)):
    for file in os.listdir(listing_dir):
        filename = os.fsdecode(file)
        ## Anything that is not a '.json' file is ignored
        if filename.endswith('.json'):
            collected.append(path_prefix + filename)

In [0]:
import pickle
import gc
## Extract the CNN feature sets of every malware capture.
## Captures for which the parser returns no metadata are left out.
malign_flows = []
for malware_file in malware_files:
    data_parser = DataParser(malware_file)
    mal_flowSetMetadata = data_parser.getFlowSetMetadata()
    if mal_flowSetMetadata is not None:
        malign_flows.append(extractFlowSetFeaturesCNN(mal_flowSetMetadata))
    ## Release the per-file metadata before parsing the next capture
    del mal_flowSetMetadata
    gc.collect()

## Dump the embedding data (Change for given configuration)
with open('./embedding/CNN_FULL/metadata_mal_encodings_CNN', 'wb') as fp:
    ## The with-statement closes the file on exit
    pickle.dump(malign_flows, fp)

In [0]:
import pickle
import gc
malign_flows = []
## Get the per-packet timing data for the given malign flows
for malware_file in malware_files:
    data_parser = DataParser(malware_file)
    mal_flowSetMetadata = data_parser.getFlowSetMetadataTimings()
    ## getFlowSetMetadataTimings returns None when a capture yields no flows;
    ## the cell that sorts these entries iterates each one as a dict, so such
    ## captures are left out.
    if mal_flowSetMetadata is not None:
        malign_flows.append(mal_flowSetMetadata)
    del mal_flowSetMetadata
    gc.collect()
## Dump the timing data (Change for given configuration)
with open('./embedding/CNN_FULL/time_mal_encodings_CNN', 'wb') as fp:
    ## The with-statement closes the file on exit
    pickle.dump(malign_flows, fp)

In [0]:
import pickle
import gc
benign_flows = []
## Get the per-packet timing data for the given benign flows
for benign_file in benign_files:
    ## Parse the benign capture of this iteration
    data_parser = DataParser(benign_file)
    benign_flowSetMetadata = data_parser.getFlowSetMetadataTimings()
    ## getFlowSetMetadataTimings returns None when a capture yields no flows;
    ## the cell that sorts these entries iterates each one as a dict, so such
    ## captures are left out.
    if benign_flowSetMetadata is not None:
        benign_flows.append(benign_flowSetMetadata)
    del benign_flowSetMetadata
    gc.collect()
## Dump the timing data (Change for given configuration)
with open('./embedding/CNN_FULL/time_benign_encodings_CNN', 'wb') as fp:
    ## Store the benign flows collected above.
    ## The with-statement closes the file on exit.
    pickle.dump(benign_flows, fp)

In [0]:
## NOTE: pickle.load executes code embedded in the file; only load files
## written by this notebook.
with open('./embedding/CNN_FULL/time_mal_encodings_CNN', 'rb') as fp:
    malign_flows = pickle.load(fp)
mal_flows = []

## Handles sorting the packets of every flowset according to the time at which
## they occurred rather than in the order in which joy returned the flows
for file in malign_flows:
    ## Captures without flows are stored as None by the parser; skip them
    if file is None:
        continue
    flowsets = dict()
    for key in file:
        timings, packet_directions, packet_sizes = file[key][0], file[key][1], file[key][2]
        sorted_timings = sorted(timings)
        ## Pairs are ordered by time first; equal times fall back to the paired value
        sorted_directions = [x for _, x in sorted(zip(timings, packet_directions))]
        sorted_pkt_sizes = [x for _, x in sorted(zip(timings, packet_sizes))]
        flowsets[key] = [sorted_timings, sorted_directions, sorted_pkt_sizes]
    mal_flows.append(flowsets)
## Overwrite the stored timing data with the sorted version
with open('./embedding/CNN_FULL/time_mal_encodings_CNN', 'wb') as fp:
    pickle.dump(mal_flows, fp)
  


In [0]:
## NOTE: pickle.load executes code embedded in the file; only load files
## written by this notebook.
with open('./embedding/CNN_FULL/time_benign_encodings_CNN', 'rb') as fp:
    benign = pickle.load(fp)
## Sorted results go into a fresh list: appending to the list that is being
## iterated would never terminate.
benign_flows = []

## Handles sorting the packets of every flowset according to the time at which
## they occurred rather than in the order in which joy returned the flows
for file in benign:
    ## Captures without flows are stored as None by the parser; skip them
    if file is None:
        continue
    flowsets = dict()
    for key in file:
        timings, packet_directions, packet_sizes = file[key][0], file[key][1], file[key][2]
        sorted_timings = sorted(timings)
        ## Pairs are ordered by time first; equal times fall back to the paired value
        sorted_directions = [x for _, x in sorted(zip(timings, packet_directions))]
        sorted_pkt_sizes = [x for _, x in sorted(zip(timings, packet_sizes))]
        flowsets[key] = [sorted_timings, sorted_directions, sorted_pkt_sizes]
    benign_flows.append(flowsets)
## Overwrite the stored timing data with the sorted version
with open('./embedding/CNN_FULL/time_benign_encodings_CNN', 'wb') as fp:
    pickle.dump(benign_flows, fp)