# This file is used to flatten the JSON data

In [176]:
import json
import pandas as pd
import numpy as np
from shapely.geometry import Point
from bokeh.io import output_file, output_notebook, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, LogColorMapper, BasicTicker, ColorBar,
    DataRange1d, PanTool, WheelZoomTool, BoxSelectTool
)
from bokeh.models.mappers import ColorMapper, LinearColorMapper
from bokeh.palettes import Viridis5
import re
import math
import ast
from pandas.io.json import json_normalize
from flatten_json import flatten
import ProjectLookUpTable 
import socket, struct
from binascii import hexlify
import datetime
import macaddress

# Read JSON data

In [177]:
def file_json_file_with_file_name(pcap_Json_file_name):
    """Load a UTF-8 encoded JSON file and return its parsed content.

    Args:
        pcap_Json_file_name: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle even when json.load raises,
    # which the explicit open()/close() pair did not guarantee.
    with open(pcap_Json_file_name, encoding='utf-8') as json_data_file:
        return json.load(json_data_file)

# type_map_for_json(json_record) map string value to int 

In [178]:
# type_map_for_json(json_record)


def IPV6_to_int(ipv6_addr):
    """Return the integer value of an IPv6 address given in text form.

    The address is packed into its 16-byte network-order form and those
    bytes are read as one big-endian unsigned integer.
    """
    packed_address = socket.inet_pton(socket.AF_INET6, ipv6_addr)
    return int.from_bytes(packed_address, byteorder='big')

def IPV4_to_int(ip):
    """Return the unsigned 32-bit integer value of a dotted IPv4 address."""
    # "!L" reads the 4 packed bytes as one network-order unsigned long.
    (address_value,) = struct.unpack("!L", socket.inet_aton(ip))
    return address_value

def type_map_for_json(json_record):
    """Convert string leaf values of a nested record to integers/floats in place.

    Every key whose value is a leaf (not a dict) is looked up in
    ``ProjectLookUpTable.wire_shark_type_lookup_map``. Depending on the
    type name found there, the string value is replaced by:

    * a number (``int`` or ``float``) for types listed in
      ``ProjectLookUpTable.wire_shark_number_type_set``;
    * the integer form of the address for types listed in
      ``ProjectLookUpTable.wire_shark_ipaddr_type_set``;
    * the integer form of the MAC address for
      ``'Ethernet or other MAC address'``.

    Keys that are not in the lookup map, and leaves that are not strings
    (for example lists, or values already converted by an earlier call),
    are left untouched. 'Date and time' values are not converted.

    Args:
        json_record: A dict to convert in place, or any other value.

    Returns:
        True when ``json_record`` is not a dict (i.e. it is a leaf value),
        False when it is a dict whose content has been processed.
    """
    if not isinstance(json_record, dict):
        return True
    # Only values of existing keys are reassigned, so the dict size never
    # changes while it is being iterated.
    for json_key, json_str_value in json_record.items():
        if not type_map_for_json(json_str_value):
            # Nested dict: the recursive call already converted its content.
            continue
        if not isinstance(json_str_value, str):
            # Parsing below only makes sense for text; membership tests and
            # int() on lists or numbers would raise TypeError.
            continue
        if json_key not in ProjectLookUpTable.wire_shark_type_lookup_map:
            continue
        json_value_type = ProjectLookUpTable.wire_shark_type_lookup_map[json_key]
        if json_value_type in ProjectLookUpTable.wire_shark_number_type_set:
            if '0x' in json_str_value:
                # Base 0 lets int() honour the "0x" prefix.
                json_record[json_key] = int(json_str_value, 0)
            elif '.' in json_str_value:
                json_record[json_key] = float(json_str_value)
            else:
                json_record[json_key] = int(json_str_value)
        elif json_value_type in ProjectLookUpTable.wire_shark_ipaddr_type_set:
            # A colon only appears in the textual form of IPv6 addresses.
            if ':' in json_str_value:
                json_record[json_key] = IPV6_to_int(json_str_value)
            else:
                json_record[json_key] = IPV4_to_int(json_str_value)
        elif json_value_type == 'Ethernet or other MAC address':
            json_record[json_key] = int(macaddress.MAC(json_str_value))
    return False





In [179]:
def get_wanted_packets(local_wanted_packets, local_json_record_list):
    """Pick, in order, the first records that contain the wanted layers.

    The records are scanned once. The first record whose
    ``['_source']['layers']`` contains ``local_wanted_packets[0]`` is kept,
    then the scan continues looking for ``local_wanted_packets[1]``, and so
    on until every wanted layer has been matched or the records run out.

    Args:
        local_wanted_packets: Sequence of layer names to match, in order.
        local_json_record_list: Iterable of records, each exposing
            ``record['_source']['layers']``.

    Returns:
        The list of matched records; it is shorter than
        ``local_wanted_packets`` when not every layer could be matched, and
        empty when nothing is wanted.
    """
    rv = []
    # Nothing to look for: indexing position 0 below would raise IndexError.
    if not local_wanted_packets:
        return rv
    wanted_count = len(local_wanted_packets)
    for record in local_json_record_list:
        # len(rv) is the position of the next layer still to be matched.
        if local_wanted_packets[len(rv)] in record['_source']['layers']:
            rv.append(record)
            if len(rv) >= wanted_count:
                break
    return rv

# Drop useless fields with drop_json_fields(json_record, field_name_path_list)

In [180]:

def json_field_drop(json_record, path_list):
    """Delete the nested field addressed by ``path_list`` from ``json_record``.

    The path is followed key by key through nested dicts and the final key
    is deleted in place. Nothing happens when the path is empty, when a key
    along the path is missing, or when an intermediate value is not a dict.

    Args:
        json_record: Dict to modify in place.
        path_list: Sequence of keys leading to the field to delete. It is
            not modified.
    """
    if not path_list:
        return
    *parent_keys, leaf_key = path_list
    node = json_record
    for key in parent_keys:
        # The isinstance check stops `in` from doing a substring or element
        # test on a str/list value found in the middle of the path.
        if not isinstance(node, dict) or key not in node:
            return
        node = node[key]
    if isinstance(node, dict) and leaf_key in node:
        del node[leaf_key]

def drop_json_fields(json_record, field_name_path_list):
    """Remove every field named in ``field_name_path_list`` from ``json_record``.

    Each entry is a path whose keys are joined with ``'->'``; it is split
    into its keys and handed to ``json_field_drop``.
    """
    key_paths = (path_text.split('->') for path_text in field_name_path_list)
    for key_path in key_paths:
        json_field_drop(json_record, key_path)


In [181]:
def start_drop_field_value_mapping(local_json_records, local_json_drop_field_path_list):
    """Drop unwanted fields, then convert typed values, for every record.

    Both steps work in place on each record's ``['_source']['layers']``
    dict: the listed field paths are removed first, and the remaining
    values are then converted by ``type_map_for_json``.
    """
    for packet in local_json_records:
        layers = packet['_source']['layers']
        drop_json_fields(layers, local_json_drop_field_path_list)
        type_map_for_json(layers)


In [182]:
def flatten_json_records(json_record_list):
    """Return the flattened ``['_source']['layers']`` dict of every record.

    Each layers dict is passed through ``flatten``; the results are
    returned in a new list in the same order as the input.
    """
    return [flatten(packet['_source']['layers']) for packet in json_record_list]

# Only for IcedId

In [183]:
def IcedId_processing(local_key_field, local_IcedID_cookie_check_ls, first_http_record):
    """Flag whether a record's cookie string carries every expected cookie name.

    The value stored under ``local_key_field`` is split on ``';'``; the text
    before the first ``'='`` of each piece, stripped of whitespace, is taken
    as a cookie name. ``first_http_record['IcedIdCookie']`` is set to 1 when
    every name in ``local_IcedID_cookie_check_ls`` is among them, and to 0
    otherwise (including when ``local_key_field`` is absent).
    """
    has_all_cookies = 0
    if local_key_field in first_http_record:
        cookie_names = {
            piece.split('=')[0].strip()
            for piece in first_http_record[local_key_field].split(';')
        }
        if all(name in cookie_names for name in local_IcedID_cookie_check_ls):
            has_all_cookies = 1
    first_http_record['IcedIdCookie'] = has_all_cookies


# Create the string to integer map so the ML process can run faster 


In [184]:

def map_str_to_int(local_record_str_to_int_map, local_flatten_json_records_list):
    """Replace every string value in the records by an integer code, in place.

    ``local_record_str_to_int_map`` is the shared string -> code table. A
    string seen for the first time is given the code ``len(table) + 1`` and
    added to the table; known strings reuse their existing code. Values that
    are not strings are left unchanged.
    """
    codes = local_record_str_to_int_map
    for record in local_flatten_json_records_list:
        # Only values of existing keys are reassigned, so iterating the
        # items while updating them is safe.
        for field_name, value in record.items():
            if isinstance(value, str):
                record[field_name] = codes.setdefault(value, len(codes) + 1)



In [185]:
 # Cast the list of json records into one record and load into panda
def make_list_json_record_into_one_record(local_json_records_list_list, local_num_record):
    """Merge each list of flattened records into one DataFrame row.

    For every offset ``0 .. local_num_record - 1`` the union of field names
    seen at that offset across all record lists is collected. Each such
    field becomes a column named ``<field>_<offset>``. Every record list
    then yields one row; a field missing from a record, or a whole record
    missing because the list is shorter than ``local_num_record``, is
    filled with 0.

    Args:
        local_json_records_list_list: List of record lists; each record is a
            flat dict of field name -> value.
        local_num_record: Number of record offsets to include per row.

    Returns:
        A ``pandas.DataFrame`` with one row per record list. Columns are
        grouped by offset and sorted by field name within an offset, so the
        layout does not depend on set iteration order.
    """
    # Union of the field names seen at each offset.
    fields_by_offset = [set() for _ in range(local_num_record)]
    for json_records_list in local_json_records_list_list:
        for off_set, json_record in enumerate(json_records_list[:local_num_record]):
            fields_by_offset[off_set].update(json_record)

    # Fix one ordering per offset and reuse it for both headers and values.
    ordered_fields = [sorted(field_names) for field_names in fields_by_offset]
    column_names = [
        field_name + '_' + str(off_set)
        for off_set, field_names in enumerate(ordered_fields)
        for field_name in field_names
    ]

    # Build all rows first: creating the frame once avoids the repeated
    # reallocation of appending row by row with .loc.
    rows = []
    for json_records_list in local_json_records_list_list:
        row = []
        for off_set, field_names in enumerate(ordered_fields):
            # A short list has no record at this offset; pad with zeros so
            # the row always matches the column count.
            if off_set < len(json_records_list):
                json_record = json_records_list[off_set]
            else:
                json_record = {}
            row.extend(json_record.get(field_name, 0) for field_name in field_names)
        rows.append(row)
    return pd.DataFrame(rows, columns=column_names)



# Main


In [186]:
# important variables 
# Accumulators filled while the input files are processed.
json_records_list_list = []
record_str_to_int_map = {}

# Input files to process.
file_name_lists = [
    "../data/2021-02-25 - TA551 (SHATHAK) BACK TO PUSHING ICEDID (BOKBOT)/only/only.json",
]

# Layers to match, in order: one http packet followed by four tls packets.
wanted_packets = ['http'] + ['tls'] * 4

# Field paths removed from every record; keys of a path are joined by '->'.
json_drop_field_path_list = [
    'frame->frame.time',
    'frame->frame.time_delta_displayed',
    'frame->frame.coloring_rule.name',
    'frame->frame.coloring_rule.string',
    'ip->ip.addr',
    'ip->ip.host',
    'tcp->tcp.port',
    'http->http.request.line',
    'tcp->tcp.payload',
    'ip->ip.src_host',
    'ip->ip.dst_host',
    'http->http.cookie_tree',
    'tls->tls.record->tls.handshake->tls.handshake.random',
    'tls->tls.record->tls.handshake->tls.handshake.random_tree',
    # NOTE(review): unlike the two paths above, this one has no
    # 'tls.handshake' level before the final key - confirm it is intended.
    'tls->tls.record->tls.handshake.ciphersuites',
]

# Cookie names that must all be present for a record to be flagged.
IcedID_cookie_check_ls = {'__gads', '_gat', '_ga', '_u', '__io', '_gid'}

In [187]:

# Process every input file: each one contributes one list of flattened
# records to json_records_list_list.
for  pcap_json_file_name in file_name_lists: 
    # Load the packet records from the JSON file
    json_record_list = file_json_file_with_file_name(pcap_json_file_name)
    # Keep only the first records that match wanted_packets, in order
    json_record_list = get_wanted_packets(wanted_packets, json_record_list)
    # Drop the unwanted fields and convert typed values, both in place
    start_drop_field_value_mapping(json_record_list,json_drop_field_path_list)
    # Flatten each record's layers into a single-level field -> value dict
    flatten_json_records_list = flatten_json_records(json_record_list)
    
    # For IcedId: add the 'IcedIdCookie' flag to the first kept record,
    # based on the cookie names found in its "http_http.cookie" field.
    # NOTE(review): indexing [0] raises IndexError when no wanted packet
    # was found in the file.
    IcedId_processing("http_http.cookie", IcedID_cookie_check_ls, flatten_json_records_list[0])

    # Replace the remaining string values by integer codes; the map is
    # shared across files so a given string always gets the same code
    map_str_to_int(record_str_to_int_map, flatten_json_records_list)
    # Save the records of the current file
    json_records_list_list.append(flatten_json_records_list)


In [188]:
# Merge each file's list of records into a single DataFrame row
# (one group of columns per packet offset)
df = make_list_json_record_into_one_record(json_records_list_list, len(wanted_packets))
# Write the flattened data set as CSV (the DataFrame index is included)
df.to_csv('IcedId_only_data_set.csv')
# Write the string -> integer code map so the codes can be traced back
with open('IcedId_record_str_to_int_map.json', 'w') as fp:
    json.dump(record_str_to_int_map, fp)


# TODO: map each string to its matching int code; add a cookie splitter

# Free and testing space

In [72]:
# Distinct type names that appear as values in the project lookup table.
wire_shark_type = set(ProjectLookUpTable.wire_shark_type_lookup_map.values())
# Bare expression so the notebook displays the set.
wire_shark_type

{'ASN.1 object identifier',
 'Boolean',
 'Byte sequence',
 'Character string',
 'Date and time',
 'Ethernet or other MAC address',
 'Floating point (double-precision)',
 'Frame number',
 'IPv4 address',
 'IPv6 address',
 'Label',
 'Signed integer (1 byte)',
 'Signed integer (2 bytes)',
 'Signed integer (4 bytes)',
 'Signed integer (8 bytes)',
 'Time offset',
 'Unsigned integer (1 byte)',
 'Unsigned integer (2 bytes)',
 'Unsigned integer (3 bytes)',
 'Unsigned integer (4 bytes)',
 'Unsigned integer (8 bytes)',
 'Unsigned integer, 2 bytes',
 'Unsigned integer, 4 bytes'}

In [None]:

# Build the field -> type and field -> description lookup tables from three
# parallel lists.
# NOTE(review): field_list, type_list and description_list are not defined
# in the visible part of the file - confirm where they come from.
wire_shark_type_lookup_map = {}
wire_shark_description_lookup_map = {}
for i, field_name in enumerate(field_list):
    wire_shark_type_lookup_map[field_name] = type_list[i]
    wire_shark_description_lookup_map[field_name] = description_list[i]
