# This file is used to flat the JSON data

In [25]:
import json
import pandas as pd
import numpy as np
from shapely.geometry import Point
from bokeh.io import output_file, output_notebook, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, LogColorMapper, BasicTicker, ColorBar,
    DataRange1d, PanTool, WheelZoomTool, BoxSelectTool
)
from bokeh.models.mappers import ColorMapper, LinearColorMapper
from bokeh.palettes import Viridis5
import re
import math
import ast
from pandas.io.json import json_normalize
from flatten_json import flatten
import ProjectLookUpTable 
import socket, struct
from binascii import hexlify
import datetime
import macaddress
import os

# Read JSON data

In [26]:
def file_json_file_with_file_name(pcap_Json_file_name):
    json_data_file = open(pcap_Json_file_name, encoding='utf-8')
    file_data = json.load(json_data_file)
    json_data_file.close()
    return file_data

# type_map_for_json(json_record) map string value to int 

In [27]:
# type_map_for_json(json_record)


def IPV6_to_int(ipv6_addr):
    return int(hexlify(socket.inet_pton(socket.AF_INET6, ipv6_addr)), 16)

def IPV4_to_int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

def type_map_for_json(json_record):
    if type(json_record) != dict:
        return True
    for json_key in json_record.keys():
        if type_map_for_json(json_record[json_key]):
            if json_key in ProjectLookUpTable.wire_shark_type_lookup_map:
                json_value_type = ProjectLookUpTable.wire_shark_type_lookup_map[json_key]
                json_str_value = json_record[json_key]
                # change number string to number 
                if isinstance(json_str_value, int) or isinstance(json_str_value, float):
                    json_record[json_key] = json_value_type
                elif json_value_type in ProjectLookUpTable.wire_shark_number_type_set:
                    try:
                        if '0x' in json_str_value:
                            json_record[json_key] = int(json_str_value, 0)
                        elif '.' in json_str_value:
                            json_record[json_key] = float(json_str_value)
                        else:
                            json_record[json_key] = int(json_str_value)
                    except:
                        json_record[json_key] = 0
                elif json_value_type in ProjectLookUpTable.wire_shark_ipaddr_type_set:
                    try:
                        if ':' in json_str_value:
                            json_record[json_key] = IPV6_to_int(json_str_value)
                        else:
                            json_record[json_key] = IPV4_to_int(json_str_value)
                    except:
                        json_record[json_key] = json_str_value
                elif json_value_type == 'Ethernet or other MAC address':
                    try:
                        json_record[json_key] = int(macaddress.MAC(json_str_value))
                    except:
                        json_record[json_key] = json_str_value
                # elif json_value_type == 'Date and time':
                #     json_str_value = json_str_value[0:json_str_value.index('.')+6]
                #     json_record[json_key] = datetime.datetime.strptime(json_str_value, '%b %d, %Y %H:%M:%S.%f')





In [28]:
def get_wanted_packets(local_wanted_packets, local_json_record_list):
    rv = []
    idx = 0
    for record in local_json_record_list:
        if local_wanted_packets[idx] in record['_source']['layers']:
            rv.append(record)
            idx += 1
            if idx >= len(local_wanted_packets):
                return rv
    return rv

# drop use less fields by use drop_json_fields(json_record, field_name_path_list)

In [29]:

def json_field_drop(json_record, path_list):
    if path_list[0] not in json_record:
        return
    if(len(path_list) == 1):
        del json_record[path_list.pop(0)]
    else:
        json_record = json_record[path_list.pop(0)]
        json_field_drop(json_record, path_list)

def drop_json_fields(json_record, field_name_path_list):
    for name_path_str in field_name_path_list:
        json_field_drop(json_record, name_path_str.split('->'))


In [30]:
def start_drop_field_value_mapping(local_json_records, local_json_drop_field_path_list):
    for local_json_record in local_json_records:
        local_json_record = local_json_record['_source']['layers']
        drop_json_fields(local_json_record, local_json_drop_field_path_list)
        type_map_for_json(local_json_record)


In [31]:
def flatten_json_records(json_record_list):
    rv = []
    for record in json_record_list:
        record = record['_source']['layers']
        rv.append(flatten(record))
    return rv

# Only for IcedId

In [32]:
def IcedId_processing(local_key_field, local_IcedID_cookie_check_ls, first_http_record):
    if local_key_field in first_http_record:
        cookie_str = first_http_record[local_key_field]
        cookie_field_set = set()
        for tocken in cookie_str.split(';'):
            cookie_field_set.add(tocken.split('=')[0].strip())
        counter = 0
        for key in local_IcedID_cookie_check_ls:
            if key in cookie_field_set:
                counter+=1
        if counter == len(local_IcedID_cookie_check_ls):
            first_http_record['IcedIdCookie'] = 1
        else:
            first_http_record['IcedIdCookie'] = 0
    else:
        first_http_record['IcedIdCookie'] = 0


# Create the string to integer map so the ML process can run faster 


In [33]:

def map_str_to_int(local_record_str_to_int_map, local_flatten_json_records_list):
    for flatten_json_record in local_flatten_json_records_list:
        for field_name in flatten_json_record:
            if isinstance(flatten_json_record[field_name], str):
                if flatten_json_record[field_name] not in local_record_str_to_int_map:
                    local_record_str_to_int_map[flatten_json_record[field_name]] = len(local_record_str_to_int_map)+1
                flatten_json_record[field_name] = local_record_str_to_int_map[flatten_json_record[field_name]]



In [34]:
def get_df_header_of_each_col(local_num_record, malware_json_records_list_list, normal_json_records_list_list):
    local_col_name_on_offset = {}

    # init set
    for off_set in range(local_num_record):
        local_col_name_on_offset[off_set] = set()

    # get all unique fiels of each records on each offset
    for local_json_records_list in malware_json_records_list_list:
        for off_set in range(local_num_record):
            if off_set < len(local_json_records_list):
                for json_field in local_json_records_list[off_set]:
                    local_col_name_on_offset[off_set].add(json_field)

    # get all unique fiels of each records on each offset
    for local_json_records_list in normal_json_records_list_list:
        for off_set in range(local_num_record):
            if off_set < len(local_json_records_list):
                for json_field in local_json_records_list[off_set]:
                    local_col_name_on_offset[off_set].add(json_field)

    return local_col_name_on_offset

In [35]:
 # Cast the list of json records into one record and load into panda
def make_list_json_record_into_one_record(local_col_name_on_offset, local_json_records_list_list, local_num_record):

    # use the offset and unique fields name to build df
    local_overall_field_list = []
    for off_set in range(local_num_record):
        for field_name in local_col_name_on_offset[off_set]:
            local_overall_field_list.append(field_name+'_'+str(off_set))
    local_df = pd.DataFrame(columns=local_overall_field_list)
    try:
        # load data into the df
        for local_json_records_list in local_json_records_list_list:
            local_record_array = []
            for off_set in range(local_num_record):
                if off_set < len(local_json_records_list):
                    local_json_record = local_json_records_list[off_set]
                    for df_col_field in local_col_name_on_offset[off_set]:
                        if df_col_field in local_json_record:
                            local_record_array.append(local_json_record[df_col_field])
                        else:
                            local_record_array.append(0)
            local_df.loc[len(local_df.index)] = local_record_array
    except:
        print(local_record_array)
    return local_df



# Main


In [36]:
# important variables 
json_records_list_list = []
file_name_lists = []

for top_file_path in os.listdir("../../data/IcedId"):
    local_path = '../../data/IcedId/'+top_file_path + '/only/only.json'
    if os.path.exists(local_path):
        file_name_lists.append(local_path)

wanted_packets = ['http', 'tls', 'tls', 'tls', 'tls']
json_drop_field_path_list = ['frame->frame.time', 'frame->frame.time_delta_displayed', 
    'frame->frame.coloring_rule.name', 'frame->frame.coloring_rule.string',
    'ip->ip.addr', 'ip->ip.host', 'tcp->tcp.port','http->http.request.line',
    'tcp->tcp.payload', 
    'ip->ip.src_host', 'ip->ip.dst_host', 'http->http.cookie_tree',  
    'tls->tls.record->tls.handshake->tls.handshake.random',
    'tls->tls.record->tls.handshake->tls.handshake.random_tree',
    'tls->tls.record->tls.handshake.ciphersuites',
]
IcedID_cookie_check_ls = {'__gads', '_gat', '_ga', '_u', '__io', '_gid'}
record_str_to_int_map = {}

In [37]:
file_name_lists

['../../data/IcedId/2021-02-25 - TA551 (SHATHAK) BACK TO PUSHING ICEDID (BOKBOT)/only/only.json',
 '../../data/IcedId/2021-04-29 (THURSDAY) - TA551 (SHATHAK) PUSHES ICEDID (BOKBOT)/only/only.json',
 '../../data/IcedId/2021-05-24 (MONDAY) - TA551 (SHATHAK) WORD DOCS PUSH ICEDID (BOKBOT)/only/only.json',
 '../../data/IcedId/2021-06-02 (WEDNESDAY) - TA551 (SHATHAK) WORD DOCS PUSH ICEDID (BOKBOT)/only/only.json',
 '../../data/IcedId/2021-12-10 (FRIDAY) - TA551 (SHATHAK) ICEDID (BOKBOT) WITH COBALT STRIKE AND DARK VNC/only/only.json',
 '../../data/IcedId/2022-01-05 (WEDNESDAY) - TA551 (SHATHAK) PUSHES ICEDID (BOKBOT) WITH COBALT STRIKE/only/only.json',
 '../../data/IcedId/2022-01-06 (THURSDAY) - TA551 (SHATHAK) PUSHES ICEDID (BOKBOT)/only/only.json',
 '../../data/IcedId/2022-01-12 (WEDNESDAY) - ICEDID (BOKBOT) WITH COBALT STRIKE AND DARKVNC/only/only.json',
 '../../data/IcedId/2022-05-10 (TUESDAY) - TA578 CONTACT FORMS CAMPAIGN -- ICEDID (BOKBOT) -- COBALT STRIKE/only/only.json',
 '../../da

In [38]:

for  pcap_json_file_name in file_name_lists: 
    # Read Json files
    json_record_list = file_json_file_with_file_name(pcap_json_file_name)
    # make sure the start
    json_record_list = get_wanted_packets(wanted_packets, json_record_list)
    # Drop not wanted fields 
    start_drop_field_value_mapping(json_record_list,json_drop_field_path_list)
    # Flate Json into field value map
    flatten_json_records_list = flatten_json_records(json_record_list)
    
    # For IcedId, take the cookie and make them into fields 
    IcedId_processing("http_http.cookie", IcedID_cookie_check_ls, flatten_json_records_list[0])

    # Map String into int
    map_str_to_int(record_str_to_int_map, flatten_json_records_list)
    # save current file to list
    json_records_list_list.append(flatten_json_records_list)


# Read and output the normal traffic 

In [39]:
def get_normal_wanted_packets( local_wanted_packets, local_json_list):
    local_valid_record_List_group = []
    for local_json_list_idx in range(len(local_json_list)):
        if local_json_list_idx+len(local_wanted_packets)-1 >= len(local_json_list):
            break
        json_record = local_json_list[local_json_list_idx]
        if local_wanted_packets[0] in json_record['_source']['layers'] and local_wanted_packets[0] not in local_json_list[local_json_list_idx-1]['_source']['layers']:
            local_valid_record_List_group.append(get_wanted_packets(wanted_packets, local_json_list[local_json_list_idx:]))
    return local_valid_record_List_group


In [40]:
normal_json_records_list_list = []

normal_file_name_lists = []
for top_file_path in os.listdir("../../data/IcedId_normal"):
    if top_file_path.endswith(".json"):
        local_path = '../../data/IcedId_normal/'+top_file_path
        normal_file_name_lists.append(local_path)

normal_file_name_lists

['../../data/IcedId_normal/only1.json',
 '../../data/IcedId_normal/only10.json',
 '../../data/IcedId_normal/only2.json',
 '../../data/IcedId_normal/only3.json',
 '../../data/IcedId_normal/only4.json',
 '../../data/IcedId_normal/only5.json',
 '../../data/IcedId_normal/only6.json',
 '../../data/IcedId_normal/only7.json',
 '../../data/IcedId_normal/only8.json',
 '../../data/IcedId_normal/only9.json']

In [41]:
for  pcap_json_file_name in normal_file_name_lists: 
    # Read Json files
    json_record_list = file_json_file_with_file_name(pcap_json_file_name)

    # make sure the start
    json_record_list_group = get_normal_wanted_packets(wanted_packets, json_record_list)

    for json_list in json_record_list_group:
        # Drop not wanted fields 
        start_drop_field_value_mapping(json_list,json_drop_field_path_list)
        # Flate Json into field value map
        flatten_json_records_list = flatten_json_records(json_list)
        # For IcedId, take the cookie and make them into fields 
        IcedId_processing("http_http.cookie", IcedID_cookie_check_ls, flatten_json_records_list[0])
        # Map String into int
        map_str_to_int(record_str_to_int_map, flatten_json_records_list)
        # save current file to list
        normal_json_records_list_list.append(flatten_json_records_list)
    #break

# Output data table to CSV file 

In [42]:
col_name_on_offset = get_df_header_of_each_col(len(wanted_packets), json_records_list_list, []) 

In [43]:
# cast the IceId record into one record 
IcedId_df = make_list_json_record_into_one_record(col_name_on_offset, json_records_list_list, len(wanted_packets))
IcedId_df.to_csv('IcedId_only_data_set.csv', index=False)
IcedId_df

Unnamed: 0,ip_ip.len_0,eth_eth.dst_tree_eth.addr_resolved_0,tcp_tcp.flags_tree_tcp.flags.res_0,eth_eth.src_tree_eth.addr_resolved_0,tcp_tcp.flags_tree_tcp.flags.urg_0,tcp_tcp.flags_tree_tcp.flags.ae_0,eth_eth.src_tree_eth.lg_0,eth_eth.dst_tree_eth.ig_0,http_GET / HTTP/1.1\r\n__ws.expert__ws.expert.message_0,eth_eth.dst_tree_eth.addr.oui_resolved_0,...,tls_tls.record_tls.handshake_Extension: signature_algorithms (len=20)_tls.handshake.sig_hash_algs_tls.handshake.sig_hash_alg_tree_tls.handshake.sig_hash_sig_4,tcp_Timestamps_tcp.time_delta_4,tcp_tcp.urgent_pointer_4,eth_eth.dst_tree_eth.addr_4,tls_tls.record_tls.handshake_Extension: server_name (len=21)_Server Name Indication extension_tls.handshake.extensions_server_name_len_4,tls_tls.record_tls.handshake_Extension: supported_groups (len=8)_tls.handshake.extensions_supported_groups_length_4,tcp_tcp.hdr_len_4,tls_tls.record_tls.handshake_Extension: post_handshake_auth (len=0)_tls.handshake.extension.type_4,tcp_tcp.window_size_scalefactor_4,tls_tls.record_tls.handshake_Extension: server_name (len=15)_Server Name Indication extension_tls.handshake.extensions_server_name_4
0,278.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,8.0,3.0,...,0.0,0.0,0.0,36168640000000.0,0.0,0.0,20.0,0.0,-1.0,0.0
1,348.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,8.0,3.0,...,0.0,0.0,0.0,36168640000000.0,16.0,6.0,20.0,0.0,-1.0,0.0
2,345.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,8.0,3.0,...,0.0,0.0,0.0,36168640000000.0,16.0,6.0,20.0,0.0,-1.0,0.0
3,320.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,8.0,3.0,...,0.0,0.0,0.0,36168640000000.0,16.0,6.0,20.0,0.0,-1.0,0.0
4,346.0,43.0,0.0,45.0,0.0,0.0,0.0,0.0,8.0,44.0,...,0.0,0.0,0.0,228076100000.0,0.0,6.0,20.0,0.0,-1.0,0.0
5,339.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,8.0,3.0,...,0.0,0.0,0.0,36168640000000.0,0.0,6.0,20.0,0.0,-1.0,54.0
6,343.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,8.0,3.0,...,0.0,0.0,0.0,36168640000000.0,0.0,6.0,20.0,0.0,-1.0,0.0
7,336.0,64.0,0.0,65.0,0.0,0.0,0.0,0.0,8.0,44.0,...,0.0,0.0,0.0,13268890000000.0,0.0,6.0,20.0,0.0,-1.0,0.0
8,347.0,71.0,0.0,72.0,0.0,0.0,0.0,0.0,8.0,44.0,...,0.0,0.0,0.0,5418587000.0,16.0,6.0,20.0,0.0,-1.0,0.0
9,343.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,8.0,3.0,...,0.0,0.0,0.0,36168640000000.0,0.0,6.0,20.0,0.0,-1.0,0.0


In [44]:
# cast the normal record into one record 
normal_df = make_list_json_record_into_one_record(col_name_on_offset, normal_json_records_list_list, len(wanted_packets))
normal_df.to_csv('normal_IcedId_only_data_set.csv', index=False)
normal_df

[152, 111, 0, 113, 0, 0, 0, 0, 0, 112, 0, 337, 0, 166, 6, 0, 1, 2048, 0, 7621928, 2, 0, 3900358604, 166, 114, 11808409, 1021, 0, 1, 1, 1, 0, 0, 448, 0, 112, 20, 112, 0, 80, 0, 0, 19379, 64545, 0, 0, 0, 112, 7621928, 0, 114, 1114945811, 0, 0, 1666988174.225725, 198112232540353, 0, 0, 0, 0, 0, 7, 2, 127874741723750, 0, 0, 3232237710, 113, 24, 117, 51804, 4, 128, 1, 382, 196.224102, 2, 0.0, 0, 0.571264, 11808409, 0, 180.068648, 0, 0, 120, 198112232540353, 1021, 1, 111, 449, 6, 0, 0, 1, 93, 8, 2398856037, 60.01891, 0, 127874741723750, 4, 20, -1, 0, 161, 162, 111, 0, 158, 0, 0, 113, 0, 0, 0, 158, 0, 158, 158, 158, 112, 158, 157, 158, 0, 157, 162, 162, 158, 5546, 0, 0, 5583, 162, 0, 0, 0, 0, 161, 158, 0, 0, 0, 161, 162, 160, 0, 0, 0, 161, 162, 158, 161, 0, 0, 161, 157, 157, 0, 162, 0, 0, 161, 161, 0, 114, 0, 160, 157, 158, 18, 157, 158, 158, 0, 0, 158, 157, 0, 0, 157, 0, 0, 0, 0, 0, 162, 0, 112, 158, 0, 161, 0, 161, 0, 5545, 161, 161, 158, 0, 161, 0, 0, 161, 0, 157, 160, 0, 0, 0, 114, 157, 0

Unnamed: 0,ip_ip.len_0,eth_eth.dst_tree_eth.addr_resolved_0,tcp_tcp.flags_tree_tcp.flags.res_0,eth_eth.src_tree_eth.addr_resolved_0,tcp_tcp.flags_tree_tcp.flags.urg_0,tcp_tcp.flags_tree_tcp.flags.ae_0,eth_eth.src_tree_eth.lg_0,eth_eth.dst_tree_eth.ig_0,http_GET / HTTP/1.1\r\n__ws.expert__ws.expert.message_0,eth_eth.dst_tree_eth.addr.oui_resolved_0,...,tls_tls.record_tls.handshake_Extension: signature_algorithms (len=20)_tls.handshake.sig_hash_algs_tls.handshake.sig_hash_alg_tree_tls.handshake.sig_hash_sig_4,tcp_Timestamps_tcp.time_delta_4,tcp_tcp.urgent_pointer_4,eth_eth.dst_tree_eth.addr_4,tls_tls.record_tls.handshake_Extension: server_name (len=21)_Server Name Indication extension_tls.handshake.extensions_server_name_len_4,tls_tls.record_tls.handshake_Extension: supported_groups (len=8)_tls.handshake.extensions_supported_groups_length_4,tcp_tcp.hdr_len_4,tls_tls.record_tls.handshake_Extension: post_handshake_auth (len=0)_tls.handshake.extension.type_4,tcp_tcp.window_size_scalefactor_4,tls_tls.record_tls.handshake_Extension: server_name (len=15)_Server Name Indication extension_tls.handshake.extensions_server_name_4
0,152.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,0.0,20.0,0.0,-1.0,0.0
1,289.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,6.0,20.0,0.0,-1.0,0.0
2,289.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,156.0,161.0,1.590000e+02,0.0,161.0,162.0,0.0,164.0,0.0
3,405.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,0.0,20.0,0.0,-1.0,0.0
4,152.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,0.0,20.0,0.0,-1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,253.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,6.0,20.0,0.0,-1.0,0.0
403,152.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,0.0,20.0,0.0,-1.0,0.0
404,152.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,0.0,20.0,0.0,-1.0,5558.0
405,152.0,111.0,0.0,113.0,0.0,0.0,0.0,0.0,0.0,112.0,...,0.0,0.0,0.0,1.278747e+14,0.0,0.0,20.0,0.0,-1.0,0.0


In [45]:
# output flated data and str map to files
with open('IcedId_record_str_to_int_map.json', 'w') as fp:
    json.dump(record_str_to_int_map, fp)

# Free and testing space, do not run the following 

In [None]:
wire_shark_type = set()
for key,val in ProjectLookUpTable.wire_shark_type_lookup_map.items():
    wire_shark_type.add(val)
wire_shark_type

In [None]:

wire_shark_type_lookup_map = {}
wire_shark_description_lookup_map = {}
for i in range(len(field_list)):
    wire_shark_type_lookup_map[field_list[i]] = type_list[i]
    wire_shark_description_lookup_map[field_list[i]] = description_list[i]
