# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import json # original json library

import pandas as pd
import logging

## Log set-up

In [2]:
# Logging configuration for the notebook.
# NOTE(review): absolute, user-specific path -- consider deriving it from a configurable base directory.
LOG_FILENAME = '/home/benjamin/Folders_Python/Cyber/logs/logfile.log'
# Record layout: timestamp -- logger name -- level -- message.
# The asctime field needs the "s" (string) conversion; "%(asctime)%" is not a string
# conversion, so the timestamp would not be rendered.
LOG_FORMAT = '%(asctime)s -- %(name)s -- %(levelname)s -- %(message)s'
# LOG_LEVEL = logging.INFO

In [3]:
# Module-specific logger with two outputs: a log file (INFO and up) and the console (WARNING and up).
logger = logging.getLogger(__name__)   # creates specific logger for the module
logger.setLevel(logging.DEBUG)    # logger-level threshold, checked before records reach the handlers
LOG_FORMAT = '%(asctime)s -- %(name)s -- %(levelname)s -- %(message)s'
formatter = logging.Formatter(LOG_FORMAT)

# getLogger() returns the same logger object on every call, so re-running this cell
# would otherwise stack another pair of handlers on it and duplicate every message.
# Closing the old handlers also releases the previously opened log file.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# file handler; mode='w' truncates the log file each time the handler is created
file_handler = logging.FileHandler(LOG_FILENAME, mode='w')
file_handler.setLevel(logging.INFO)  # INFO and up are written to the file; DEBUG records are dropped here
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# stream handler to show messages to the console
console = logging.StreamHandler()
console.setLevel(logging.WARNING)  # Warning messages and up get displayed to the console
console.setFormatter(formatter)
logger.addHandler(console)

# Import pcap file

In [4]:
# NB : tshark -r <file>.pcap -T json > <file_pcap>.json -t r
# commande shell qui prend un pcap et le passe en json

!rm /home/benjamin/Folders_Python/Cyber/data/outputs/exemple_pcap.json
!tshark -r /home/benjamin/Folders_Python/Cyber/data/input_pcaps/input.pcap -T json -t r > /home/benjamin/Folders_Python/Cyber/data/outputs/exemple_pcap.json

logger.info("run tshark from input.pcap to creat json")

In [5]:
class Packet():
    """Flatten one packet-like dict (tshark JSON export) into a single-level dict of selected fields.

    The constructor only stores the raw packet. The flat dict is built on the first
    access to ``packet_data`` and cached for later accesses.
    """

    def __init__(self, raw_packet:dict) -> None:
        """Store the raw packet; no parsing happens here.

        Args:
            raw_packet: packet-like dict; fields are read from
                ``raw_packet['_source']['layers']``.
        """
        self.raw_packet = raw_packet
        self._packet_data = None
        logger.debug('constructor of Packet instance has finished')

    @property
    def packet_data(self):
        """Return the flat dict of features, building and caching it on first access.

        Returns:
            dict whose keys are the feature names listed below. A feature is ``None``
            when its layer or field is absent from the raw packet.
        """
        if self._packet_data is not None:
            return self._packet_data

        # `or {}` turns a missing (or None) level into an empty dict, so the chained
        # .get() calls below yield None instead of raising AttributeError.
        layers = (self.raw_packet.get('_source') or {}).get('layers') or {}
        frame = layers.get('frame') or {}
        eth = layers.get('eth') or {}
        ip = layers.get('ip') or {}
        udp = layers.get('udp') or {}
        tcp = layers.get('tcp') or {}

        self._packet_data = {
            'frame_time': frame.get('frame.time'),
            'frame_time_relative': frame.get('frame.time_relative'),
            'frame_length': frame.get("frame.len"),
            'frame_protocols': frame.get("frame.protocols"),
            'eth_source': eth.get("eth.src"),
            'eth_dest': eth.get("eth.dst"),
            'ip_version': ip.get("ip.version"),
            'ip_header_length': ip.get("ip.hdr_len"),
            'ip_length': ip.get("ip.len"),
            'ip_id': ip.get("ip.id"),
            'ip_flags': ip.get("ip.flags"),
            'ip_ttl': ip.get("ip.ttl"),
            'ip_proto': ip.get("ip.proto"),
            'ip_source': ip.get("ip.src"),
            'ip_dest': ip.get("ip.dst"),
            'udp_source_port': udp.get("udp.srcport"),
            # destination-specific field, mirroring "tcp.dstport" below
            'udp_dest_port': udp.get("udp.dstport"),
            'udp_length': udp.get("udp.length"),
            'tcp_source_port': tcp.get("tcp.srcport"),
            'tcp_dest_port': tcp.get("tcp.dstport"),
            'tcp_length': tcp.get("tcp.len"),
            'tcp_flags': tcp.get("tcp.flags"),
        }
        logger.debug('packet_data @property method has finished')
        return self._packet_data

    @packet_data.setter
    def packet_data(self, input):
        """Reject writes: log a warning and leave the cached data untouched."""
        logger.warning('Illegal attempt to write a data_packet in a packet object')

In [6]:
PCAP_FILENAME = "/home/benjamin/Folders_Python/Cyber/data/outputs/exemple_pcap.json"

# Explicit UTF-8 so decoding does not depend on the machine's locale setting.
with open (PCAP_FILENAME, encoding="utf-8") as raw_packets:
    json_object = json.load(raw_packets)    # loads the JSON file into a Python structure (list of dicts)

In [7]:
# Example: the first dict of the list is one packet (= one Ethernet frame)

json_object[0]

{'_index': 'packets-2023-06-13',
 '_type': 'doc',
 '_score': None,
 '_source': {'layers': {'frame': {'frame.encap_type': '1',
    'frame.time': 'Jun 13, 2023 18:58:42.019861000 CEST',
    'frame.offset_shift': '0.000000000',
    'frame.time_epoch': '1686675522.019861000',
    'frame.time_delta': '0.000000000',
    'frame.time_delta_displayed': '0.000000000',
    'frame.time_relative': '0.000000000',
    'frame.number': '1',
    'frame.len': '84',
    'frame.cap_len': '84',
    'frame.marked': '0',
    'frame.ignored': '0',
    'frame.protocols': 'eth:ethertype:ip:udp:dns'},
   'eth': {'eth.dst': '9c:9d:7e:91:92:4b',
    'eth.dst_tree': {'eth.dst_resolved': '9c:9d:7e:91:92:4b',
     'eth.dst.oui': '10263934',
     'eth.dst.oui_resolved': 'Beijing Xiaomi Mobile Software Co., Ltd',
     'eth.addr': '9c:9d:7e:91:92:4b',
     'eth.addr_resolved': '9c:9d:7e:91:92:4b',
     'eth.addr.oui': '10263934',
     'eth.addr.oui_resolved': 'Beijing Xiaomi Mobile Software Co., Ltd',
     'eth.dst.lg': 

In [8]:
# example instantiation of a Packet object
p = Packet(json_object[0])

p.packet_data

{'frame_time': 'Jun 13, 2023 18:58:42.019861000 CEST',
 'frame_time_relative': '0.000000000',
 'frame_length': '84',
 'frame_protocols': 'eth:ethertype:ip:udp:dns',
 'eth_source': '6c:88:14:eb:a4:5c',
 'eth_dest': '9c:9d:7e:91:92:4b',
 'ip_version': '4',
 'ip_header_length': '20',
 'ip_length': '70',
 'ip_id': '0x0000f987',
 'ip_flags': '0x00000040',
 'ip_ttl': '64',
 'ip_proto': '17',
 'ip_source': '192.168.31.236',
 'ip_dest': '192.168.31.1',
 'udp_source_port': '40271',
 'udp_dest_port': '53',
 'udp_length': '50',
 'tcp_source_port': None,
 'tcp_dest_port': None,
 'tcp_length': None,
 'tcp_flags': None}

# Produce DataFrame for Raw Packets analysis

In [9]:
# builds the list of packet_data dicts, one per entry of json_object
packets = [ Packet(d).packet_data for d in json_object ]

In [10]:
# one row per dict in `packets`, one column per packet_data key
df_packets = pd.DataFrame(packets)

In [11]:
# summary of every column (include='all'), transposed so that each column is shown as a row
df_packets.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq
frame_time,2293,2258,"Jun 13, 2023 18:58:58.299417000 CEST",11
frame_time_relative,2293,2258,16.279556000,11
frame_length,2293,408,66,712
frame_protocols,2293,18,eth:ethertype:ip:tcp,970
eth_source,2293,3,9c:9d:7e:91:92:4b,1279
eth_dest,2293,7,6c:88:14:eb:a4:5c,1278
ip_version,2282,1,4,2282
ip_header_length,2282,2,20,2280
ip_length,2282,407,52,712
ip_id,2282,1939,0x00000000,252


# EVE JSON Output by Suricata

In [12]:
# run Suricata to produce an eve.json file with alerts

!rm /home/benjamin/Folders_Python/Cyber/data/outputs/eve.json
!suricata -r /home/benjamin/Folders_Python/Cyber/data/input_pcaps/input.pcap -l /home/benjamin/Folders_Python/Cyber/data/outputs

logger.info("run Suricata to reassemble flows and create alert logs")

[32m22/6/2023 -- 18:25:00[0m - <[33mInfo[0m> - Configuration node 'af-packet' redefined.[0m
[32m22/6/2023 -- 18:25:00[0m - <[1;33mNotice[0m> - [33mThis is Suricata version 6.0.1 RELEASE running in USER mode[0m
[32m22/6/2023 -- 18:26:11[0m - <[1;33mNotice[0m> - [33mall 5 packet processing threads, 4 management threads initialized, engine started.[0m
[32m22/6/2023 -- 18:26:11[0m - <[1;33mNotice[0m> - [33mSignal Received.  Stopping engine.[0m
[32m22/6/2023 -- 18:26:11[0m - <[1;33mNotice[0m> - [33mPcap-file module read 1 files, 2293 packets, 1017282 bytes[0m


In [13]:
# pandas.json_normalize flattens nested JSON fields into DataFrame columns.
# Resulting columns use dot notation to signify nested objects, similar to how Elasticsearch does it.

SURICATA_EVE_LOG = "/home/benjamin/Folders_Python/Cyber/data/outputs/eve.json"

# The file handle gets its own name: binding it to `packets` would overwrite the
# list of packet dicts built earlier in the notebook.
with open (SURICATA_EVE_LOG) as eve_file:
    df = pd.json_normalize(
        # one JSON document per line; blank lines are skipped because json.loads would raise on them
        [json.loads(line) for line in eve_file if line.strip()],
        max_level=1   # flatten only the first level of nesting
    )

In [14]:
# display the normalized EVE events (every event type)
df

Unnamed: 0,timestamp,flow_id,pcap_cnt,event_type,src_ip,src_port,dest_ip,dest_port,proto,community_id,...,stats.uptime,stats.decoder,stats.flow,stats.defrag,stats.flow_bypassed,stats.tcp,stats.detect,stats.app_layer,stats.http,stats.ftp
0,2023-06-13T18:58:42.101468+0200,1.396473e+15,9.0,dns,192.168.31.236,34027.0,192.168.31.1,53.0,UDP,1:VGOAqxAYiYx9ZyuiuCZSzAC13hM=,...,,,,,,,,,,
1,2023-06-13T18:58:42.019861+0200,1.767949e+15,1.0,dns,192.168.31.236,40271.0,192.168.31.1,53.0,UDP,1:YRxFxmGj80Z8ohk84hjQWzT6mR8=,...,,,,,,,,,,
2,2023-06-13T18:58:42.101498+0200,1.396473e+15,10.0,dns,192.168.31.236,34027.0,192.168.31.1,53.0,UDP,1:VGOAqxAYiYx9ZyuiuCZSzAC13hM=,...,,,,,,,,,,
3,2023-06-13T18:58:42.019888+0200,1.767949e+15,2.0,dns,192.168.31.236,40271.0,192.168.31.1,53.0,UDP,1:YRxFxmGj80Z8ohk84hjQWzT6mR8=,...,,,,,,,,,,
4,2023-06-13T18:58:42.103914+0200,1.396473e+15,11.0,dns,192.168.31.236,34027.0,192.168.31.1,53.0,UDP,1:VGOAqxAYiYx9ZyuiuCZSzAC13hM=,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,2023-06-13T18:58:42.019861+0200,4.156138e+14,,flow,192.168.31.236,47177.0,192.168.31.1,53.0,UDP,1:4hyfPkdxRpInWoa8nTBQszsGeK0=,...,,,,,,,,,,
292,2023-06-13T18:58:42.019861+0200,2.106040e+15,,flow,192.168.31.236,40993.0,192.168.31.1,53.0,UDP,1:MI1pFoL5hTO9KkZKyhi16NZ52TU=,...,,,,,,,,,,
293,2023-06-13T18:58:42.019861+0200,5.594136e+14,,flow,192.168.31.236,57160.0,192.168.31.1,53.0,UDP,1:Q7xwq+PvDkXDHAPTtQ0Qz8OOwuc=,...,,,,,,,,,,
294,2023-06-13T18:58:42.019861+0200,1.372441e+14,,flow,192.168.31.236,39154.0,142.250.179.67,80.0,TCP,1:tJEFXE7vv3VmxgnxnMVASKcITws=,...,,,,,,,,,,


In [15]:
# keep only the rows whose event_type is 'flow'
df[df['event_type']=='flow']

Unnamed: 0,timestamp,flow_id,pcap_cnt,event_type,src_ip,src_port,dest_ip,dest_port,proto,community_id,...,stats.uptime,stats.decoder,stats.flow,stats.defrag,stats.flow_bypassed,stats.tcp,stats.detect,stats.app_layer,stats.http,stats.ftp
219,2023-06-13T18:58:42.019861+0200,1.894178e+15,,flow,192.168.31.236,55418.0,172.217.20.202,443.0,TCP,1:PEhm84G/YMXxCh4/kAgR41kzmFQ=,...,,,,,,,,,,
220,2023-06-13T18:58:42.019861+0200,1.581692e+14,,flow,192.168.31.236,43722.0,35.174.110.81,443.0,TCP,1:pjczzSlfvHDxkdRFdIJJkaofAXI=,...,,,,,,,,,,
221,2023-06-13T18:58:42.019861+0200,1.759062e+15,,flow,192.168.31.236,35700.0,34.160.144.191,443.0,TCP,1:xrzzfBsCDEGUmBQdG0SH0ePYpos=,...,,,,,,,,,,
222,2023-06-13T18:58:42.019861+0200,2.144390e+15,,flow,192.168.31.236,54220.0,34.250.170.156,443.0,TCP,1:FXnLyP461w9j2mjBqY6QMoRP9a8=,...,,,,,,,,,,
223,2023-06-13T18:58:42.019861+0200,1.127964e+15,,flow,192.168.31.236,36798.0,142.250.74.227,443.0,UDP,1:+A7ef/4X6SmVn7YvAoOfZTXqslA=,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,2023-06-13T18:58:42.019861+0200,2.241529e+15,,flow,192.168.31.236,53550.0,185.60.219.35,443.0,UDP,1:32StlF8z0yrnbE+eA2zbpOQLSlM=,...,,,,,,,,,,
291,2023-06-13T18:58:42.019861+0200,4.156138e+14,,flow,192.168.31.236,47177.0,192.168.31.1,53.0,UDP,1:4hyfPkdxRpInWoa8nTBQszsGeK0=,...,,,,,,,,,,
292,2023-06-13T18:58:42.019861+0200,2.106040e+15,,flow,192.168.31.236,40993.0,192.168.31.1,53.0,UDP,1:MI1pFoL5hTO9KkZKyhi16NZ52TU=,...,,,,,,,,,,
293,2023-06-13T18:58:42.019861+0200,5.594136e+14,,flow,192.168.31.236,57160.0,192.168.31.1,53.0,UDP,1:Q7xwq+PvDkXDHAPTtQ0Qz8OOwuc=,...,,,,,,,,,,


## Produce DataFrame for Flow Analysis

In [16]:
# from Suricata doc :

# 15.1.2.12. Event type: Flow
# 15.1.2.12.1. Fields

#     “pkts_toserver”: total number of packets to server, include bypassed packets
#     “pkts_toclient”: total number of packets to client
#     “bytes_toserver”: total bytes count to server
#     “bytes_toclient”: total bytes count to client
#     “bypassed.pkts_toserver”: number of bypassed packets to server
#     “bypassed.pkts_toclient”: number of bypassed packets to client
#     “bypassed.bytes_toserver”: bypassed bytes count to server
#     “bypassed.bytes_toclient”: bypassed bytes count to client
#     “start”: date of start of the flow
#     “end”: date of end of flow (last seen packet)
#     “age”: duration of the flow
#     “bypass”: if the flow has been bypassed, it is set to “local” (internal bypass) or “capture”
#     “state”: display state of the flow (include “new”, “established”, “closed”, “bypassed”)
#     “reason”: mechanism that did trigger the end of the flow (include “timeout”, “forced” and “shutdown”)
#     “alerted”: “true” or “false” depending if an alert has been seen on flow

In [17]:
# https://www.stamus-networks.com/blog/jupyter-playbooks-for-suricata-part-1

# https://malware-traffic-analysis.net/

In [18]:
class Flow():
    """Utility class - takes a flow event dict (one parsed line of eve.json) and
       creates a one-level dict structure, suitable for dataframe creation
    """

    # keys read directly from the event dict
    TOP_LEVEL_KEYS = (
        'timestamp',
        'flow_id',
        'src_ip',
        'src_port',
        'dest_ip',
        'dest_port',
        'proto',
    )
    # keys read from the nested 'flow' dict of the event
    FLOW_LEVEL_KEYS = (
        'pkts_toserver',
        'pkts_toclient',
        'bytes_toserver',
        'bytes_toclient',
        'start',
        'end',
        'age',
        'state',
        'reason',
        'alerted',
    )

    def __init__(self, flow_event:dict):
        """Store the raw flow event; the flat features are built lazily.

        Args:
            flow_event: event dict whose 'event_type' entry is 'flow'.

        Raises:
            ValueError: if the event's 'event_type' is anything other than 'flow'.
        """
        if flow_event.get('event_type') != 'flow':
            logger.critical("Attempt to build a Flow instance with a non-flow event")
            raise ValueError(
                "Flow requires an event whose 'event_type' is 'flow', "
                f"got {flow_event.get('event_type')!r}"
            )
        self._raw_flow_event = flow_event
        self._features = None

    @property
    def features(self):
        """Return the flat feature dict, building and caching it on first access.

        Returns:
            dict with the TOP_LEVEL_KEYS followed by the FLOW_LEVEL_KEYS; a key that
            is absent from the event (or from its 'flow' dict) maps to ``None``.
        """
        if self._features is None:
            event = self._raw_flow_event
            # `or {}`: an event without a 'flow' dict yields None values instead of AttributeError
            flow_part = event.get('flow') or {}
            top_level = { k: event.get(k) for k in self.TOP_LEVEL_KEYS }
            flow_level = { k: flow_part.get(k) for k in self.FLOW_LEVEL_KEYS }
            self._features = { **top_level, **flow_level }
            logger.info("built a Flow features object")
        return self._features

    @features.setter
    def features(self, input):
        """Reject writes: log a critical message and leave the cached features untouched."""
        logger.critical("illegal attempt to hard write features in a Flow object")

    def __str__(self) -> str:
        """Return the features as an indented JSON string."""
        return json.dumps(self.features, indent=4)

    def __repr__(self) -> str:
        """Same text as ``__str__``."""
        return self.__str__()


In [19]:
# Example of a flow event in eve.json:
# {
# "timestamp":"2023-06-17T10:46:05.765744+0200",
# "flow_id":860724109937755,
# "event_type":"flow",
# "src_ip":"2a01:cb19:872e:3000:0e4f:3187:540c:d66c",
# "src_port":47864,
# "dest_ip":"2a00:1450:4007:081a:0000:0000:0000:2003",
# "dest_port":80,
# "proto":"TCP",
# "flow":
#     {"pkts_toserver":6,
#     "pkts_toclient":5,
#     "bytes_toserver":516,
#     "bytes_toclient":430,
#     "start":"2023-06-17T10:46:10.625755+0200",
#     "end":"2023-06-17T10:46:44.150502+0200",
#     "age":34,
#     "state":"new",
#     "reason":"shutdown",
#     "alerted":true},
# "community_id":"1:uRhWV544zvWeIohZCmryZHXZ5EA=",
# "tcp":
#     {"tcp_flags":"00",
#     "tcp_flags_ts":"00",
#     "tcp_flags_tc":"00"
#     }
# }

In [20]:
SURICATA_EVE_LOG = "/home/benjamin/Folders_Python/Cyber/data/outputs/eve.json"

# Column order of the flow table: event-level fields first, then the fields that
# Flow.features reads from the nested 'flow' dict.
columns_names =  [
                'timestamp',
                'flow_id',
                'src_ip',
                'src_port',
                'dest_ip',
                'dest_port',
                'proto'
            ] + [
                'pkts_toserver',
                'pkts_toclient',
                'bytes_toserver',
                'bytes_toclient',
                'start',
                'end',
                'age',
                'state',
                'reason',
                'alerted'
            ]
# column name -> list of values, one value appended per flow event
dict_for_dataframe = { k:[] for k in columns_names }

with open (SURICATA_EVE_LOG) as f:
    for event_string in f:
        if not event_string.strip():
            continue   # json.loads would raise on a blank line
        python_object = json.loads(event_string)
        if python_object.get('event_type')=='flow':
            flow = Flow(python_object)
            # list.append also covers the still-empty list, so no special case for the first event
            for k in columns_names:
                dict_for_dataframe[k].append(flow.features.get(k))

In [21]:
# one column per key of dict_for_dataframe; each list holds one value per collected flow event
df_flow = pd.DataFrame(data=dict_for_dataframe)


In [22]:
# display the flow DataFrame
df_flow

Unnamed: 0,timestamp,flow_id,src_ip,src_port,dest_ip,dest_port,proto,pkts_toserver,pkts_toclient,bytes_toserver,bytes_toclient,start,end,age,state,reason,alerted
0,2023-06-13T18:58:42.019861+0200,1894177756203983,192.168.31.236,55418.0,172.217.20.202,443.0,TCP,13,11,1711,6061,2023-06-13T18:58:52.773071+0200,2023-06-13T18:59:29.774565+0200,37,closed,shutdown,False
1,2023-06-13T18:58:42.019861+0200,158169154957347,192.168.31.236,43722.0,35.174.110.81,443.0,TCP,505,711,65798,561594,2023-06-13T18:58:51.307235+0200,2023-06-13T18:59:29.848786+0200,38,closed,shutdown,False
2,2023-06-13T18:58:42.019861+0200,1759062379374233,192.168.31.236,35700.0,34.160.144.191,443.0,TCP,9,7,758,4928,2023-06-13T18:58:42.304793+0200,2023-06-13T18:58:42.356796+0200,0,closed,shutdown,False
3,2023-06-13T18:58:42.019861+0200,2144389665888326,192.168.31.236,54220.0,34.250.170.156,443.0,TCP,38,35,12872,12117,2023-06-13T18:58:51.038982+0200,2023-06-13T18:59:29.790487+0200,38,closed,shutdown,False
4,2023-06-13T18:58:42.019861+0200,1127964180934377,192.168.31.236,36798.0,142.250.74.227,443.0,UDP,12,25,4039,28879,2023-06-13T18:58:58.192233+0200,2023-06-13T18:59:06.377006+0200,8,established,shutdown,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2023-06-13T18:58:42.019861+0200,2241528943545983,192.168.31.236,53550.0,185.60.219.35,443.0,UDP,8,10,2523,3235,2023-06-13T18:59:27.069247+0200,2023-06-13T18:59:27.179737+0200,0,established,shutdown,False
72,2023-06-13T18:58:42.019861+0200,415613789065546,192.168.31.236,47177.0,192.168.31.1,53.0,UDP,2,2,170,347,2023-06-13T18:58:42.840010+0200,2023-06-13T18:58:42.862914+0200,0,established,shutdown,False
73,2023-06-13T18:58:42.019861+0200,2106039902984594,192.168.31.236,40993.0,192.168.31.1,53.0,UDP,2,2,160,274,2023-06-13T18:58:52.842130+0200,2023-06-13T18:58:52.846715+0200,0,established,shutdown,False
74,2023-06-13T18:58:42.019861+0200,559413589067417,192.168.31.236,57160.0,192.168.31.1,53.0,UDP,2,2,170,262,2023-06-13T18:58:42.870041+0200,2023-06-13T18:58:42.872355+0200,0,established,shutdown,False
