# Basic manipulations with PyShark

### Here, we use PyShark to parse the packets of a pcap file, build DataFrames from them and store them in pickle format

In [15]:
# PATH change to access library

import sys
sys.path.append('/home/benjamin/Folders_Python/Cyber/libs')

# Imports

import numpy as np
import pandas as pd
import pyshark
import matplotlib.pyplot as plt
import json # original json library
import logging

# Home made library

import cyberlib as cbl

In [16]:
# logging set-up for debug
#
# Configures the notebook's logger with two handlers:
#   * a file handler that records everything (DEBUG and up) in LOG_FILENAME,
#     truncating the file on each set-up (mode='w');
#   * a console handler that only shows WARNING and up.
# The cell is safe to re-run: handlers installed by a previous execution are
# removed and closed first, otherwise every message would be emitted once per
# execution of this cell and the old file handles would stay open.

LOG_FILENAME = '/home/benjamin/Folders_Python/Cyber/logs/logfile.log'
# Single definition of the record layout (the earlier duplicate contained a
# malformed '%(asctime)%' placeholder and was dead code).
LOG_FORMAT = '%(asctime)s -- %(name)s -- %(levelname)s -- %(message)s'

# specific logger for the module
logger = logging.getLogger(__name__)   # creates specific logger for the module
logger.setLevel(logging.DEBUG)    # entry level of messages from all handlers

# drop the handlers left over from a previous run of this cell
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter(LOG_FORMAT)

# file handler to log everything
file_handler = logging.FileHandler(LOG_FILENAME, mode='w')
file_handler.setLevel(logging.DEBUG)  # all messages (DEBUG and up) get logged in the file
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# stream handler to show messages to the console
console = logging.StreamHandler()
console.setLevel(logging.WARNING)  # Warning messages and up get displayed to the console
console.setFormatter(formatter)
logger.addHandler(console)

# start your engine
logger.info("-------- new run --------")

In [17]:
# Path of the pcap capture opened by the single-packet exploration cells.
# NOTE(review): absolute, user-specific path - the notebook will not run on
# another machine without editing it.
PCAPFILE = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/traffic_cortex_seul_TM+TC.pcap'

# Record in the log which capture this run works on.
logger.info(f'-- entry pcap file = {PCAPFILE} --')

In [18]:
# Event-loop set-up, run before any capture is opened.
# NOTE(review): `asyncio` is imported but not referenced in the visible cells;
# only nest_asyncio is actually used.
import asyncio
import nest_asyncio

# Presumably needed because PyShark drives an event loop while the notebook's
# own loop is already running - confirm against the PyShark / nest_asyncio docs.
nest_asyncio.apply()  # fix the 'RunTime Error : this event loop is already running'

In [19]:
# Open the pcap file as a PyShark capture.
# use_ek=True selects the "ek" output: newline delimited JSON format
# (see https://www.wireshark.org/docs/man-pages/tshark.html)
capture = pyshark.FileCapture(input_file=PCAPFILE, use_ek=True)

# Keep the first packet of the capture for the inspection cells that follow.
pkt = capture[0]

In [20]:
# what does a PyShark packet look like:
# dir() lists every attribute of the packet object; the layers of this packet
# show up as attributes (eth, ip, tcp in the output)

dir(pkt)

['__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_packet_string',
 'captured_length',
 'eth',
 'frame_info',
 'get_multiple_layers',
 'get_raw_packet',
 'highest_layer',
 'interface_captured',
 'ip',
 'layers',
 'length',
 'number',
 'pretty_print',
 'show',
 'sniff_time',
 'sniff_timestamp',
 'tcp',
 'transport_layer']

In [21]:
# layers in the first packet
# (list_layers is reused by the following cell, which walks through the
# fields of each layer)

list_layers = pkt.layers

print(list_layers)

[<ETH Layer>, <IP Layer>, <TCP Layer>]


In [22]:
# One level down: for every layer of the packet, print the list of its field
# names, then each field together with the value returned by layer.get().

for idx, lyr in enumerate(list_layers):
    names = lyr.field_names
    print(f'layer {idx} = {names}')
    for name in names:
        value = lyr.get(name)
        print(f"layer {idx} -- {name} = {value}")
    # blank separator between two layers
    print('\n')

layer 0 = ['addr', 'lg', 'src', 'type', 'dst', 'ig']
layer 0 -- addr = <EkMultiField addr: 00:50:56:90:ca:b0>
layer 0 -- lg = False
layer 0 -- src = <EkMultiField src: 00:50:56:90:ca:b0>
layer 0 -- type = 2048
layer 0 -- dst = <EkMultiField dst: 00:50:56:90:19:42>
layer 0 -- ig = False


layer 1 = ['len', 'proto', 'addr', 'host', 'hdr', 'checksum', 'dsfield', 'flags', 'version', 'src', 'ttl', 'frag', 'dst', 'id']
layer 1 -- len = 60
layer 1 -- proto = 6
layer 1 -- addr = ['10.149.48.102', '10.149.48.122']
layer 1 -- host = ['10.149.48.102', '10.149.48.122']
layer 1 -- hdr = <EkMultiField hdr>
layer 1 -- checksum = <EkMultiField checksum: 55077>
layer 1 -- dsfield = <EkMultiField dsfield>
layer 1 -- flags = <EkMultiField flags: 2>
layer 1 -- version = 4
layer 1 -- src = <EkMultiField src: 10.149.48.102>
layer 1 -- ttl = 64
layer 1 -- frag = <EkMultiField frag>
layer 1 -- dst = <EkMultiField dst: 10.149.48.122>
layer 1 -- id = 60812


layer 2 = ['len', 'seq', 'window', 'nxtseq', 'hdr', '

In [23]:
# Re-open the same pcap, this time with use_ek=False.
# NOTE(review): the earlier note on this cell said this "will output JSON";
# use_json is not set here, so the format is whatever PyShark defaults to when
# both flags are off - confirm against the PyShark documentation.
capture = pyshark.FileCapture(input_file=PCAPFILE, use_ek=False)

# first packet of the Capture object
pkt = capture[0]

# wrap it in the home-made PyPacket class, whose .data (dictionary) and
# .dataframe views are displayed in the next cells
paquet = cbl.PyPacket(pkt)

In [24]:
# display the dictionary PyPacket exposes for the first packet
# (per the output: one entry per layer - ETH, IP, TCP - plus a TIMESTAMP entry)

paquet.data

{'ETH': {'dst': '00:50:56:90:19:42',
  'src': '00:50:56:90:ca:b0',
  'type': '0x0800'},
 'IP': {'version': '4',
  'hdr_len': '20',
  'len': '60',
  'id': '0xed8c',
  'flags': '0x02',
  'ttl': '64',
  'proto': '6',
  'src': '10.149.48.102',
  'dst': '10.149.48.122'},
 'TCP': {'srcport': '45072',
  'dstport': '18050',
  'stream': '0',
  'len': '0',
  'seq': '0',
  'ack': '0',
  'hdr_len': '40',
  'flags': '0x0002',
  'time_relative': '0.000000000',
  'time_delta': '0.000000000',
  'payload': None},
 'TIMESTAMP': {'ts': datetime.datetime(2024, 2, 19, 10, 36, 55, 868842)}}

In [25]:
# finally, the same packet as a dataframe (PyPacket.dataframe)
# per the output: a single row whose columns are named <LAYER>_<field>

df = paquet.dataframe

df

Unnamed: 0,ETH_dst,ETH_src,ETH_type,IP_version,IP_hdr_len,IP_len,IP_id,IP_flags,IP_ttl,IP_proto,...,TCP_stream,TCP_len,TCP_seq,TCP_ack,TCP_hdr_len,TCP_flags,TCP_time_relative,TCP_time_delta,TCP_payload,TIMESTAMP_ts
0,00:50:56:90:19:42,00:50:56:90:ca:b0,0x0800,4,20,60,0xed8c,0x02,64,6,...,0,0,0,0,40,0x0002,0.0,0.0,,2024-02-19 10:36:55.868842


### Creating DataFrames from the pcap files

In [26]:
# pcap files to convert into dataframes and save
# key   = name used for the saved pickle file
# value = full path of the pcap file to read
# Only css.pcap is currently active; uncomment the other entries to process
# more captures.

file_dict = {
    'css.pcap' : '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/traffic_cortex_seul_TM+TC.pcap',
    # 'test.pcap' : '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/test.pcap',
    # 'smallFlows.pcap' : '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/smallFlows.pcap',
    # 'bigFlows.pcap' : '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/bigFlows.pcap'   # large and takes time
}

# Output directory for the pickled dataframes. The trailing '/' matters: the
# file name is appended to it by plain string concatenation.
DIRPATH = '/home/benjamin/Folders_Python/Cyber/data/dataframes/'

In [27]:
# Convert every capture listed in file_dict into one dataframe and pickle it.
#
# For each (filename, filepath) entry:
#   1. open the pcap with PyShark (use_ek=False, as for the single-packet demo);
#   2. turn each packet into a dataframe through cbl.PyPacket;
#   3. concatenate all per-packet frames ONCE and reset the index;
#   4. save the result as DIRPATH + filename + '.pkl'.
for filename, filepath in file_dict.items():
    print(f"processing {filename}")

    capture = pyshark.FileCapture(
        input_file=filepath,
        use_ek=False
        )

    # Per-packet frames are collected in a list and concatenated after the
    # loop: calling pd.concat on the growing frame at every packet re-copies
    # all previous rows each time (quadratic in the number of packets).
    packet_frames = []
    i = 0

    try:
        while True:
            # Only the read itself is guarded, so that a StopIteration can
            # only mean "end of capture" and never hides a conversion problem.
            try:
                pkt = capture.next()
            except StopIteration:
                logger.info(f"reached end of capture after reading {i} packets")
                break

            paquet = cbl.PyPacket(pkt)
            df = paquet.dataframe
            packet_frames.append(df)
            i += 1
            print (f'processing packet number {i}', end='\r')
    finally:
        # Release the capture even if a packet fails to convert.
        capture.close()

    # pd.concat refuses an empty list, so an empty capture yields an empty
    # frame, exactly as the incremental version did.
    if packet_frames:
        df_full = pd.concat(packet_frames, axis=0).reset_index(drop=True)
    else:
        df_full = pd.DataFrame()

    savename = DIRPATH + filename + '.pkl'
    df_full.to_pickle(savename)
    print('\n')
    print(f'saving {filename} as pickle')

processing css.pcap
processing packet number 1636

saving css.pcap as pickle
