# Basic manipulations with PyShark

### Here we use PyShark to parse the packets of pcap files, build DataFrames from them and store the DataFrames in pickle format

In [1]:
# PATH change to access library

import sys
sys.path.append('/home/benjamin/Folders_Python/Cyber/libs')

# Imports

import numpy as np
import pandas as pd
import pyshark
import matplotlib.pyplot as plt
import json # original json library
import logging

# Home made library

import cyberlib as cbl

In [2]:
# Logging set-up for debugging.
#
# A logger dedicated to this module writes every message (DEBUG and up) to a
# log file and echoes WARNING and up to the console.

LOG_FILENAME = '/home/benjamin/Folders_Python/Cyber/logs/logfile.log'
# record layout shared by the file handler and the console handler
LOG_FORMAT = '%(asctime)s -- %(name)s -- %(levelname)s -- %(message)s'
# LOG_LEVEL = logging.INFO

# specific logger for the module
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)    # entry level of messages from all handlers
formatter = logging.Formatter(LOG_FORMAT)

# logging.getLogger() hands back the same logger object on every call, so
# running this cell a second time would attach a second pair of handlers and
# duplicate every message. Detach and close whatever is already attached first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# file handler to log everything (mode='w' starts a fresh file on each set-up)
file_handler = logging.FileHandler(LOG_FILENAME, mode='w')
file_handler.setLevel(logging.DEBUG)  # all messages (DEBUG and up) get logged in the file
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# stream handler to show messages to the console
console = logging.StreamHandler()
console.setLevel(logging.WARNING)  # Warning messages and up get displayed to the console
console.setFormatter(formatter)
logger.addHandler(console)

# start your engine
logger.info("-------- new run --------")

In [3]:
# Capture file used by the exploratory cells of this notebook
PCAPFILE = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/smallFlows.pcap'

# trace the chosen input in the log
logger.info(
    f'-- entry pcap file = {PCAPFILE} --'
)

In [4]:
# Open the pcap file as a PyShark capture.
# use_ek=True selects tshark's "ek" output, i.e. newline-delimited JSON; see
# https://www.wireshark.org/docs/man-pages/tshark.html

capture = pyshark.FileCapture(
    input_file=PCAPFILE,
    use_ek=True
)

pkt = capture[0]  # first packet of the capture, inspected in the following cells
# NOTE(review): this capture is never closed explicitly before `capture` is
# rebound further down — consider capture.close() once pkt has been read.

In [5]:
# What does a PyShark packet look like? List every attribute it exposes
# (layer accessors, length / timestamp metadata, helper methods).

dir(pkt)

['__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_packet_string',
 'captured_length',
 'eth',
 'frame_info',
 'get_multiple_layers',
 'get_raw_packet',
 'highest_layer',
 'http',
 'interface_captured',
 'ip',
 'layers',
 'length',
 'number',
 'pretty_print',
 'show',
 'sniff_time',
 'sniff_timestamp',
 'tcp',
 'transport_layer']

In [6]:
# Layers found in the first packet; list_layers is reused by the field dump in the next cell

list_layers = pkt.layers

print(list_layers)

[<ETH Layer>, <IP Layer>, <TCP Layer>, <HTTP Layer>]


In [7]:
# One level below: for each layer, show its field names, then every field with its value

for layer_idx, current_layer in enumerate(list_layers):
    names = current_layer.field_names
    print(f'layer {layer_idx} = {names}')
    for name in names:
        value = current_layer.get(name)
        print(f"layer {layer_idx} -- {name} = {value}")
    # newline character + print's own line ending leave a blank gap between layers
    print('\n')

layer 0 = ['ig', 'lg', 'addr', 'src', 'dst', 'type']
layer 0 -- ig = False
layer 0 -- lg = False
layer 0 -- addr = <EkMultiField addr: 40:61:86:9a:f1:f5>
layer 0 -- src = <EkMultiField src: 40:61:86:9a:f1:f5>
layer 0 -- dst = <EkMultiField dst: 00:1a:8c:15:f9:80>
layer 0 -- type = 2048


layer 1 = ['flags', 'checksum', 'dsfield', 'host', 'len', 'addr', 'proto', 'id', 'hdr', 'version', 'src', 'dst', 'frag', 'ttl']
layer 1 -- flags = <EkMultiField flags: 64>
layer 1 -- checksum = <EkMultiField checksum: 40572>
layer 1 -- dsfield = <EkMultiField dsfield>
layer 1 -- host = ['192.168.3.131', '72.14.213.138']
layer 1 -- len = 983
layer 1 -- addr = ['192.168.3.131', '72.14.213.138']
layer 1 -- proto = 6
layer 1 -- id = 30432
layer 1 -- hdr = <EkMultiField hdr>
layer 1 -- version = 4
layer 1 -- src = <EkMultiField src: 192.168.3.131>
layer 1 -- dst = <EkMultiField dst: 72.14.213.138>
layer 1 -- frag = <EkMultiField frag>
layer 1 -- ttl = 128


layer 2 = ['seq', 'window', 'urgent', 'payload', '

In [8]:
# Re-open the same file with use_ek=False, so tshark's "ek" output is no longer requested.
# NOTE(review): it is unclear that this yields JSON — with use_json left at its
# default, pyshark presumably parses tshark's PDML/XML output instead; confirm.

capture = pyshark.FileCapture(
    input_file=PCAPFILE,
    use_ek=False
)

pkt = capture[0]  # get first packet out of the Capture object

paquet = cbl.PyPacket(pkt) # wrap it in the home-made PyPacket class, which exposes the packet as a dictionary

In [9]:
# Display the nested dictionary (layer name -> field name -> value) holding the fields of the first packet

paquet.data

{'ETH': {'dst': '00:1a:8c:15:f9:80',
  'src': '40:61:86:9a:f1:f5',
  'type': '0x00000800'},
 'IP': {'version': '4',
  'hdr_len': '20',
  'len': '983',
  'id': '0x000076e0',
  'flags': '0x00000040',
  'ttl': '128',
  'proto': '6',
  'src': '192.168.3.131',
  'dst': '72.14.213.138'},
 'TCP': {'srcport': '57011',
  'dstport': '80',
  'stream': '0',
  'len': '943',
  'seq': '1',
  'ack': '1',
  'hdr_len': '20',
  'flags': '0x00000018',
  'time_relative': '0.000000000',
  'time_delta': '0.000000000',
  'payload': '47:45:54:20:2f:63:6f:6d:70:6c:65:74:65:2f:73:65:61:72:63:68:3f:63:6c:69:65:6e:74:3d:63:68:72:6f:6d:65:26:68:6c:3d:65:6e:2d:55:53:26:71:3d:63:72:20:48:54:54:50:2f:31:2e:31:0d:0a:48:6f:73:74:3a:20:63:6c:69:65:6e:74:73:31:2e:67:6f:6f:67:6c:65:2e:63:61:0d:0a:43:6f:6e:6e:65:63:74:69:6f:6e:3a:20:6b:65:65:70:2d:61:6c:69:76:65:0d:0a:55:73:65:72:2d:41:67:65:6e:74:3a:20:4d:6f:7a:69:6c:6c:61:2f:35:2e:30:20:28:57:69:6e:64:6f:77:73:3b:20:55:3b:20:57:69:6e:64:6f:77:73:20:4e:54:20:36:2e:31:3b:20

In [10]:
# Finally, the same packet as a one-row dataframe (columns are named <LAYER>_<field>)

df = paquet.dataframe

df

Unnamed: 0,ETH_dst,ETH_src,ETH_type,IP_version,IP_hdr_len,IP_len,IP_id,IP_flags,IP_ttl,IP_proto,...,TCP_stream,TCP_len,TCP_seq,TCP_ack,TCP_hdr_len,TCP_flags,TCP_time_relative,TCP_time_delta,TCP_payload,TIMESTAMP_ts
0,00:1a:8c:15:f9:80,40:61:86:9a:f1:f5,0x00000800,4,20,983,0x000076e0,0x00000040,128,6,...,0,943,1,1,20,0x00000018,0.0,0.0,47:45:54:20:2f:63:6f:6d:70:6c:65:74:65:2f:73:6...,2011-01-25 19:52:22.484409


### Creating DataFrames from the pcap files

In [11]:
# Where the pickled dataframes are written
DIRPATH = '/home/benjamin/Folders_Python/Cyber/data/dataframes/'

# Captures to convert, as (display name, path on disk) pairs.
# bigFlows.pcap is left out because it is large and takes time to process.
file_dict = dict([
    ('test.pcap', '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/test.pcap'),
    ('smallFlows.pcap', '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/smallFlows.pcap'),
    # ('bigFlows.pcap', '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/bigFlows.pcap'),
])

In [12]:
# Convert every capture listed in file_dict into a single dataframe
# (one row per packet) and save it as a pickle under DIRPATH.
for filename, filepath in file_dict.items():
    print(f"processing {filename}")

    capture = pyshark.FileCapture(
        input_file=filepath,
        use_ek=False
        )

    # Per-packet frames are collected in a list and concatenated once at the
    # end: calling pd.concat inside the loop re-copies every accumulated row
    # for each new packet, which is quadratic in the number of packets.
    packet_frames = []
    i = 0

    try:
        while True:
            # only the read itself is expected to signal the end of the capture
            try:
                pkt = capture.next()
            except StopIteration:
                logger.info(f"reached end of capture after reading {i} packets")
                break
            paquet = cbl.PyPacket(pkt)
            df = paquet.dataframe
            packet_frames.append(df)
            i += 1
            print(f'processing packet number {i}', end='\r')
    finally:
        # release the capture (and the tshark process behind it) even when a
        # packet fails to convert
        capture.close()

    # pd.concat raises ValueError on an empty list, so a capture without any
    # packet falls back to an empty dataframe
    df_full = pd.concat(packet_frames, axis=0) if packet_frames else pd.DataFrame()
    df_full = df_full.reset_index(drop=True)

    savename = DIRPATH + filename + '.pkl'
    df_full.to_pickle(savename)
    print('\n')
    print(f'saving {filename} as pickle')

processing test.pcap
processing packet number 141

saving test.pcap as pickle
processing smallFlows.pcap
processing packet number 14261

saving smallFlows.pcap as pickle
