## Import *.pcap par tshark, labellisation par Suricata, ML

### Set-Up

In [1]:
# imports

import collections
import json
import logging
import os
import shutil
import subprocess
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# import ipywidgets
# import warnings

# import pyshark
# import networkx as nx

# from sklearn.preprocessing import OrdinalEncoder, StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans, DBSCAN
# from sklearn.manifold import TSNE
# from sklearn.metrics import pairwise_distances, silhouette_score
# from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
# import umap

# from itertools import product

# PATH change to access library cyberlib
import sys
sys.path.append('/home/benjamin/Folders_Python/Cyber/libs')
import cyberlib as cbl

# to allow PyShark to run in Jupyter notebooks
# import nest_asyncio
# nest_asyncio.apply()

In [2]:
# logging set-up

# Path handed to cyberlib's GetLogger (presumably the log file it writes to — confirm in cyberlib)
LOG_FILE = '/home/benjamin/Folders_Python/Cyber/logs/pcap_labellisation.log'

lg = cbl.GetLogger(LOG_FILE)
logger = lg.get_custom_logger()

# mark the beginning of a new run in the log
logger.info("-------- new run --------")

### Import *.pcap by tshark, export to *.csv then DataFrame

In [3]:
# Which *.pcap to process, and where inputs / outputs live.
# NB : DATA_INPUT ends with a '/', DATA_OUTPUT does not; the other cells concatenate accordingly.
DFNAME = 'smallFlows'  # file to translate to csv (NB : no *.pcap extension)

DATA_INPUT = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/'
# derived from DATA_INPUT so that the two can never point at different directories
PCAPFILE = DATA_INPUT + DFNAME + '.pcap'

DATA_OUTPUT = '/home/benjamin/Folders_Python/Cyber/data/outputs'

In [4]:
# We use the tshark CLI to parse the *.pcap file and output a *.csv file for pandas
# doc here : https://www.wireshark.org/docs/man-pages/tshark.html

# for time exports : https://osqa-ask.wireshark.org/questions/30393/tshark-how-to-output-date-in-iso-format/
# NB : outputs times in UTC to avoid time zone mismatches
# -------> there is column 'Date' added in Wireshark preferences, with output in UTC day and time
# -------> output -t ud requested in tshark : forces otuput in UTC

# tshark :
# -r                :   reads the *.pcap
# -2, -R "tcp"      :   does 2 passes and keeps packets part of TCP conversations only
# -T fields         :   outputs a file with fields
# -E header=y       :   keeps the fields names on first row for pd.read_csv
# -E separator=,    :   for *.csv format
# -e <fields>       :   desired output fields
# -o                :   formats of the data in the fields

cli="tshark -r " + PCAPFILE + """ -2 \
    -R "tcp" \
    -T fields -E header=y -E separator=, \
    -e _ws.col.Date -t ud \
    -e frame.number \
    -e eth.src -e eth.dst \
    -e ip.src_host -e ip.dst_host \
    -e ip.len -e ip.hdr_len -e ip.ttl \
    -e tcp.srcport -e tcp.dstport -e tcp.stream -e tcp.len \
    -e tcp.seq -e tcp.ack -e tcp.hdr_len -e tcp.time_relative \
    -e tcp.time_delta \
    -e tcp.flags \
    -o 'gui.column.format:"No","%m","Date","%t","Source","%s","Destination","%d","Protocol","%p","Length","%L","Info","%i"' \
    > ~/Folders_Python/Cyber/data/input_pcaps/to_csv/output.csv"""

%time exit_code = os.system(cli)

if exit_code == 0:
    logger.info('Executed successfully *.pcap to *.csv translation with tshark')
else:
    logger.error('Error while using tshark to translate from *.pcap to *.csv')
    raise NameError('Error while using tshark to translate from *.pcap to *.csv')

CPU times: user 1.31 ms, sys: 154 µs, total: 1.46 ms
Wall time: 11 s


In [5]:
# Keep a copy of the generic tshark output under a name derived from the *.pcap name
CSV_DIR = DATA_INPUT + 'to_csv/'
src = CSV_DIR + 'output.csv'
dst = CSV_DIR + DFNAME + '.csv'
shutil.copyfile(src, dst)

'/home/benjamin/Folders_Python/Cyber/data/input_pcaps/to_csv/smallFlows.csv'

In [6]:
# Load the tshark export into a DataFrame.
# The path is built from DATA_INPUT and DFNAME (instead of repeating the directory literal)
# so that it always matches the file produced by the previous cells.
filename = DATA_INPUT + 'to_csv/' + DFNAME + '.csv'

with open(file=filename, encoding='utf-8') as f:
    df_raw = pd.read_csv(
        f,
        header=0,               # first row holds the column names, exported by tshark -E header=y
        on_bad_lines='warn'     # a line with the wrong number of fields is skipped, with a warning
        )

In [7]:
# Convert tshark's textual date column into a timezone-aware UTC 'DateTime' column.
# The raw column is dropped in place once converted, so the conversion is guarded:
# re-running this cell on an already converted df_raw is a no-op instead of a KeyError.
RAW_DATE_COLUMN = '_ws.col.Date'

if RAW_DATE_COLUMN in df_raw.columns:
    df_raw['DateTime'] = pd.to_datetime(df_raw[RAW_DATE_COLUMN], utc=True)
    df_raw.drop(columns=[RAW_DATE_COLUMN], inplace=True)
elif 'DateTime' not in df_raw.columns:
    # neither the raw nor the converted column is there: the export is not usable
    raise KeyError(f"neither '{RAW_DATE_COLUMN}' nor 'DateTime' found in df_raw")

df_raw

Unnamed: 0,frame.number,eth.src,eth.dst,ip.src_host,ip.dst_host,ip.len,ip.hdr_len,ip.ttl,tcp.srcport,tcp.dstport,tcp.stream,tcp.len,tcp.seq,tcp.ack,tcp.hdr_len,tcp.time_relative,tcp.time_delta,tcp.flags,DateTime
0,1,40:61:86:9a:f1:f5,00:1a:8c:15:f9:80,192.168.3.131,72.14.213.138,983,20,128,57011,80,0,943,1,1,20,0.000000,0.000000,0x0018,2011-01-25 18:52:22.484409+00:00
1,2,00:1a:8c:15:f9:80,40:61:86:9a:f1:f5,72.14.213.138,192.168.3.131,426,20,52,80,57011,0,386,1,944,20,0.029841,0.029841,0x0018,2011-01-25 18:52:22.514250+00:00
2,3,40:61:86:9a:f1:f5,00:1a:8c:15:f9:80,192.168.3.131,72.14.213.102,52,20,128,55950,80,1,0,0,0,32,0.000000,0.000000,0x0002,2011-01-25 18:52:22.708292+00:00
3,4,40:61:86:9a:f1:f5,00:1a:8c:15:f9:80,192.168.3.131,72.14.213.138,40,20,128,57011,80,0,0,944,387,20,0.229423,0.199582,0x0010,2011-01-25 18:52:22.713832+00:00
4,5,00:1a:8c:15:f9:80,40:61:86:9a:f1:f5,72.14.213.102,192.168.3.131,52,20,52,80,55950,1,0,0,1,32,0.018766,0.018766,0x0012,2011-01-25 18:52:22.727058+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13703,13704,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,65.55.15.244,40,20,128,2537,5480,407,0,5039,5738,20,71.195375,66.560501,0x0014,2011-01-25 18:57:20.768701+00:00
13704,13705,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,207.46.105.186,40,20,128,2540,5480,409,0,398,93,20,70.606228,5.540471,0x0014,2011-01-25 18:57:20.768769+00:00
13705,13706,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,96.17.8.49,40,20,128,2547,5480,419,0,496,8189,20,64.405045,64.259982,0x0014,2011-01-25 18:57:20.768861+00:00
13706,13707,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,91.103.140.2,40,20,128,2546,5480,417,0,525,270,20,64.884164,64.357688,0x0014,2011-01-25 18:57:20.768911+00:00


### Labellisation by Suricata

In [None]:
# Suricata would append to an existing eve.json, so remove any log left by a previous run
SURICATA_EVE_LOG = f'{DATA_OUTPUT}/eve.json'

stale_log_present = os.path.isfile(SURICATA_EVE_LOG)
if stale_log_present:
    os.remove(SURICATA_EVE_LOG)

In [None]:
# run Suricata and generate the log file in the EVE.JSON output file
cli="suricata -r " + PCAPFILE + " tcp " + " -l " + DATA_OUTPUT + " -k none"  # discard invalid checksum alert

%time exit_code = os.system(cli)

if exit_code == 0:
    logger.info('Executed successfully *.pcap to EVE.json translation with suricata')
else:
    logger.error('Error while using suricata to analyse from *.pcap to EVE.json')
    raise NameError('Error while using suricata to analyse from *.pcap to EVE.json')

In [None]:
# eve.json holds one JSON document per line. Each line is decoded on its own, then
# pd.json_normalize flattens the nested objects into columns named with dot notation
# (e.g. 'alert.category'); max_level=1 limits the flattening to one level of nesting.

with open(SURICATA_EVE_LOG) as eve_file:
    events = [json.loads(line) for line in eve_file]

df_log = pd.json_normalize(events, max_level=1)

df_log

In [None]:
# list the distinct event types present in the Suricata log
df_log['event_type'].unique()

In [None]:
# Count the events of each type once, then report the anomalies and the alerts
event_counts = df_log['event_type'].value_counts()

n_anomalies = int(event_counts.get('anomaly', 0))
n_alerts = int(event_counts.get('alert', 0))

print(f"{n_anomalies} anomalie(s)")
print(f"{n_alerts} alert(e)")

logger.info(f'Found {n_alerts} alert(s) in *.json')
logger.info(f'Found {n_anomalies} anomalie(s) in *.json')

# Without any alert there is nothing to label: stop the notebook here.
# NOTE(review): NameError is an odd exception type for this condition — confirm before changing it
if n_alerts == 0:
    msg = 'no alert found in suricata file'
    logger.error(msg)
    raise NameError(msg)

In [None]:
# Extract the alert events out of the whole log into their own DataFrame.
# The explicit copy makes df_interest independent from df_log, so the column added below
# is not written into a slice of df_log (which triggers pandas' SettingWithCopyWarning).
df_interest = df_log[df_log['event_type'] == 'alert'].copy()

# post-process : convert timestamp to UTC-datetime, replace the textual timestamp by it,
# and index the alerts by that datetime (the DateTime column itself is kept)
df_interest['DateTime'] = pd.to_datetime(df_interest['timestamp'], utc=True)
df_interest = (
    df_interest
    .drop(columns=['timestamp'])
    .set_index(keys='DateTime', drop=False)
)

df_interest

In [None]:
# columns available on the alert events
df_interest.columns

In [None]:
# distinct values of the 'alert.category' field among the alert events
df_interest['alert.category'].unique()

In [None]:
# idea:
# 1. iterate over the DateTime values of the subset of anomalies detected by Suricata
# 2. check whether the tshark extraction contains a packet with that exact timestamp
# 3. if so: flag the packet with y=1 in the tshark df (and add the explanatory fields)
# 4. if not: log an orphan anomaly

In [None]:
# Working frame: an independent copy of the tshark extraction, indexed by packet
# timestamp (the DateTime column is kept as well), with the label column y
# (number of alerts per *.pcap packet) initialised to 0.
df = (
    df_raw
    .copy()
    .set_index(keys='DateTime', drop=False)
    .assign(y=0)
)

df

In [None]:
# Number of alerts raised at each distinct timestamp (several alerts may share one).
alerts_per_timestamp = df_interest.index.value_counts()
ctr_alerts = int(alerts_per_timestamp.sum())

# An alert is an orphan when no tshark packet carries exactly its timestamp.
# Looking such a timestamp up with df.loc raises KeyError, so it is logged instead.
is_orphan = ~alerts_per_timestamp.index.isin(df.index)
for orphan_datetime, n_orphans in alerts_per_timestamp[is_orphan].items():
    logger.warning(f'{n_orphans} orphan alert(s) : no packet with timestamp {orphan_datetime}')

# Every packet receives the number of alerts sharing its timestamp, 0 when there is none.
# The column is assigned rather than incremented, so re-running the cell does not double the labels.
df['y'] = alerts_per_timestamp.reindex(df.index, fill_value=0).to_numpy()

print(f'compté {ctr_alerts} alerte(s)')
print(f"assigned {df['y'].sum()} alerte(s)")

df['y'].unique()  # there can be several alerts per packet

In [None]:
# final export to *.csv

DF_OUTPUT_DIR = '/home/benjamin/Folders_Python/Cyber/data/dataframes/'
DF_FILENAME = DF_OUTPUT_DIR + 'df_' + DFNAME + '.csv'

# The path is given to pandas directly, which opens the file itself with the newline
# handling it expects; the encoding is stated explicitly instead of depending on the locale.
# NOTE(review): the index and a regular column are both named 'DateTime', so the file
# contains that column twice — confirm this is what the consumers of the *.csv expect.
df.to_csv(DF_FILENAME, encoding='utf-8')