## Import *.pcap par tshark, labellisaton par Suricata, ML

### Set-Up

In [46]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
import collections
import seaborn as sns
from pprint import pprint
import os
import json
import shutil

# import ipywidgets
# import warnings

# import pyshark
# import networkx as nx

# from sklearn.preprocessing import OrdinalEncoder, StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans, DBSCAN
# from sklearn.manifold import TSNE
# from sklearn.metrics import pairwise_distances, silhouette_score
# from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
# import umap

# from itertools import product

# PATH change to access library cyberlib
import sys
sys.path.append('/home/benjamin/Folders_Python/Cyber/libs')
import cyberlib as cbl

# to allow PyShark to run in Jupyter notebooks
# import nest_asyncio
# nest_asyncio.apply()

In [47]:
# logging set-up

lg = cbl.GetLogger('/home/benjamin/Folders_Python/Cyber/logs/pcap_labellisation.log')
logger = lg.get_custom_logger()

# start your engine
logger.info("-------- new run --------")

### Import *pcap by tshark, export to *.csv then DataFrame

In [48]:
# which *.pcap
DFNAME = 'home'  # file to translate to csv

DATA_INPUT = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/'
PCAPFILE = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/' + DFNAME + '.pcap'

DATA_OUTPUT = '/home/benjamin/Folders_Python/Cyber/data/outputs'

In [49]:
# We use the tshark CLI to parse the *.pcap file and output a *.csv file for pandas
# doc here : https://www.wireshark.org/docs/man-pages/tshark.html

# for time exports : https://osqa-ask.wireshark.org/questions/30393/tshark-how-to-output-date-in-iso-format/
# NB : outputs times in UTC to avoid time zone mismatches
# -------> there is column 'Date' added in Wireshark preferences, with output in UTC day and time
# -------> output -t ud requested in tshark : forces otuput in UTC

# tshark :
# -r                :   reads the *.pcap
# -2, -R "tcp"      :   does 2 passes and keeps packets part of TCP conversations only
# -T fields         :   outputs a file with fields
# -E header=y       :   keeps the fields names on first row for pd.read_csv
# -E separator=,    :   for *.csv format
# -e <fields>       :   desired output fields
# -o                :   formats of the data in the fields

cli="tshark -r " + PCAPFILE + """ -2 \
    -R "tcp" \
    -T fields -E header=y -E separator=, \
    -e _ws.col.Date -t ud \
    -e frame.number \
    -e eth.src -e eth.dst \
    -e ip.src_host -e ip.dst_host \
    -e ip.len -e ip.hdr_len -e ip.ttl \
    -e tcp.srcport -e tcp.dstport -e tcp.stream -e tcp.len \
    -e tcp.seq -e tcp.ack -e tcp.hdr_len -e tcp.time_relative \
    -e tcp.time_delta \
    -e tcp.flags \
    -o 'gui.column.format:"No","%m","Date","%t","Source","%s","Destination","%d","Protocol","%p","Length","%L","Info","%i"' \
    > ~/Folders_Python/Cyber/data/input_pcaps/to_csv/output.csv"""

%time exit_code = os.system(cli)

if exit_code == 0:
    logger.info('Executed successfully *.pcap to *.csv translation with tshark')
else:
    logger.error('Error while using tshark to translate from *.pcap to *.csv')
    raise NameError('Error while using tshark to translate from *.pcap to *.csv')

CPU times: user 0 ns, sys: 1.07 ms, total: 1.07 ms
Wall time: 612 ms


In [50]:
src = DATA_INPUT + 'to_csv/output.csv'
dst = DATA_INPUT + 'to_csv/' + DFNAME + '.csv'
shutil.copyfile(src, dst)

'/home/benjamin/Folders_Python/Cyber/data/input_pcaps/to_csv/home.csv'

In [51]:
filename = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/to_csv/' + DFNAME + '.csv'

with open(file=filename, encoding='utf-8') as f:
    df_raw = pd.read_csv(
        f,
        header=0,               # using first row as columns names. they are exported by tshark -E header=y
        on_bad_lines='warn'     # if a line does not have the right length, skip it but warn me
        )

In [52]:
df_raw['DateTime'] = pd.to_datetime(df_raw['_ws.col.Date'], utc=True)

df_raw.drop(columns=['_ws.col.Date'], inplace=True)

df_raw

Unnamed: 0,frame.number,eth.src,eth.dst,ip.src_host,ip.dst_host,ip.len,ip.hdr_len,ip.ttl,tcp.srcport,tcp.dstport,tcp.stream,tcp.len,tcp.seq,tcp.ack,tcp.hdr_len,tcp.time_relative,tcp.time_delta,tcp.flags,DateTime
0,1,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36100,443,0,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765744+00:00
1,2,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36104,443,1,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765752+00:00
2,3,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36106,443,2,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765754+00:00
3,4,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36120,443,3,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765756+00:00
4,5,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,36120,3,0,1,2,32,0.011373,0.011373,0x0010,2023-06-17 08:46:05.777129+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,859,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,54494,7,0,1,34,32,64.846553,0.001016,0x0010,2023-06-17 08:47:12.148304+00:00
859,860,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,54494,7,31,1,34,32,64.847039,0.000486,0x0018,2023-06-17 08:47:12.148790+00:00
860,861,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,54494,443,7,0,34,0,20,64.847066,0.000027,0x0004,2023-06-17 08:47:12.148817+00:00
861,862,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,54494,7,0,32,34,32,64.847554,0.000488,0x0011,2023-06-17 08:47:12.149305+00:00


### Labellisation by Suricata

In [53]:
SURICATA_EVE_LOG = DATA_OUTPUT + '/eve.json'

# delete existing eve.json file if it exists, suricata would append data otherwise
if os.path.isfile(SURICATA_EVE_LOG):
    os.remove(SURICATA_EVE_LOG)

In [54]:
# run Suricata and generate the log file in the EVE.JSON output file
cli="suricata -r " + PCAPFILE + " tcp " + " -l " + DATA_OUTPUT # + " -k none"

%time exit_code = os.system(cli)

if exit_code == 0:
    logger.info('Executed successfully *.pcap to EVE.json translation with suricata')
else:
    logger.error('Error while using suricata to analyse from *.pcap to EVE.json')
    raise NameError('Error while using suricata to analyse from *.pcap to EVE.json')

22/8/2023 -- 06:52:10 - <Notice> - This is Suricata version 6.0.10 RELEASE running in USER mode
22/8/2023 -- 06:52:10 - <Notice> - all 5 packet processing threads, 4 management threads initialized, engine started.
22/8/2023 -- 06:52:11 - <Notice> - Signal Received.  Stopping engine.
22/8/2023 -- 06:52:11 - <Notice> - Pcap-file module read 1 files, 863 packets, 1243626 bytes
CPU times: user 0 ns, sys: 5.07 ms, total: 5.07 ms
Wall time: 240 ms


In [55]:
# Pandas provides a useful method – json_normalize – for normalizing nested JSON fields into dataframe. Resulting columns use dot notation to signify nested objects, similar to how Elasticsearch does it

with open (SURICATA_EVE_LOG) as packets:
    df_log = pd.json_normalize(
        [json.loads(packet) for packet in packets],
        max_level=1
    )
    
df_log

Unnamed: 0,timestamp,flow_id,pcap_cnt,event_type,src_ip,src_port,dest_ip,dest_port,proto,anomaly.type,...,stats.decoder,stats.flow,stats.defrag,stats.flow_bypassed,stats.tcp,stats.detect,stats.app_layer,stats.http,stats.ftp,stats.file_store
0,2023-06-17T10:46:11.477617+0200,1850317000000000.0,62.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,...,,,,,,,,,,
1,2023-06-17T10:46:11.590895+0200,1850317000000000.0,64.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,...,,,,,,,,,,
2,2023-06-17T10:46:11.591054+0200,1850317000000000.0,65.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,...,,,,,,,,,,
3,2023-06-17T10:46:11.591879+0200,1850317000000000.0,66.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,...,,,,,,,,,,
4,2023-06-17T10:46:32.162357+0200,213071000000000.0,303.0,anomaly,34.255.70.162,443.0,192.168.1.10,53536.0,TCP,stream,...,,,,,,,,,,
5,2023-06-17T10:46:44.150465+0200,777668000000000.0,424.0,anomaly,2a00:1450:4007:081a:0000:0000:0000:2003,80.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,47864.0,TCP,stream,...,,,,,,,,,,
6,2023-06-17T10:46:32.146544+0200,16339980000000.0,299.0,anomaly,104.18.26.218,443.0,192.168.1.10,57302.0,TCP,stream,...,,,,,,,,,,
7,2023-06-17T10:46:50.153673+0200,2021254000000000.0,550.0,anomaly,2a04:4e42:006a:0000:0000:0000:0000:0760,443.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,36104.0,TCP,stream,...,,,,,,,,,,
8,2023-06-17T10:46:50.155799+0200,1338930000000000.0,555.0,anomaly,2a04:4e42:006a:0000:0000:0000:0000:0760,443.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,36120.0,TCP,stream,...,,,,,,,,,,
9,2023-06-17T10:46:50.156479+0200,263416600000000.0,557.0,anomaly,2a00:1450:4007:081a:0000:0000:0000:2003,80.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,47854.0,TCP,stream,...,,,,,,,,,,


In [56]:
# extract anomalies out of the whole log, put it in a specific dataframe
df_anomaly = df_log[df_log['event_type']=='anomaly']

# post-process : convert timestamp to UTC-datetime, set index
df_anomaly['DateTime'] = pd.to_datetime(df_anomaly['timestamp'],utc=True)
df_anomaly.drop(columns=['timestamp'], inplace=True)
df_anomaly.set_index(keys='DateTime', drop=False, inplace=True)

df_anomaly

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anomaly['DateTime'] = pd.to_datetime(df_anomaly['timestamp'],utc=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anomaly.drop(columns=['timestamp'], inplace=True)


Unnamed: 0_level_0,flow_id,pcap_cnt,event_type,src_ip,src_port,dest_ip,dest_port,proto,anomaly.type,anomaly.event,...,stats.flow,stats.defrag,stats.flow_bypassed,stats.tcp,stats.detect,stats.app_layer,stats.http,stats.ftp,stats.file_store,DateTime
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-06-17 08:46:11.477617+00:00,1850317000000000.0,62.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2023-06-17 08:46:11.477617+00:00
2023-06-17 08:46:11.590895+00:00,1850317000000000.0,64.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,stream.rst_but_no_session,...,,,,,,,,,,2023-06-17 08:46:11.590895+00:00
2023-06-17 08:46:11.591054+00:00,1850317000000000.0,65.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,stream.rst_but_no_session,...,,,,,,,,,,2023-06-17 08:46:11.591054+00:00
2023-06-17 08:46:11.591879+00:00,1850317000000000.0,66.0,anomaly,35.188.42.15,443.0,192.168.1.10,58384.0,TCP,stream,stream.rst_but_no_session,...,,,,,,,,,,2023-06-17 08:46:11.591879+00:00
2023-06-17 08:46:32.162357+00:00,213071000000000.0,303.0,anomaly,34.255.70.162,443.0,192.168.1.10,53536.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2023-06-17 08:46:32.162357+00:00
2023-06-17 08:46:44.150465+00:00,777668000000000.0,424.0,anomaly,2a00:1450:4007:081a:0000:0000:0000:2003,80.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,47864.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2023-06-17 08:46:44.150465+00:00
2023-06-17 08:46:32.146544+00:00,16339980000000.0,299.0,anomaly,104.18.26.218,443.0,192.168.1.10,57302.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2023-06-17 08:46:32.146544+00:00
2023-06-17 08:46:50.153673+00:00,2021254000000000.0,550.0,anomaly,2a04:4e42:006a:0000:0000:0000:0000:0760,443.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,36104.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2023-06-17 08:46:50.153673+00:00
2023-06-17 08:46:50.155799+00:00,1338930000000000.0,555.0,anomaly,2a04:4e42:006a:0000:0000:0000:0000:0760,443.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,36120.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2023-06-17 08:46:50.155799+00:00
2023-06-17 08:46:50.156479+00:00,263416600000000.0,557.0,anomaly,2a00:1450:4007:081a:0000:0000:0000:2003,80.0,2a01:cb19:872e:3000:0e4f:3187:540c:d66c,47854.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2023-06-17 08:46:50.156479+00:00


In [57]:
# idée : 
# 1. parcourir les DateTime du sous-ensemble des anomalies détectées par Suricata
# 2. regarder s'il y a un paquet avec ce timestamp exact dans l'extraction tshark
# 3. si oui : flagger y=1 le paquet dans la df tshark (et rajouter les champs d'explication)
# 4. si non : logger une anomalie orpheline

In [58]:
df = df_raw.copy()
df = df.set_index(keys='DateTime', drop=False)
df['y'] = 0  # set number of anomalies per *.pcap packet

df

Unnamed: 0_level_0,frame.number,eth.src,eth.dst,ip.src_host,ip.dst_host,ip.len,ip.hdr_len,ip.ttl,tcp.srcport,tcp.dstport,tcp.stream,tcp.len,tcp.seq,tcp.ack,tcp.hdr_len,tcp.time_relative,tcp.time_delta,tcp.flags,DateTime,y
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2023-06-17 08:46:05.765744+00:00,1,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36100,443,0,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765744+00:00,0
2023-06-17 08:46:05.765752+00:00,2,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36104,443,1,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765752+00:00,0
2023-06-17 08:46:05.765754+00:00,3,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36106,443,2,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765754+00:00,0
2023-06-17 08:46:05.765756+00:00,4,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,36120,443,3,0,1,1,32,0.000000,0.000000,0x0010,2023-06-17 08:46:05.765756+00:00,0
2023-06-17 08:46:05.777129+00:00,5,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,36120,3,0,1,2,32,0.011373,0.011373,0x0010,2023-06-17 08:46:05.777129+00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-17 08:47:12.148304+00:00,859,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,54494,7,0,1,34,32,64.846553,0.001016,0x0010,2023-06-17 08:47:12.148304+00:00,0
2023-06-17 08:47:12.148790+00:00,860,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,54494,7,31,1,34,32,64.847039,0.000486,0x0018,2023-06-17 08:47:12.148790+00:00,0
2023-06-17 08:47:12.148817+00:00,861,a4:5d:36:5a:fe:7c,5c:fa:25:41:fc:90,,,,,,54494,443,7,0,34,0,20,64.847066,0.000027,0x0004,2023-06-17 08:47:12.148817+00:00,0
2023-06-17 08:47:12.149305+00:00,862,5c:fa:25:41:fc:90,a4:5d:36:5a:fe:7c,,,,,,443,54494,7,0,32,34,32,64.847554,0.000488,0x0011,2023-06-17 08:47:12.149305+00:00,0


In [59]:
ctr_anomalies = 0

for anomaly_datetime in df_anomaly.index:
    df.loc[anomaly_datetime, 'y'] = df.loc[anomaly_datetime, 'y'] + 1
    ctr_anomalies += 1
    
print(f'compté {ctr_anomalies} anomalies')
print(f"assigned {df['y'].sum()} anomalies")

df['y'].unique()  # Il peut y avoir plusieurs anomalies par paquet

compté 24 anomalies
assigned 24 anomalies


array([0, 1])

In [60]:
# export final en *.csv

DF_OUTPUT_DIR = '/home/benjamin/Folders_Python/Cyber/data/dataframes/'
DF_FILENAME = DF_OUTPUT_DIR + 'df_' + DFNAME + '.csv'

with open(DF_FILENAME, 'w') as f:
    df.to_csv(f)