## Import *.pcap par tshark, labellisaton par Suricata, ML

### Set-Up

In [61]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
import collections
import seaborn as sns
from pprint import pprint
import os
import json
import shutil

# import ipywidgets
# import warnings

# import pyshark
# import networkx as nx

# from sklearn.preprocessing import OrdinalEncoder, StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans, DBSCAN
# from sklearn.manifold import TSNE
# from sklearn.metrics import pairwise_distances, silhouette_score
# from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
# import umap

# from itertools import product

# PATH change to access library cyberlib
import sys
sys.path.append('/home/benjamin/Folders_Python/Cyber/libs')
import cyberlib as cbl

# to allow PyShark to run in Jupyter notebooks
# import nest_asyncio
# nest_asyncio.apply()

In [62]:
# logging set-up

lg = cbl.GetLogger('/home/benjamin/Folders_Python/Cyber/logs/pcap_labellisation.log')
logger = lg.get_custom_logger()

# start your engine
logger.info("-------- new run --------")

### Import *pcap by tshark, export to *.csv then DataFrame

In [63]:
# which *.pcap
DFNAME = 'bigFlows'  # file to translate to csv

DATA_INPUT = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/'
PCAPFILE = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/' + DFNAME + '.pcap'

DATA_OUTPUT = '/home/benjamin/Folders_Python/Cyber/data/outputs'

In [64]:
# We use the tshark CLI to parse the *.pcap file and output a *.csv file for pandas
# doc here : https://www.wireshark.org/docs/man-pages/tshark.html

# for time exports : https://osqa-ask.wireshark.org/questions/30393/tshark-how-to-output-date-in-iso-format/
# NB : outputs times in UTC to avoid time zone mismatches
# -------> there is column 'Date' added in Wireshark preferences, with output in UTC day and time
# -------> output -t ud requested in tshark : forces otuput in UTC

# tshark :
# -r                :   reads the *.pcap
# -2, -R "tcp"      :   does 2 passes and keeps packets part of TCP conversations only
# -T fields         :   outputs a file with fields
# -E header=y       :   keeps the fields names on first row for pd.read_csv
# -E separator=,    :   for *.csv format
# -e <fields>       :   desired output fields
# -o                :   formats of the data in the fields

cli="tshark -r " + PCAPFILE + """ -2 \
    -R "tcp" \
    -T fields -E header=y -E separator=, \
    -e _ws.col.Date -t ud \
    -e frame.number \
    -e eth.src -e eth.dst \
    -e ip.src_host -e ip.dst_host \
    -e ip.len -e ip.hdr_len -e ip.ttl \
    -e tcp.srcport -e tcp.dstport -e tcp.stream -e tcp.len \
    -e tcp.seq -e tcp.ack -e tcp.hdr_len -e tcp.time_relative \
    -e tcp.time_delta \
    -e tcp.flags \
    -o 'gui.column.format:"No","%m","Date","%t","Source","%s","Destination","%d","Protocol","%p","Length","%L","Info","%i"' \
    > ~/Folders_Python/Cyber/data/input_pcaps/to_csv/output.csv"""

%time exit_code = os.system(cli)

if exit_code == 0:
    logger.info('Executed successfully *.pcap to *.csv translation with tshark')
else:
    logger.error('Error while using tshark to translate from *.pcap to *.csv')
    raise NameError('Error while using tshark to translate from *.pcap to *.csv')

CPU times: user 2.63 ms, sys: 0 ns, total: 2.63 ms
Wall time: 1min 28s


In [65]:
src = DATA_INPUT + 'to_csv/output.csv'
dst = DATA_INPUT + 'to_csv/' + DFNAME + '.csv'
shutil.copyfile(src, dst)

'/home/benjamin/Folders_Python/Cyber/data/input_pcaps/to_csv/bigFlows.csv'

In [66]:
filename = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/to_csv/' + DFNAME + '.csv'

with open(file=filename, encoding='utf-8') as f:
    df_raw = pd.read_csv(
        f,
        header=0,               # using first row as columns names. they are exported by tshark -E header=y
        on_bad_lines='warn'     # if a line does not have the right length, skip it but warn me
        )

Skipping line 372: expected 19 fields, saw 129
Skipping line 1129: expected 19 fields, saw 120
Skipping line 1759: expected 19 fields, saw 104
Skipping line 2436: expected 19 fields, saw 104
Skipping line 2886: expected 19 fields, saw 120
Skipping line 3485: expected 19 fields, saw 129
Skipping line 4287: expected 19 fields, saw 102
Skipping line 4878: expected 19 fields, saw 102
Skipping line 5509: expected 19 fields, saw 109
Skipping line 6226: expected 19 fields, saw 113
Skipping line 6927: expected 19 fields, saw 145
Skipping line 7490: expected 19 fields, saw 77
Skipping line 8208: expected 19 fields, saw 129
Skipping line 8892: expected 19 fields, saw 102
Skipping line 9566: expected 19 fields, saw 129
Skipping line 10142: expected 19 fields, saw 120
Skipping line 10810: expected 19 fields, saw 102
Skipping line 11427: expected 19 fields, saw 122
Skipping line 12007: expected 19 fields, saw 93
Skipping line 12597: expected 19 fields, saw 104
Skipping line 13041: expected 19 field

In [67]:
df_raw['DateTime'] = pd.to_datetime(df_raw['_ws.col.Date'], utc=True)

df_raw.drop(columns=['_ws.col.Date'], inplace=True)

df_raw

Unnamed: 0,frame.number,eth.src,eth.dst,ip.src_host,ip.dst_host,ip.len,ip.hdr_len,ip.ttl,tcp.srcport,tcp.dstport,tcp.stream,tcp.len,tcp.seq,tcp.ack,tcp.hdr_len,tcp.time_relative,tcp.time_delta,tcp.flags,DateTime
0,1,00:90:7f:3e:02:d0,00:21:70:63:3b:ad,96.43.146.176,172.16.133.82,40,20,242,443,61228,0,0,1,1,20,0.000000,0.000000,0x0010,2013-02-26 22:02:35.953494+00:00
1,2,00:90:7f:3e:02:d0,00:21:70:63:3b:ad,96.43.146.176,172.16.133.82,40,20,242,443,61228,0,0,1,1107,20,0.000985,0.000985,0x0010,2013-02-26 22:02:35.954479+00:00
2,3,00:90:7f:3e:02:d0,00:21:70:63:3b:ad,96.43.146.176,172.16.133.82,40,20,241,443,60073,1,0,1,1,20,0.000000,0.000000,0x0010,2013-02-26 22:02:35.958435+00:00
3,4,00:21:70:67:6a:e7,00:90:7f:3e:02:d0,172.16.133.103,216.115.222.200,52,20,128,63406,443,2,0,0,0,32,0.000000,0.000000,0x0002,2013-02-26 22:02:35.959911+00:00
4,5,00:21:70:63:3c:4a,00:90:7f:3e:02:d0,172.16.133.43,172.16.139.250,251,20,128,57700,5440,3,211,1,1,20,0.000000,0.000000,0x0018,2013-02-26 22:02:35.962163+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633889,635013,00:90:7f:3e:02:d0,00:21:70:67:32:60,208.111.161.254,172.16.133.63,40,20,55,80,54278,22280,0,7383,5566,20,0.452463,0.024845,0x0010,2013-02-26 22:07:35.949374+00:00
633890,635014,00:90:7f:3e:02:d0,00:21:70:67:32:60,208.111.161.254,172.16.133.63,1500,20,55,80,54280,22282,1460,8704,6679,20,0.452032,0.026218,0x0010,2013-02-26 22:07:35.950010+00:00
633891,635015,00:90:7f:3e:02:d0,00:21:70:67:32:60,208.111.161.254,172.16.133.63,1161,20,55,80,54280,22282,1121,10164,6679,20,0.452733,0.000701,0x0018,2013-02-26 22:07:35.950711+00:00
633892,635016,00:21:70:67:32:60,00:90:7f:3e:02:d0,172.16.133.63,208.111.161.254,40,20,128,54280,80,22282,0,6679,11285,20,0.453596,0.000863,0x0010,2013-02-26 22:07:35.951574+00:00


### Labellisation by Suricata

In [68]:
SURICATA_EVE_LOG = DATA_OUTPUT + '/eve.json'

# delete existing eve.json file if it exists, suricata would append data otherwise
if os.path.isfile(SURICATA_EVE_LOG):
    os.remove(SURICATA_EVE_LOG)

In [69]:
# run Suricata and generate the log file in the EVE.JSON output file
cli="suricata -r " + PCAPFILE + " tcp " + " -l " + DATA_OUTPUT # + " -k none"

%time exit_code = os.system(cli)

if exit_code == 0:
    logger.info('Executed successfully *.pcap to EVE.json translation with suricata')
else:
    logger.error('Error while using suricata to analyse from *.pcap to EVE.json')
    raise NameError('Error while using suricata to analyse from *.pcap to EVE.json')

22/8/2023 -- 06:55:27 - <Notice> - This is Suricata version 6.0.10 RELEASE running in USER mode
22/8/2023 -- 06:55:27 - <Notice> - all 5 packet processing threads, 4 management threads initialized, engine started.
22/8/2023 -- 06:55:29 - <Notice> - Signal Received.  Stopping engine.
22/8/2023 -- 06:55:29 - <Notice> - Pcap-file module read 1 files, 633894 packets, 315855601 bytes
CPU times: user 4.75 ms, sys: 559 µs, total: 5.3 ms
Wall time: 2.72 s


In [70]:
# Pandas provides a useful method – json_normalize – for normalizing nested JSON fields into dataframe. Resulting columns use dot notation to signify nested objects, similar to how Elasticsearch does it

with open (SURICATA_EVE_LOG) as packets:
    df_log = pd.json_normalize(
        [json.loads(packet) for packet in packets],
        max_level=1
    )
    
df_log

Unnamed: 0,timestamp,flow_id,pcap_cnt,event_type,src_ip,src_port,dest_ip,dest_port,proto,anomaly.type,...,stats.decoder,stats.flow,stats.defrag,stats.flow_bypassed,stats.tcp,stats.detect,stats.app_layer,stats.http,stats.ftp,stats.file_store
0,2013-02-26T23:02:36.044728+0100,4.410381e+14,100.0,anomaly,98.138.19.88,80.0,172.16.133.132,44296.0,TCP,stream,...,,,,,,,,,,
1,2013-02-26T23:02:36.046200+0100,4.410381e+14,111.0,anomaly,172.16.133.132,44296.0,98.138.19.88,80.0,TCP,stream,...,,,,,,,,,,
2,2013-02-26T23:02:35.988964+0100,1.337649e+15,56.0,anomaly,172.16.133.113,55582.0,172.16.139.250,5462.0,TCP,stream,...,,,,,,,,,,
3,2013-02-26T23:02:35.988966+0100,1.337649e+15,57.0,anomaly,172.16.133.113,55582.0,172.16.139.250,5462.0,TCP,stream,...,,,,,,,,,,
4,2013-02-26T23:02:35.989963+0100,1.337649e+15,59.0,anomaly,172.16.133.113,55582.0,172.16.139.250,5462.0,TCP,stream,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40289,2013-02-26T23:07:35.938089+0100,9.850709e+14,,flow,172.16.133.95,57175.0,172.16.139.250,5440.0,TCP,,...,,,,,,,,,,
40290,2013-02-26T23:07:35.938089+0100,1.829517e+15,,flow,172.16.133.60,62146.0,208.98.232.43,80.0,TCP,,...,,,,,,,,,,
40291,2013-02-26T23:07:35.938089+0100,1.688782e+15,,flow,172.16.133.75,51172.0,172.16.139.250,5440.0,TCP,,...,,,,,,,,,,
40292,2013-02-26T23:07:35.938089+0100,5.628971e+14,,flow,172.16.133.66,54118.0,23.45.65.25,80.0,TCP,,...,,,,,,,,,,


In [71]:
# extract anomalies out of the whole log, put it in a specific dataframe
df_anomaly = df_log[df_log['event_type']=='anomaly']

# post-process : convert timestamp to UTC-datetime, set index
df_anomaly['DateTime'] = pd.to_datetime(df_anomaly['timestamp'],utc=True)
df_anomaly.drop(columns=['timestamp'], inplace=True)
df_anomaly.set_index(keys='DateTime', drop=False, inplace=True)

df_anomaly

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anomaly['DateTime'] = pd.to_datetime(df_anomaly['timestamp'],utc=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anomaly.drop(columns=['timestamp'], inplace=True)


Unnamed: 0_level_0,flow_id,pcap_cnt,event_type,src_ip,src_port,dest_ip,dest_port,proto,anomaly.type,anomaly.event,...,stats.flow,stats.defrag,stats.flow_bypassed,stats.tcp,stats.detect,stats.app_layer,stats.http,stats.ftp,stats.file_store,DateTime
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-02-26 22:02:36.044728+00:00,4.410381e+14,100.0,anomaly,98.138.19.88,80.0,172.16.133.132,44296.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2013-02-26 22:02:36.044728+00:00
2013-02-26 22:02:36.046200+00:00,4.410381e+14,111.0,anomaly,172.16.133.132,44296.0,98.138.19.88,80.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2013-02-26 22:02:36.046200+00:00
2013-02-26 22:02:35.988964+00:00,1.337649e+15,56.0,anomaly,172.16.133.113,55582.0,172.16.139.250,5462.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2013-02-26 22:02:35.988964+00:00
2013-02-26 22:02:35.988966+00:00,1.337649e+15,57.0,anomaly,172.16.133.113,55582.0,172.16.139.250,5462.0,TCP,stream,stream.fin_but_no_session,...,,,,,,,,,,2013-02-26 22:02:35.988966+00:00
2013-02-26 22:02:35.989963+00:00,1.337649e+15,59.0,anomaly,172.16.133.113,55582.0,172.16.139.250,5462.0,TCP,stream,stream.rst_but_no_session,...,,,,,,,,,,2013-02-26 22:02:35.989963+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-02-26 22:07:35.938089+00:00,1.923169e+15,,anomaly,108.170.194.161,80.0,172.16.133.163,3669.0,TCP,applayer,UNABLE_TO_MATCH_RESPONSE_TO_REQUEST,...,,,,,,,,,,2013-02-26 22:07:35.938089+00:00
2013-02-26 22:07:35.938089+00:00,1.635773e+15,,anomaly,174.129.214.80,80.0,172.16.133.66,54275.0,TCP,applayer,UNABLE_TO_MATCH_RESPONSE_TO_REQUEST,...,,,,,,,,,,2013-02-26 22:07:35.938089+00:00
2013-02-26 22:07:35.938089+00:00,2.492979e+14,,anomaly,174.129.214.80,80.0,172.16.133.66,54278.0,TCP,applayer,UNABLE_TO_MATCH_RESPONSE_TO_REQUEST,...,,,,,,,,,,2013-02-26 22:07:35.938089+00:00
2013-02-26 22:07:35.938089+00:00,1.346545e+14,,anomaly,132.245.2.22,443.0,172.16.133.113,55695.0,TCP,stream,stream.reassembly_seq_gap,...,,,,,,,,,,2013-02-26 22:07:35.938089+00:00


In [72]:
# idée : 
# 1. parcourir les DateTime du sous-ensemble des anomalies détectées par Suricata
# 2. regarder s'il y a un paquet avec ce timestamp exact dans l'extraction tshark
# 3. si oui : flagger y=1 le paquet dans la df tshark (et rajouter les champs d'explication)
# 4. si non : logger une anomalie orpheline

In [73]:
df = df_raw.copy()
df = df.set_index(keys='DateTime', drop=False)
df['y'] = 0  # set number of anomalies per *.pcap packet

df

Unnamed: 0_level_0,frame.number,eth.src,eth.dst,ip.src_host,ip.dst_host,ip.len,ip.hdr_len,ip.ttl,tcp.srcport,tcp.dstport,tcp.stream,tcp.len,tcp.seq,tcp.ack,tcp.hdr_len,tcp.time_relative,tcp.time_delta,tcp.flags,DateTime,y
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-02-26 22:02:35.953494+00:00,1,00:90:7f:3e:02:d0,00:21:70:63:3b:ad,96.43.146.176,172.16.133.82,40,20,242,443,61228,0,0,1,1,20,0.000000,0.000000,0x0010,2013-02-26 22:02:35.953494+00:00,0
2013-02-26 22:02:35.954479+00:00,2,00:90:7f:3e:02:d0,00:21:70:63:3b:ad,96.43.146.176,172.16.133.82,40,20,242,443,61228,0,0,1,1107,20,0.000985,0.000985,0x0010,2013-02-26 22:02:35.954479+00:00,0
2013-02-26 22:02:35.958435+00:00,3,00:90:7f:3e:02:d0,00:21:70:63:3b:ad,96.43.146.176,172.16.133.82,40,20,241,443,60073,1,0,1,1,20,0.000000,0.000000,0x0010,2013-02-26 22:02:35.958435+00:00,0
2013-02-26 22:02:35.959911+00:00,4,00:21:70:67:6a:e7,00:90:7f:3e:02:d0,172.16.133.103,216.115.222.200,52,20,128,63406,443,2,0,0,0,32,0.000000,0.000000,0x0002,2013-02-26 22:02:35.959911+00:00,0
2013-02-26 22:02:35.962163+00:00,5,00:21:70:63:3c:4a,00:90:7f:3e:02:d0,172.16.133.43,172.16.139.250,251,20,128,57700,5440,3,211,1,1,20,0.000000,0.000000,0x0018,2013-02-26 22:02:35.962163+00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-02-26 22:07:35.949374+00:00,635013,00:90:7f:3e:02:d0,00:21:70:67:32:60,208.111.161.254,172.16.133.63,40,20,55,80,54278,22280,0,7383,5566,20,0.452463,0.024845,0x0010,2013-02-26 22:07:35.949374+00:00,0
2013-02-26 22:07:35.950010+00:00,635014,00:90:7f:3e:02:d0,00:21:70:67:32:60,208.111.161.254,172.16.133.63,1500,20,55,80,54280,22282,1460,8704,6679,20,0.452032,0.026218,0x0010,2013-02-26 22:07:35.950010+00:00,0
2013-02-26 22:07:35.950711+00:00,635015,00:90:7f:3e:02:d0,00:21:70:67:32:60,208.111.161.254,172.16.133.63,1161,20,55,80,54280,22282,1121,10164,6679,20,0.452733,0.000701,0x0018,2013-02-26 22:07:35.950711+00:00,0
2013-02-26 22:07:35.951574+00:00,635016,00:21:70:67:32:60,00:90:7f:3e:02:d0,172.16.133.63,208.111.161.254,40,20,128,54280,80,22282,0,6679,11285,20,0.453596,0.000863,0x0010,2013-02-26 22:07:35.951574+00:00,0


In [74]:
ctr_anomalies = 0

for anomaly_datetime in df_anomaly.index:
    df.loc[anomaly_datetime, 'y'] = df.loc[anomaly_datetime, 'y'] + 1
    ctr_anomalies += 1
    
print(f'compté {ctr_anomalies} anomalies')
print(f"assigned {df['y'].sum()} anomalies")

df['y'].unique()  # Il peut y avoir plusieurs anomalies par paquet

compté 3926 anomalies
assigned 4073 anomalies


array([ 1,  0,  2,  4, 10])

In [75]:
# export final en *.csv

DF_OUTPUT_DIR = '/home/benjamin/Folders_Python/Cyber/data/dataframes/'
DF_FILENAME = DF_OUTPUT_DIR + 'df_' + DFNAME + '.csv'

with open(DF_FILENAME, 'w') as f:
    df.to_csv(f)