## Import `*.pcap` by tshark, labellisation by Suricata, ML

### Set-Up

In [1]:
# imports

import collections
import logging
import os
import pprint
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# import ipywidgets
# import warnings

# import pyshark
# import networkx as nx

# from sklearn.preprocessing import OrdinalEncoder, StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans, DBSCAN
# from sklearn.manifold import TSNE
# from sklearn.metrics import pairwise_distances, silhouette_score
# from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
# import umap

# from itertools import product

# PATH change to access library cyberlib
import sys
sys.path.append('/home/benjamin/Folders_Python/Cyber/libs')
import cyberlib as cbl

# to allow PyShark to run in Jupyter notebooks
# import nest_asyncio
# nest_asyncio.apply()

In [2]:
# Build the notebook's logger through cyberlib's GetLogger helper.
# The path is handed to GetLogger as-is (presumably the file the logger writes to).
LOG_PATH = '/home/benjamin/Folders_Python/Cyber/logs/pcap_labellisation.log'

lg = cbl.GetLogger(LOG_PATH)
logger = lg.get_custom_logger()

# Mark the beginning of this run in the log output
logger.info("-------- new run --------")

### Import `*.pcap` by tshark, export to `*.csv` then DataFrame

In [3]:
# Capture to analyse: DFNAME is the file stem, PCAPFILE the full path to the *.pcap.
DFNAME = 'smallFlows'

PCAP_DIR = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/'
PCAPFILE = f'{PCAP_DIR}{DFNAME}.pcap'

In [17]:
# Parse the *.pcap file with the tshark CLI and write a *.csv file for pandas.
# doc here : https://www.wireshark.org/docs/man-pages/tshark.html
#
# tshark options used:
# -r                :   reads the *.pcap
# -2, -R "tcp"      :   does 2 passes and keeps packets part of TCP conversations only
# -T fields         :   outputs the selected fields, one packet per line
# -E header=y       :   keeps the fields names on first row for pd.read_csv
# -E separator=,    :   for *.csv format
# -t ad             :   absolute timestamps with the date (YYYY-MM-DD hh:mm:ss.ffffff)
# -e <field>        :   desired output field (one -e per field)
# -o                :   overrides a tshark preference (here gui.column.format, the column layout)

# One entry per exported column, in output order.
TSHARK_FIELDS = [
    '_ws.col.Time',
    'frame.number',
    'eth.src', 'eth.dst',
    'ip.src_host', 'ip.dst_host',
    'ip.len', 'ip.hdr_len', 'ip.ttl',
    'tcp.srcport', 'tcp.dstport', 'tcp.stream', 'tcp.len',
    'tcp.seq', 'tcp.ack', 'tcp.hdr_len', 'tcp.time_relative',
    'tcp.time_delta',
    'tcp.flags',
]

COLUMN_FORMAT = (
    'gui.column.format:"No","%m","Time","%t","Source","%s",'
    '"Destination","%d","Protocol","%p","Length","%L","Info","%i"'
)

# Same location the shell redirection used to target; '~' is expanded in Python
# because no shell is involved any more.
CSVFILE = os.path.expanduser('~/Folders_Python/Cyber/data/input_pcaps/to_csv/test.csv')

FAILURE_MSG = 'Error while using tshark to translate from *.pcap to *.csv'

# The command is an argument list (no shell), so a PCAPFILE containing spaces or
# shell metacharacters is passed to tshark verbatim instead of being re-parsed.
cli = [
    'tshark', '-r', PCAPFILE, '-2',
    '-R', 'tcp',
    '-T', 'fields', '-E', 'header=y', '-E', 'separator=,',
    '-t', 'ad',
]
for field in TSHARK_FIELDS:
    cli.extend(['-e', field])
cli.extend(['-o', COLUMN_FORMAT])

try:
    # tshark's stdout goes straight into the CSV file; stderr is captured so the
    # reason for a failure ends up in the log.
    with open(CSVFILE, 'wb') as csv_out:
        completed = subprocess.run(cli, stdout=csv_out, stderr=subprocess.PIPE, check=False)
except OSError as exc:
    # Raised when tshark is not installed / not executable, or the CSV cannot be opened.
    logger.error(f'{FAILURE_MSG}: {exc}')
    raise RuntimeError(FAILURE_MSG) from exc

exit_code = completed.returncode

if exit_code == 0:
    logger.info('Executed successfully *.pcap to *.csv translation with tshark')
else:
    tshark_stderr = completed.stderr.decode('utf-8', errors='replace').strip()
    logger.error(f'{FAILURE_MSG} (exit code {exit_code}): {tshark_stderr}')
    # RuntimeError rather than NameError: the failure is an external command
    # error, not an unbound Python name.
    raise RuntimeError(FAILURE_MSG)

In [18]:
# Load the CSV written by tshark into a DataFrame.
filename = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/to_csv/test.csv'

read_options = {
    'header': 0,               # first row holds the column names (exported by tshark -E header=y)
    'on_bad_lines': 'warn',    # a line with the wrong number of fields is skipped, with a warning
}

with open(filename, encoding='utf-8') as csv_handle:
    df_raw = pd.read_csv(csv_handle, **read_options)

In [19]:
# Preview the parsed packets as the cell's rich output (one row per exported packet).
df_raw

Unnamed: 0,_ws.col.Time,frame.number,eth.src,eth.dst,ip.src_host,ip.dst_host,ip.len,ip.hdr_len,ip.ttl,tcp.srcport,tcp.dstport,tcp.stream,tcp.len,tcp.seq,tcp.ack,tcp.hdr_len,tcp.time_relative,tcp.time_delta,tcp.flags
0,2011-01-25 19:52:22.484409,1,40:61:86:9a:f1:f5,00:1a:8c:15:f9:80,192.168.3.131,72.14.213.138,983,20,128,57011,80,0,943,1,1,20,0.000000,0.000000,0x0018
1,2011-01-25 19:52:22.514250,2,00:1a:8c:15:f9:80,40:61:86:9a:f1:f5,72.14.213.138,192.168.3.131,426,20,52,80,57011,0,386,1,944,20,0.029841,0.029841,0x0018
2,2011-01-25 19:52:22.708292,3,40:61:86:9a:f1:f5,00:1a:8c:15:f9:80,192.168.3.131,72.14.213.102,52,20,128,55950,80,1,0,0,0,32,0.000000,0.000000,0x0002
3,2011-01-25 19:52:22.713832,4,40:61:86:9a:f1:f5,00:1a:8c:15:f9:80,192.168.3.131,72.14.213.138,40,20,128,57011,80,0,0,944,387,20,0.229423,0.199582,0x0010
4,2011-01-25 19:52:22.727058,5,00:1a:8c:15:f9:80,40:61:86:9a:f1:f5,72.14.213.102,192.168.3.131,52,20,52,80,55950,1,0,0,1,32,0.018766,0.018766,0x0012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13703,2011-01-25 19:57:20.768701,13704,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,65.55.15.244,40,20,128,2537,5480,407,0,5039,5738,20,71.195375,66.560501,0x0014
13704,2011-01-25 19:57:20.768769,13705,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,207.46.105.186,40,20,128,2540,5480,409,0,398,93,20,70.606228,5.540471,0x0014
13705,2011-01-25 19:57:20.768861,13706,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,96.17.8.49,40,20,128,2547,5480,419,0,496,8189,20,64.405045,64.259982,0x0014
13706,2011-01-25 19:57:20.768911,13707,08:00:27:cc:3f:1b,52:54:00:12:35:02,10.0.2.15,91.103.140.2,40,20,128,2546,5480,417,0,525,270,20,64.884164,64.357688,0x0014


In [22]:
# # ordinal encoding with Pandas

# columns_to_encode_as_ordinal = ['frame.number', 'eth.src', 'eth.dst', 'ip.src_host', 'ip.dst_host', 'tcp.flags']

# df_ord = pd.DataFrame()
# for c in columns_to_encode_as_ordinal:
#     codes, _ = pd.factorize(df_raw[c])
#     df_sup = pd.DataFrame(data={ c : list(codes) })
#     df_ord = pd.concat([df_ord, df_sup], axis=1)
    
# df = df_raw.drop(columns=columns_to_encode_as_ordinal)
# df.reset_index(drop=True)

# df = pd.concat([df, df_ord], axis=1)

# # columns_to_drop = ['TIMESTAMP_ts']
# # df.drop(columns=columns_to_drop, inplace=True)

In [23]:
# Print column names, non-null counts and dtypes to check how the tshark fields were parsed.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13708 entries, 0 to 13707
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   _ws.col.Time       13708 non-null  object 
 1   frame.number       13708 non-null  int64  
 2   eth.src            13708 non-null  object 
 3   eth.dst            13708 non-null  object 
 4   ip.src_host        13708 non-null  object 
 5   ip.dst_host        13708 non-null  object 
 6   ip.len             13708 non-null  int64  
 7   ip.hdr_len         13708 non-null  int64  
 8   ip.ttl             13708 non-null  int64  
 9   tcp.srcport        13708 non-null  int64  
 10  tcp.dstport        13708 non-null  int64  
 11  tcp.stream         13708 non-null  int64  
 12  tcp.len            13708 non-null  int64  
 13  tcp.seq            13708 non-null  int64  
 14  tcp.ack            13708 non-null  int64  
 15  tcp.hdr_len        13708 non-null  int64  
 16  tcp.time_relative  137

### Labellisation by Suricata