# Unsupervised learning

### The goal of this notebook is to explore combinations of models and feature sets extracted from `*.pcap` files

In [1]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
import collections
import seaborn as sns

import pyshark
import networkx as nx

# PATH change to access library cyberlib
import sys
sys.path.append('/home/benjamin/Folders_Python/Cyber/libs')
import cyberlib as cbl

In [2]:
# logging set-up
# NOTE(review): the path below is absolute and user-specific, so this cell only
# runs unchanged on a machine with the same directory layout.
# Presumably the argument is the file the logger writes to — confirm in cyberlib.

lg = cbl.GetLogger('/home/benjamin/Folders_Python/Cyber/logs/unsupervised_learning.log')
# `logger` is the object the logging calls in this notebook go through.
logger = lg.get_custom_logger()

# start your engine
# Emit a marker message at INFO level at the start of the run.
logger.info("-------- new run --------")

### Get the *.pcap to play with

In [3]:
# Capture file analysed by this notebook; it is passed to pyshark.FileCapture as input_file.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
PCAPFILE = '/home/benjamin/Folders_Python/Cyber/data/input_pcaps/test.pcap'

In [4]:
# Create the PyShark capture object for PCAPFILE.
# NOTE(review): use_ek=False presumably disables tshark's EK JSON mode — confirm in the PyShark docs.
capture = pyshark.FileCapture(
    input_file=PCAPFILE,
    use_ek=False
)

# Record in the log which pcap file the capture object was created from.
logger.info(f'-- created a capture object in PyShark with pcap file = {PCAPFILE} --')

In [5]:
# Preview: pretty-print the parsed fields of the first NB packets of the capture.
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Number of leading packets to display.
NB = 5

# The loop variable is `idx`, not `id`: notebook cells share one namespace, so a
# variable named `id` would keep shadowing the built-in id() in every later cell.
for idx in range(NB):
    # Wrap the raw PyShark packet in the cyberlib helper; `.data` is the nested
    # dict of per-layer fields (ETH / IP / TCP / TIMESTAMP) shown in the output.
    p = cbl.PyPacket(capture[idx])
    pp.pprint(p.data)
    print('-------------')

{   'ETH': {   'dst': '00:1f:f3:3c:e1:13',
               'src': 'f8:1e:df:e5:84:3a',
               'type': '0x00000800'},
    'IP': {   'dst': '74.125.19.17',
              'flags': '0x00000040',
              'hdr_len': '20',
              'id': '0x0000de53',
              'len': '79',
              'proto': '6',
              'src': '172.16.11.12',
              'ttl': '64',
              'version': '4'},
    'TCP': {   'ack': '1',
               'dstport': '443',
               'flags': '0x00000018',
               'hdr_len': '32',
               'len': '27',
               'payload': '15:03:01:00:16:43:1a:88:1e:fa:7a:bc:22:6e:e6:32:7a:53:47:00:a7:5d:cc:64:ea:8e:92',
               'seq': '1',
               'srcport': '64565',
               'stream': '0',
               'time_delta': '0.000000000',
               'time_relative': '0.000000000'},
    'TIMESTAMP': {'ts': datetime.datetime(2010, 7, 7, 5, 16, 19, 466743)}}
-------------
{   'ETH': {   'dst': '00:1f:f3:3c:e1:13',
   

### Choice of features

We will look at Ethernet packets that are part of a TCP conversation.

Features at TCP level: source port, destination port, sequence number, acknowledgement number, flags, header length, total length, time delta since the previous packet, time delta since the first packet

Features at IP level: source IP, destination IP, flags, header length, length, identification, TTL, version

Features at ETH level: none

In [12]:
# Build the working DataFrame from the capture via the cyberlib helper, keeping its `.dataframe` attribute.
# NOTE(review): the helper's name suggests only TCP packets are kept — confirm in cyberlib.
df = cbl.GetTCPDataframeFromFileCapture(filecapture=capture).dataframe

In [14]:
# Show the index range, column names, non-null counts and dtypes of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ETH_dst            106 non-null    int64         
 1   ETH_src            106 non-null    int64         
 2   IP_id              106 non-null    int64         
 3   IP_flags           106 non-null    int64         
 4   IP_src             106 non-null    int64         
 5   IP_dst             106 non-null    int64         
 6   TCP_flags          106 non-null    int64         
 7   IP_version         106 non-null    float64       
 8   IP_hdr_len         106 non-null    float64       
 9   IP_len             106 non-null    float64       
 10  IP_ttl             106 non-null    float64       
 11  IP_proto           106 non-null    float64       
 12  TCP_srcport        106 non-null    float64       
 13  TCP_dstport        106 non-null    float64       
 14  TCP_stream