## IP-based grouping

In [None]:
import pandas as pd
import ipaddress
import numpy as np
from analyzer import _FLOW_FIELDS

In [None]:
# Load the raw flow records. Column names are supplied by _FLOW_FIELDS and the
# 'ts' column is parsed into timestamps so it can be used for resampling.
df_orig = pd.read_csv(
    "data.csv",
    names=_FLOW_FIELDS,
    parse_dates=['ts'],
)

In [None]:
# Preview the first five rows of the raw flow table.
df_orig.head(n=5)

## Features

In [None]:
# Bucket the flows per (src_ip, dst_ip) pair into fixed 10-second windows.
# '10s' is the pandas offset alias for ten seconds; the previous value '10$'
# is not a valid frequency string and makes resample() raise.
grouped = (
    df_orig
    .set_index('ts')
    .groupby(['src_ip', 'dst_ip'])
    .resample('10s')
)

In [None]:
# Start the feature table from the per-bucket sum of 'src_tx'.
src_tx_total = grouped['src_tx'].sum()
df = src_tx_total.to_frame()

In [None]:
# Per-bucket byte totals, one column per direction:
#   bytes_dw - sum of 'dst_tx'
#   bytes_up - sum of 'src_tx' (the column the frame was created from)
# NOTE(review): presumably "dw"/"up" mean download/upload - confirm.
# The source-side column must not be renamed to 'bytes_dw' as well, otherwise
# the frame ends up with two columns sharing the same name.
df['bytes_dw'] = grouped['dst_tx'].sum()
df = df.rename(columns={'src_tx': 'bytes_up'})

In [None]:
# Count the non-null 'src_port' entries in each bucket.
conn_counts = grouped['src_port'].count()
df['num_conns'] = conn_counts

In [None]:
# For every bucket, count the distinct values of each of the three columns and
# keep the largest of those counts as the bucket's 'num_flows'.
# The columns are selected by indexing the resampler (grouped[[...]]); the
# resampler object is not callable, so grouped([[...]]) raises a TypeError.
df['num_flows'] = (
    grouped[['dst_port', 'src_port', 'ip_protocol']]
    .agg(lambda x: len(set(x)))
    .max(axis=1)
)

In [None]:
# Spot-check: show the raw flows for one src/dst pair inside a single window
# (both bounds inclusive).
# pd.Timestamp replaces pd.datetime.fromisoformat: the pd.datetime alias was
# deprecated in pandas 1.0 and removed in pandas 2.0.
window_start = pd.Timestamp("2017-01-27 16:47:10")
window_end = pd.Timestamp("2017-01-27 16:47:20")

is_pair = (df_orig['src_ip'] == "10.2.1.20") & (df_orig['dst_ip'] == "10.12.0.31")
in_window = (df_orig['ts'] >= window_start) & (df_orig['ts'] <= window_end)

df_orig[is_pair & in_window]

In [None]:
def _distinct_count(values):
    """Return how many distinct items *values* contains."""
    return len(set(values))


# Number of distinct destination / source ports seen in each bucket.
df['num_dst_port'] = grouped['dst_port'].agg(_distinct_count)
df['num_src_port'] = grouped['src_port'].agg(_distinct_count)

In [None]:
from collections import Counter


def _tally(values):
    """Return a Counter of how often each item occurs in *values*."""
    return Counter(values)


# One Counter per bucket, built from that bucket's 'ip_protocol' values.
protocol_column = grouped['ip_protocol']
temp = protocol_column.agg(_tally)

In [None]:
def _protocol_count(tally, protocol):
    """Return tally[protocol], or 0 when *protocol* is not a key of *tally*."""
    if protocol in tally:
        return tally[protocol]
    return 0


# Pull the per-bucket 'tcp' and 'udp' counts out of the protocol tallies.
df['tcp_conns'] = temp.apply(lambda tally: _protocol_count(tally, 'tcp'))
df['udp_conns'] = temp.apply(lambda tally: _protocol_count(tally, 'udp'))

In [None]:
import ipaddress


def _to_cidr(address):
    """Return *address* as a CIDR string.

    ipaddress.ip_network() turns a bare host address into a single-host
    network (for example a /32 for IPv4).
    """
    return str(ipaddress.ip_network(address))


def _is_private(address):
    """Return True when *address* lies in a private address range.

    Uses ipaddress.ip_address() rather than IPv4Address so that IPv6 addresses
    are handled as well instead of raising AddressValueError; this matches the
    version-agnostic ip_network() call used for the CIDR columns.
    """
    return ipaddress.ip_address(address).is_private


# Turn the index levels back into ordinary columns so the IP columns can be
# transformed directly.
df2 = df.reset_index()

df2['cidr_src_ip'] = df2['src_ip'].apply(_to_cidr)
df2['cidr_dst_ip'] = df2['dst_ip'].apply(_to_cidr)
df2['pvt_src_ip'] = df2['src_ip'].apply(_is_private)
df2['pvt_dst_ip'] = df2['dst_ip'].apply(_is_private)