## IP-based grouping

In [None]:
import pandas as pd
import ipaddress
import numpy as np
from analyzer import _FLOW_FIELDS

In [None]:
# Load the flow records: column names are supplied by _FLOW_FIELDS and the
# 'ts' column is parsed into datetimes.
df_orig = pd.read_csv(
    "data.csv",
    names=_FLOW_FIELDS,
    parse_dates=['ts'],
)

In [None]:
# Preview the first five loaded rows.
df_orig.head(n=5)

## Features

In [None]:
# Bucket the records of every (src_ip, dst_ip) pair into 10-second bins on the
# 'ts' index. The frequency must be a valid offset alias ('10s'); '10$' is
# rejected by pandas.
grouped = df_orig.set_index('ts').groupby(['src_ip', 'dst_ip']).resample('10s')

In [None]:
df = grouped['src_tx'].sum().to_frame()

In [None]:
df['bytes_dw'] = grouped['dst_tx'].sum()
df = df.rename(columns={'src_tx': 'bytes_dw'})

In [None]:
df['num_conns'] = grouped['src_port'].count()

In [None]:
df['num_flows'] = grouped([['dst_port', 'src_port', 'ip_protocol']]).agg(lambda x: len(set(x))).max(axis=1)

In [None]:
df_orig[(df_orig['src_ip']=="10.2.1.20") & (df_orig['dst_ip']=="10.12.0.31") &
        (df_orig['ts']<=pd.datetime.fromisoformat("2017-01-27 16:47:20")) & (df_orig['ts']>=pd.datetime.fromisoformat("2017-01-27 16:47:10"))]

In [None]:
df['num_dst_port'] = grouped['dst_port'].agg(lambda x: len(set(x)))
df['num_src_port'] = grouped['src_port'].agg(lambda x: len(set(x)))

In [None]:
from collections import Counter

temp = grouped['ip_protocol'].agg(lambda x: Counter(x))

In [None]:
df['tcp_conns'] = temp.apply(lambda x: x['tcp'] if 'tcp' in x else 0)
df['udp_conns'] = temp.apply(lambda x: x['udp'] if 'udp' in x else 0)

In [None]:
import ipaddress

df2 = df.reset_index()

df2['cidr_src_ip'] = df2['src_ip'].apply(lambda x: str(ipaddress.ip_network(x)))
df2['cidr_dst_ip'] = df2['dst_ip'].apply(lambda x: str(ipaddress.ip_network(x)))
df2['pvt_src_ip'] = df2['src_ip'].apply(lambda x: ipaddress.IPv4Address(x).is_private)
df2['pvt_dst_ip'] = df2['dst_ip'].apply(lambda x: ipaddress.IPv4Address(x).is_private)

In [None]:
# all values are counts, so fillna with zeros
df2 = df2.fillna(0)

In [None]:
df2.to_pickle("df_src_dst_sampled_10s.pkl")

In [None]:
features = ['bytes_dw', 'bytes_dw', 'num_conns',
    'num_flows', 'num_dst_port', 'num_src_port', 'tcp_conns', 'udp_conns',
    'cidr_src_ip', 'cidr_dst_ip', 'pvt_src_ip', 'pvt_dst_ip']
features_min = ['bytes_dw', 'bytes_dw', 'num_conns',
    'num_flows', 'num_dst_port', 'num_src_port', 'tcp_conns', 'udp_conns']

In [None]:
from matplotlib import pyplot as plt
from sklearn import decomposition
from sklearn.preprocessing import normalize, StandardScaler

In [None]:
# Reload the saved feature frame and extract the numeric feature matrix.
df2 = pd.read_pickle("df_src_dst_sampled_10s.pkl")

numeric_features = df2[features_min]
X = numeric_features.values

### PCA

In [None]:
# Standardize the features, then fit a 6-component PCA on the scaled matrix
# and project the same matrix onto those components.
X_std = StandardScaler().fit_transform(X)

pca = decomposition.PCA(n_components=6)
X_pca = pca.fit(X_std).transform(X_std)

# Report the raw explained variance, then each component's share of it.
print(pca.explained_variance_)
for rank, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    share = round(100 * ratio, 2)
    print("{} component: {}% of initial variance".format(rank, share))

In [None]:
# Plotting the results of PCA
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

ax.plot(X_pca[:, 0], X_pca[:, 1], '+')
ax.set_xlabel('PCA 0')
ax.set_ylabel('PCA 1')
ax.grid(1)

### KMEANS

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [None]:
# Standardize each feature column before clustering.
X_std = StandardScaler().fit_transform(X)

# n_jobs is not passed: current scikit-learn releases no longer accept it on
# KMeans and raise a TypeError.
clt = KMeans(n_clusters=8, random_state=0)
model = clt.fit(X_std)
print( model.labels_ )
# labels_ already holds the assignment of every training sample, so the model
# does not need to be fitted a second time via fit_predict().
pred_y = model.labels_

In [None]:
# Colour the samples by cluster label, using the first two standardized
# feature columns as the axes.
x_axis, y_axis = X_std[:, 0], X_std[:, 1]
plt.scatter(x_axis, y_axis, c=pred_y, cmap='plasma')
# Optional overlay of the cluster centres (left disabled):
#plt.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1], s=300, c='red')
plt.show()

In [None]:
# Elbow method: for k = 1..10 record the mean Euclidean distance from each
# sample to its nearest cluster centre.
distortions = []

for k in range(1, 11):
    # n_jobs is not passed: current scikit-learn releases reject it on KMeans.
    clt = KMeans(n_clusters=k, random_state=0)
    model = clt.fit(X_std)
    print( model.labels_ )
    # Reuse the labels of the fit above rather than refitting with
    # fit_predict().
    pred_y = model.labels_

    nearest_centre = cdist(X_std, model.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre.mean())

In [None]:
# Plot distortion against the number of clusters to look for the elbow.
k_values = range(1, 11)
plt.plot(k_values, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')