# Scratch file to analyze netflow records

In [None]:
import pandas as pd
import ipaddr
import csv
import collections

from analyzer import Flow, Alert, Analyzer, _FLOW_FIELDS

In [None]:
# Replay every record of the capture through the analyzer, then print the
# alerts it collected together with their evidence.
analyzer = Analyzer()

# The csv module asks for files to be opened with newline="" so that it can
# handle line endings (including newlines inside quoted fields) itself.
with open("data.csv", "r", newline="") as csvfile:
    fin = csv.reader(csvfile)
    for row in fin:
        flow = Flow.from_csv(row)
        analyzer.process(flow)

# Reporting only reads analyzer.alerts, so it runs after the file is closed.
for alert in analyzer.alerts:
    print(alert.name)
    print("\n".join("\t{}".format(item) for item in alert.evidence))

In [None]:
#for chunk in pd.read_csv("data.csv", chunksize=10**8):
#    print(chunk.head())

# Data exploration with pandas (whole file loaded at once)

In [None]:
# Load the whole capture into a single DataFrame. Column names are supplied
# from _FLOW_FIELDS and the 'ts' column is parsed as datetimes.
df_orig = pd.read_csv(
    "data.csv",
    parse_dates=['ts'],
    names=_FLOW_FIELDS,
)

In [None]:
# Peek at the first flows whose source address is 179.126.22.176.
df_orig.loc[df_orig['src_ip'] == "179.126.22.176"].head()

In [None]:
# Distinct values that occur in the 'state' column.
pd.unique(df_orig['state'])

## Data info and counts

In [None]:
# Total number of flow records, followed by the distinct IP protocols seen.
print(df_orig.shape[0])
pd.unique(df_orig['ip_protocol'])

In [None]:
# Non-null value count of every column, per IP protocol.
(
    df_orig
    .groupby(by='ip_protocol')
    .count()
)

In [None]:
# Per-protocol totals of the numeric columns.
# numeric_only=True is required on recent pandas: the frame also holds a
# parsed datetime column ('ts') and string columns such as 'src_ip', which
# pandas >= 2.0 no longer drops silently before summing.
df_orig.groupby('ip_protocol').sum(numeric_only=True)

In [None]:
# Number of flows in which 'src_tx' is zero.
int((df_orig['src_tx'] == 0).sum())

In [None]:
# Number of flows in which 'dst_tx' is zero.
int((df_orig['dst_tx'] == 0).sum())

In [None]:
# Number of flows in which both 'src_tx' and 'dst_tx' are zero.
int(((df_orig['src_tx'] == 0) & (df_orig['dst_tx'] == 0)).sum())

### Group-by stats

In [None]:
# Records per destination port (non-null 'state' values), busiest port first.
(
    df_orig
    .groupby('dst_port')['state']
    .count()
    .sort_values(ascending=False)
)

### Group by flow

In [None]:
# Group records by the (src_ip, src_port, dst_ip, dst_port, ip_protocol) tuple.
gp = df_orig.groupby(
    by=['src_ip', 'src_port', 'dst_ip', 'dst_port', 'ip_protocol'],
)

In [None]:
# Sum 'src_tx' and 'dst_tx' per group, then add a 'total' column holding the
# row-wise sum of the two.
df1 = (
    gp[['src_tx', 'dst_tx']]
    .sum()
    .assign(total=lambda tx: tx.sum(axis=1))
)

In [None]:
# Largest groups first: order by 'total', breaking ties on 'dst_tx' and then
# 'src_tx', all descending.
df1.sort_values(
    by=['total', 'dst_tx', 'src_tx'],
    ascending=[False, False, False],
)

In [None]:
# Number of distinct destination ports in the capture (a missing value, if
# present, is counted as one distinct value).
df_orig['dst_port'].nunique(dropna=False)

In [None]:
# Number of distinct source ports in the capture (a missing value, if
# present, is counted as one distinct value).
df_orig['src_port'].nunique(dropna=False)

### Number of distinct destination ports per address

In [None]:
# Mean number of distinct destination ports per source address.
# nunique(dropna=False) yields the same counts as unique() followed by len(),
# without calling a Python function once per group.
df_orig.groupby('src_ip')['dst_port'].nunique(dropna=False).mean()

In [None]:
import numpy as np

# Median number of distinct destination ports per source address, computed
# with NumPy. nunique(dropna=False) yields the same counts as unique()
# followed by len(), without calling a Python function once per group.
np.median(df_orig.groupby('src_ip')['dst_port'].nunique(dropna=False))

In [None]:
# Median number of distinct destination ports per source address.
# nunique(dropna=False) yields the same counts as unique() followed by len(),
# without calling a Python function once per group.
df_orig.groupby('src_ip')['dst_port'].nunique(dropna=False).median()

In [None]:
# Mean number of distinct destination ports per destination address.
# nunique(dropna=False) yields the same counts as unique() followed by len(),
# without calling a Python function once per group.
df_orig.groupby('dst_ip')['dst_port'].nunique(dropna=False).mean()

In [None]:
# Median number of distinct destination ports per destination address.
# nunique(dropna=False) yields the same counts as unique() followed by len(),
# without calling a Python function once per group.
df_orig.groupby('dst_ip')['dst_port'].nunique(dropna=False).median()

In [None]:
%matplotlib inline
df_orig.groupby( ['dst_ip'] )['dst_port'].unique().apply(lambda x: len(x)).hist(bins = 200)

In [None]:
# Destination addresses ranked by their number of distinct destination
# ports, highest first. nunique(dropna=False) yields the same counts as
# unique() followed by len(), without calling a Python function per group.
(
    df_orig
    .groupby('dst_ip')['dst_port']
    .nunique(dropna=False)
    .sort_values(ascending=False)
)

In [None]:
# Histogram (1000 bins) of the number of distinct destination ports per
# source address. nunique(dropna=False) yields the same counts as unique()
# followed by len(), without calling a Python function per group.
df_orig.groupby('src_ip')['dst_port'].nunique(dropna=False).hist(bins=1000)