In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm

In [None]:
# Raise pandas' display limits (column count, column width, row count) for this session.
for _option, _value in (
    ('display.max_columns', 500),
    ('display.max_colwidth', 150),
    ('display.max_rows', 500),
):
    pd.set_option(_option, _value)

# Imports

In [None]:
from waad.heuristics.H1.ip_processing import IPProcessing

from waad.utils.data import Data
from waad.utils.kmeans_clustering import KmeansClustering
from waad.utils.postgreSQL_utils import Database, Table

# Pipeline

### Load IP addresses

In [None]:
import os

# Connection settings for the PostgreSQL database queried below.
# Every value can be supplied through an environment variable so that credentials
# do not have to be typed into (and saved with) the notebook. The literal
# fallbacks are the notebook's original defaults.
HOST = os.environ.get('WAAD_DB_HOST', '127.0.0.1')
PORT = os.environ.get('WAAD_DB_PORT', '5432')
USER = os.environ.get('WAAD_DB_USER', '')   # To fill (or export WAAD_DB_USER)
PASSWORD = os.environ.get('WAAD_DB_PASSWORD', '')   # To fill (or export WAAD_DB_PASSWORD)
DB_NAME = os.environ.get('WAAD_DB_NAME', '')   # To fill (or export WAAD_DB_NAME)
TABLE_NAME = os.environ.get('WAAD_DB_TABLE', '')  # To fill (or export WAAD_DB_TABLE)

In [None]:
# Build the database handle from the connection settings, then the table wrapper
# that every query in this notebook goes through.
db = Database(HOST, PORT, USER, PASSWORD, DB_NAME)
table = Table(db, TABLE_NAME)

# KMeans

In [None]:
# Fetch every distinct value of the `ipaddress` column of the events table.
distinct_ips_query = f"SELECT DISTINCT ipaddress FROM {table.table_name}"
ips = table.get_command(distinct_ips_query).ipaddress.values

In [None]:
# Process the collected addresses; the `public_ipv4s` / `public_ipv6s` attributes
# of `ipp` are read by the next cell.
ipp = IPProcessing(ips=ips)
ipp.run()

In [None]:
# One row per public IP (v4 and v6), indexed by the exploded address and holding
# the window summary of the authentications recorded for that address.
# An earlier variant of this cell filtered on 'pub_' + ip.exploded instead of the
# bare exploded address.
public_ips_summary = pd.DataFrame(
    {
        ip.exploded: Data.compute_window_summary(
            table.get_field_filtered_on_value('IpAddress', ip.exploded)
        )
        for ip in tqdm(ipp.public_ipv4s + ipp.public_ipv6s)
    }
).T

In [None]:
# Express each `delta` as a plain number of seconds, stored in a new column.
public_ips_summary['delta_seconds'] = public_ips_summary['delta'].map(
    lambda delta: delta.total_seconds()
)

In [None]:
# Cluster the public IPs on a single feature: their `delta_seconds` value.
kmc = KmeansClustering(public_ips_summary['delta_seconds'].values)
kmc.run() 

In [None]:
# Plot the silhouette score of the clustering run above.
kmc.plot_silhouette_score()

In [None]:
# Plot the clusters found on `delta_seconds`.
kmc.plot_clusters() 

# Multiprocessing test on Cache building

Reduces computation time by at least a factor of 3 on my machine (Intel Core i7, 4 cores / 8 threads).

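Use the following cells to build the cache, after the `adtree` step, in the tuples big data notebook. They rely on names that are not defined in this notebook at this point (`adtree`, `ContingencyTable`, `META_FIELDS`, `MAX_LAYER`).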

In [None]:
from waad.utils.combinations_utils import custom_combinations_generator
import multiprocessing as mp

# Layered cache of contingency tables: CACHE[k] maps each k-tuple of META_FIELDS
# indices to the table computed for that combination.
CACHE = {}

def build_cache(comb):
    """Compute the contingency table of one combination of meta-field indices.

    Every index in ``comb`` is shifted by one before being handed to
    ``ContingencyTable``, together with the module-level ``adtree``.

    Returns:
        tuple: ``(comb, table)``, so the caller can tell which combination each
        result belongs to.
    """
    shifted = [index + 1 for index in comb]
    return comb, ContingencyTable(shifted, adtree).get_table()

def initialize_cache(adtree):
    """(Re)build layer 1 of ``CACHE``: one contingency table per meta-field.

    Args:
        adtree: object handed to every ``ContingencyTable`` built here.

    Side effects:
        Replaces ``CACHE[1]`` with a dict keyed by the 1-tuple ``(i,)`` for each
        index ``i`` of ``META_FIELDS``.
    """
    # Imported locally: this notebook never imports `sys`, so the `file=sys.stdout`
    # argument below used to fail with a NameError.
    import sys

    first_layer = {}
    for i in tqdm(range(len(META_FIELDS)), file=sys.stdout):
        # The field index is shifted by one before being given to ContingencyTable.
        first_layer[(i,)] = ContingencyTable([i + 1], adtree).get_table()

    # Publish the layer only once it is complete, so a failure midway does not
    # leave a partially filled CACHE[1] behind.
    CACHE[1] = first_layer

def add_new_cache_layer():
    """Add the next layer to ``CACHE``, computed in parallel.

    If the deepest existing layer is ``m``, builds ``CACHE[m + 1]``: one
    contingency table (via ``build_cache``) for every combination of length
    ``m + 1`` yielded by ``custom_combinations_generator`` over the indices of
    ``META_FIELDS``.

    Raises:
        ValueError: if ``CACHE`` is empty, i.e. no first layer exists yet.
    """
    import multiprocessing as mp

    if not CACHE:
        raise ValueError("CACHE is empty: run initialize_cache() before adding a layer")

    new_depth = max(CACHE.keys()) + 1
    combinations = custom_combinations_generator(list(range(len(META_FIELDS))), length=new_depth)

    # The context manager releases the worker processes even when a task raises;
    # previously the pool was only closed on the success path.
    with mp.Pool(mp.cpu_count(), maxtasksperchild=50) as pool:
        res = pool.map(build_cache, combinations)

    # Register the layer only after every table was computed: an empty layer left
    # behind by a failed run would make the next call build the wrong depth.
    CACHE[new_depth] = dict(res)

In [None]:
# Build the whole cache and report the elapsed wall-clock time in seconds.
# `adtree` and `MAX_LAYER` are not defined in the visible part of this notebook;
# they are expected to exist already when this cell runs.
start = time.time()
initialize_cache(adtree)

# Layer 1 was built above; add layers 2..MAX_LAYER one after the other, since
# each call extends the deepest existing layer.
for k in range(2, MAX_LAYER + 1):
    print(f"Build cache layer {k}")
    add_new_cache_layer()
print(time.time() - start)

## Koalas

```
$ pip install pyspark
$ pip install koalas
```

In [None]:
import databricks.koalas as ks
import os
from waad.utils.combinations_utils import flatten

In [None]:
# Set option to 'distributed' computation
# Use the 'distributed' default index type for koalas objects created from here on.
ks.set_option('compute.default_index_type', 'distributed')

In [None]:
start = time.time()
# Path to the folder with all csvs
PATH = '/waad/data/dataset_suite/'
# os.listdir() returns every entry of the folder in arbitrary order: keep only the
# CSV files and sort them, so stray entries are not handed to read_csv and the
# concatenation order is the same on every run. os.path.join also keeps working
# if PATH is given without a trailing slash.
csv_paths = [
    os.path.join(PATH, entry)
    for entry in sorted(os.listdir(PATH))
    if entry.lower().endswith('.csv')
]
if not csv_paths:
    raise ValueError(f"No CSV file found in {PATH}")
data = ks.concat([ks.read_csv(csv_path) for csv_path in csv_paths])
print(time.time() - start)

Experiment with lazy operations

In [None]:
# Replace missing values with the '?' placeholder, modifying `data` in place.
data.replace({None: '?'}, inplace=True)

In [None]:
# Fields under study. A tuple groups several columns into one meta-field; a bare
# string is a meta-field made of a single column.
META_FIELDS = [
    ('SubjectUserSid', 'SubjectUserName', 'SubjectDomainName'),
    ('TargetUserSid', 'TargetUserName', 'TargetDomainName'),
    ('TargetServerName', 'TargetInfo'),
    ('FailureReason', 'Status', 'SubStatus'),
    'LogonType',
    'EventID',
    'IpAddress',
    'LogonProcessName',
    ('AuthenticationPackageName', 'LmPackageName'),
    'WorkstationName',
    'Host',
]

In [None]:
# Keep only the rows whose EventID is 4624, restricted to the META_FIELDS columns.
# NOTE(review): EventID is compared to the integer 4624 here, whereas the rule's
# filter_function further down compares it to the string '4624' - confirm the dtype.
cut = data[data['EventID'] == 4624][flatten(META_FIELDS)]

In [None]:
# Preview the first 10 rows of the selection.
cut.head(10)

Define the `Rule` to study

In [None]:
from waad.utils.asset import Account, Machine
from waad.utils.rule import Link, Probability, Relation, Rule

In [None]:
# Rule under study: a certain SE_CONNECTE_SUR ("logs on to") relation from an
# account (asset_1) to a machine (asset_2), derived from 4624 events.
rule = Rule(
    relation=Relation(link=Link.SE_CONNECTE_SUR, probability=Probability.CERTAIN),
    conditions=[
        {
            'pre_filters': {'eventid': 4624},
            # Keep 4624 rows whose target SID starts with 'S-1-5-21-' and whose Host
            # is known. The "'.' in Host" test guards asset_2 below: its
            # split('.')[1] raised IndexError on a Host without a domain part.
            'filter_function': lambda row: (
                row['EventID'] == '4624'
                and row['TargetUserSid'].startswith('S-1-5-21-')
                and row['Host'] != '?'
                and '.' in row['Host']
            ),
            'asset_1': lambda row: Account(sid=row['TargetUserSid']),
            # Host is read as '<name>.<domain>[.<more>]': first label is the machine
            # name, second label is the domain.
            'asset_2': lambda row: Machine(name=row['Host'].split('.')[0], domain=row['Host'].split('.')[1]),
        }
    ]
)

#### Attempt to fill a global variable via `.apply()`

In [None]:
# Global list that the applied function below tries to fill.
# Note: this rebinds the name CACHE, which is a dict in the cache-building section above.
CACHE = []

In [None]:
def f(row) -> int:
    """Record the row's 'SystemTime' in the global ``CACHE`` list and return 0.

    The return value is a constant; the function exists only for its attempted
    side effect on ``CACHE``.
    """
    system_time = row['SystemTime']
    CACHE.append(system_time)
    return 0

In [None]:
# Apply f to every row (axis=1); the trailing ';' suppresses the cell output.
res = data.apply(f, axis=1);

In [None]:
# Request the first 100 results; the trailing ';' suppresses the cell output.
res.head(100);

In [None]:
# Display CACHE to check whether the applied function filled it.
CACHE

`CACHE` is not updated: with koalas / pyspark, an applied function cannot have side effects on the notebook's variables. This is a limitation of the approach.

#### Get another column via a .apply()

In [None]:
def f(row):
    """Turn one row into ``[asset_1, SystemTime, asset_2]`` using the rule's first condition.

    Returns None when the row does not pass the condition's ``filter_function``.
    Both assets are returned in their string form.
    """
    condition = rule.conditions[0]
    if not condition['filter_function'](row):
        return None

    source = condition['asset_1'](row)
    target = condition['asset_2'](row)
    return [str(source), row['SystemTime'], str(target)]

It is impossible to use custom objects in koalas, so the assets are converted to strings.

In [None]:
# Apply f to every row (axis=1): each result is a 3-item list, or None for rows
# rejected by the rule's filter_function.
res = data.apply(f, axis=1)

Unfortunately, unlike in pandas, the result of `f()` cannot be split directly into 3 columns, so `r` has to be computed in an additional step.

In [None]:
# Rebuild a 3-column koalas frame from the non-null results of f().
r = ks.DataFrame(res[res.notnull()].values.tolist(), columns=['asset_1', 'systemtime', 'asset_2'])

In [None]:
# Group the rows by the string form of their first asset.
t = r.groupby(by='asset_1')

In [None]:
# Inspect the rows of a single group; the key is an asset_1 string as produced by str() in f().
t.get_group('None - None - S-1-5-21-93088677042eb579-e1f7ccce2a1dcc00')

In [None]:
# Compute the NB_AUTHENTICATIONS indicator over time for every group, after
# converting each group to a pandas DataFrame.
# NOTE(review): ComputeIndicators and Indicators are not imported in the visible
# part of this notebook - confirm where they are expected to come from.
t.apply(lambda x: ComputeIndicators.compute_indicators_over_time2(pd.DataFrame(x), indicators=[Indicators.NB_AUTHENTICATIONS.value]))

This fails because it is impossible to use custom objects in koalas.
--> Not usable right now: does it need more mature development, or serialization of all custom objects?