In [1]:
# Reload edited project modules automatically, so changes made to the imported
# packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
# Silence every warning for the whole session.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings raised
# by the libraries used below are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

# Imports

In [3]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import time
from tqdm import tqdm

In [4]:
# Raise pandas' display limits (columns, column width, rows) for the tables
# rendered in this notebook.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_colwidth', 150),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, option_value)

In [5]:
from waad.utils.tuples_analyser import AnalystTuplesAnalyser
from waad.utils.constants import TupleAnalysisFields
from waad.utils.data import Data
from waad.utils.combinations_utils import flatten
from waad.utils.postgreSQL_utils import Database, Table
from waad.utils.tuples_big_data import Cache, ComputeMutualInfoOnGivenLevel, ComputeScoreOnGivenLevel

In [6]:
from ad_tree.array_record import ArrayRecord
from ad_tree.sparse_ADTree import ADNode
from ad_tree.iterated_tree_contingency_table import ContingencyTable

# Load data

### Prepare database

In [7]:
import os

# Connection settings of the PostgreSQL database and name of the table to analyse.
# Every value can be supplied through an environment variable so that credentials
# do not have to be written (and committed) in the notebook. When a variable is not
# set, the fallback is the original placeholder, to fill by hand.
HOST = os.environ.get('WAAD_DB_HOST', '127.0.0.1')
PORT = os.environ.get('WAAD_DB_PORT', '5432')
USER = os.environ.get('WAAD_DB_USER', '')   # To fill
PASSWORD = os.environ.get('WAAD_DB_PASSWORD', '')   # To fill
DB_NAME = os.environ.get('WAAD_DB_NAME', '')   # To fill
TABLE_NAME = os.environ.get('WAAD_DB_TABLE', '')  # To fill

In [8]:
# Build the database handle from the connection settings, then the table wrapper
# that is used below to send the SQL request (`table.get_command`).
db = Database(HOST, PORT, USER, PASSWORD, DB_NAME)
table = Table(db, TABLE_NAME)

In [9]:
# Fields used by the analysis. Comment out (or uncomment) entries to choose the
# meta fields you want to work with.
# An entry is either a single column name or a tuple of column names
# (presumably grouped into one meta field by `Data.build_meta_fields` — confirm).
META_FIELDS = [
    ('subjectusersid', 'subjectusername', 'subjectdomainname'),
    ('targetusersid', 'targetusername', 'targetdomainname'),
    ('targetservername', 'targetinfo'),
    ('failurereason', 'status', 'substatus'),
    'logontype',
    'eventid',
    'ipaddress',
    'logonprocessname',
    ('authenticationpackagename', 'lmpackagename'),
    'workstationname',
    'host',
]

### Example of request

In [10]:
# Maximum number of rows fetched from the table.
number_lines = 2500000

# Columns to retrieve: the two identification columns plus every column named in
# META_FIELDS (tuples are flattened into individual column names).
selected_columns = ['eventrecordid', 'systemtime'] + flatten(META_FIELDS)

# Only events with eventid 4624 are requested, limited to `number_lines` rows.
query = (
    f"SELECT {', '.join(selected_columns)} "
    f"FROM {table.table_name} "
    f"WHERE eventid = 4624 "
    f"FETCH FIRST {number_lines} ROWS ONLY;"
)
data = table.get_command(query)

### Filtering block

In [11]:
# data = Data.filter_dataframe_field_on_value(data, 'logontype', 3)

### Pre-processing

In [12]:
# Size of the first dimension of the fetched data
# (presumably the number of rows — confirm that `get_command` returns a DataFrame).
N = data.shape[0]

**Build meta fields**

In [13]:
# Build the META_FIELDS columns on `data`. The return value is not used, so this
# presumably modifies `data` in place — TODO confirm.
Data.build_meta_fields(data, META_FIELDS)

**Convert to categorical**

If some data needs to be 'categorised' (binned), an additional dictionary parameter is needed:

```
{
 'trade_usd': [0, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000], 
 'weight_kg': [0, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000]
}
```
It contains the different bins that we want to define on the columns to categorize.

In [14]:
# Convert the META_FIELDS columns of `data` to categorical (they are read through
# the `.cat` accessor below). `converter` is handed to the display/export helpers
# further down — presumably to map category codes back to values, TODO confirm.
converter = Data.set_as_categorical(data, META_FIELDS)

**Get records table in categorical format**

In [15]:
# Stack the integer category codes of every meta field (one row per field), then
# transpose so that the resulting nested list has one entry per record and one
# value per meta field, in META_FIELDS order.
# NOTE(review): pandas uses code -1 for missing values — confirm that the AD-Tree
# accepts it or that no missing value remains at this point.
records_table = np.asarray(
    [data[field_name].cat.codes for field_name in META_FIELDS]
).T.tolist()

**Get arity list**

In [16]:
# Number of distinct categories of each meta field, in META_FIELDS order.
# `len` on the categories index gives the count directly; there is no need to
# materialise a dict of every category just to count its entries.
arity_list = [len(data[f].cat.categories) for f in META_FIELDS]

In [17]:
# del data

# import gc
# gc.collect()

# AD Tree pipeline

### Building ADTree

In [None]:
# Time the construction with a monotonic, high-resolution clock: `time.time()`
# follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()

array_record = ArrayRecord(arity_list, records_table)

# Build an AD-Tree whose attribute list starts from the first attribute and that
# covers all the records (record numbers run from 1 to `records_length` included).
adtree = ADNode(1, record_nums=list(range(1, array_record.records_length + 1)), array_record=array_record)

# Elapsed build time, in seconds.
print(time.perf_counter() - start)

### Build cache

In [19]:
# NOTE(review): this dict is not referenced by any other cell visible in this
# notebook (the `cache` object created below is a separate `Cache` instance) —
# confirm whether it is still needed.
CACHE = {}

In [20]:
# Third argument given to `Cache` below — presumably the largest combination size
# of meta fields that gets cached; TODO confirm against `Cache`.
MAX_LAYER = 4

In [None]:
# Create the cache from the AD-Tree, the meta fields and MAX_LAYER, then run it.
# The resulting `cache` is the input of both computations below.
cache = Cache(adtree, META_FIELDS, MAX_LAYER)
cache.run()

### Compute mutual information

In [22]:
# Level handed to both the mutual information and the score computations below.
LEVEL_OF_INTEREST = 2

In [23]:
# Run the mutual information computation on the cache at LEVEL_OF_INTEREST.
cmiogl = ComputeMutualInfoOnGivenLevel(cache, LEVEL_OF_INTEREST)
cmiogl.run()

In [24]:
# Elbow index computed by `cmiogl`, and the `mu` value obtained from that index.
# NOTE(review): `mu` is not reused in the cells below — the threshold actually
# applied is the hand-set `MU`.
index_elbow = cmiogl.compute_index_elbow()
mu = cmiogl.get_mu(index_elbow)

In [None]:
# Plot the mutual information, passing the elbow index computed above.
# The trailing `;` suppresses the textual repr of the returned object.
cmiogl.plot_mutual_info(index_elbow=index_elbow);

In [26]:
# Threshold used to select the pairings. It is set by hand here rather than taken
# from the `mu` derived from the elbow index.
MU = 0.1

pairings_to_keep = cmiogl.get_pairings_to_keep(MU)

In [None]:
# Inspect the last 5 eliminated entries for threshold MU
# (presumably the pairings closest to being kept — TODO confirm).
cmiogl.get_last_n_eliminated(mu=MU, last_n=5, converter=converter)

### Scores computation 

In [28]:
# Third argument of `ComputeScoreOnGivenLevel` below; its meaning is defined by
# that class — TODO document the unit and the effect of this value.
T_ALPHA = 20

In [29]:
# Run the score computation on the cache at LEVEL_OF_INTEREST, with T_ALPHA and
# the pairings selected with the MU threshold.
csogl = ComputeScoreOnGivenLevel(cache, LEVEL_OF_INTEREST, T_ALPHA, pairings_to_keep)
csogl.run()

### Scores visualization

In [None]:
# Display the first 30 abnormal pairings (ranking defined by `csogl`).
csogl.get_firsts_abnormal_pairings(firsts_n=30, converter=converter)

In [31]:
# Export the first 500 abnormal pairings to a CSV file in the working directory.
# `min_card=5` presumably filters on a minimum cardinality — TODO confirm.
csogl.save_firsts_abnormal_pairings(firsts_n=500, converter=converter, path='./firsts_abnormal_pairings.csv', min_card=5)

### Additional qualification

#### Most frequent subpairings

In [None]:
# Show the 20 most frequent sub-pairings for the pairing selected by `index_number`
# (presumably its rank in the abnormal pairings list — TODO confirm).
csogl.display_most_frequents_subpairings(index_number=1, firsts_n=20, converter=converter)

#### AnalystTuplesAnalyser

In [None]:
# Retrieve the authentications matching the pairing selected by `index_number`,
# then run the analyst tuples analyser on them over the meta fields.
authentications = csogl.get_corresponding_authentications(data, index_number=8, converter=converter)
ata = AnalystTuplesAnalyser(authentications, exploratory_fields=META_FIELDS)
ata.run()

# Show the candidate summary when the analyser found one, otherwise say so.
if ata.candidate is None:
    print("No candidate found with AnalystTuplesAnalyser for those parameters")
else:
    ata.candidate.display_centered_summary()