In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib

In [None]:
import warnings
# Silence every warning category for the rest of the session so library warnings do not clutter cell output.
# NOTE(review): this also hides deprecation/runtime warnings that may matter — confirm a blanket ignore is intended.
warnings.filterwarnings('ignore')

# Imports

In [None]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm

In [None]:
from waad.heuristics.H1.ip_clustering import IPClustering
from waad.heuristics.H1.ip_processing import IPProcessing

from waad.utils.anomalous_asset import ComputeAnomalousAssets
from waad.utils.asset import IP, Machine
from waad.utils.data import Data
from waad.utils.fait_notable import ComputeFaitNotablesFromIndicators, ComputeFaitNotablesFromRareIPv6, ComputeFaitNotablesFromAnalystTuplesAnalysers, FaitNotable
from waad.utils.indicators import ComputeIndicators, Indicators
from waad.utils.kmeans_clustering import KmeansClustering
from waad.utils.postgreSQL_utils import Database, Table
from waad.utils.rule import Link, Probability, Relation, Rule
from waad.utils.tuples_analyser import AnalystTuplesAnalyser

# Pipeline

### Variables

In [None]:
# Shared accumulator: the heuristic cells below append the faits notables they detect to this list.
# Re-run this cell before re-running those cells, otherwise their `extend` calls append the same items again.
faits_notables = []

### Load IP addresses

In [None]:
# Database connection settings consumed by the next cell.
HOST = '127.0.0.1'
PORT = '5432'

# Site-specific values: fill these in before running (do not commit real credentials).
USER = ''
PASSWORD = ''
DB_NAME = ''
TABLE_NAME = ''

In [None]:
# Build the database handle and the wrapper around the table to analyse.
db = Database(HOST, PORT, USER, PASSWORD, DB_NAME)
table = Table(db, TABLE_NAME)

# Pull every distinct value of the `ipaddress` column.
distinct_ips_query = f"SELECT DISTINCT ipaddress FROM {table.table_name};"
ips = table.get_command(distinct_ips_query)['ipaddress'].values

### 0.1

In [None]:
# Process the raw addresses; later cells read the resulting `ipv4s`, `ipv6s`,
# `public_ipv4s`, `public_ipv6s`, `private_ipv4s` and `private_ipv6s` attributes.
ipp = IPProcessing(ips=ips)
ipp.run()

### 0.3.1

In [None]:
# All public addresses (IPv4 followed by IPv6); iterated by the per-IP summary loop in section 1.1.1.
public_ips = ipp.public_ipv4s + ipp.public_ipv6s

### 0.3.2

In [None]:
# All private addresses (IPv4 followed by IPv6).
# NOTE(review): not referenced again in the cells visible here — confirm it is still needed.
private_ips = ipp.private_ipv4s + ipp.private_ipv6s

### 0.2 & 0.4

In [None]:
# Derive faits notables from the IPv6 addresses (given alongside the IPv4 ones)
# and add them to the shared accumulator.
cfnfripv6 = ComputeFaitNotablesFromRareIPv6(ipp.ipv6s, ipp.ipv4s)
cfnfripv6.run()
faits_notables += cfnfripv6.faits_notables

---------------

### 1.1.1

* On cherche à identifier s'il existe un sous-réseau d'adresses publiques communiquant avec les machines du parc

In [None]:
# Cluster the public IPv4 addresses (min_samples=3) and plot the result.
# The resulting `public_ipc.clusters` mapping is read by the per-IP summary loop below.
public_ipc = IPClustering(ipp.public_ipv4s, min_samples=3)
public_ipc.run()
# Trailing ';' suppresses the echoed return value, matching the private-IP clustering cell.
public_ipc.plot_clusters();

* Requête sur toutes les IP publiques et calcul des tables de résumé

In [None]:
# Per-IP accumulators, keyed by the exploded string form of each public address.
public_ips_summary = {}
public_ips_4624_summary = {}
public_ips_4624_authentications = {}

for ip in tqdm(public_ips):
    ip_key = ip.exploded

    # NOTE(review): an earlier variant queried 'pub_' + ip.exploded — confirm whether
    # addresses stored in the table carry such a prefix.
    public_ip_authentications = table.get_field_filtered_on_value('IpAddress', ip_key)
    public_ips_summary[ip_key] = Data.compute_window_summary(public_ip_authentications)

    # Only addresses listed under the `None` cluster key get the 4624 breakdown
    # (presumably the addresses left unclustered — TODO confirm against IPClustering).
    if ip_key not in public_ipc.clusters[None]:
        continue

    authentications_4624 = Data.filter_dataframe_field_on_value(public_ip_authentications, 'eventid', 4624)
    if authentications_4624.shape[0] == 0:
        # No 4624 event for this address: nothing to record.
        continue

    public_ips_4624_authentications[ip_key] = authentications_4624
    public_ips_4624_summary[ip_key] = Data.compute_window_summary(authentications_4624)

# Turn the per-IP dictionaries into tables with one row per address.
public_ips_summary = pd.DataFrame(public_ips_summary).T
public_ips_4624_summary = pd.DataFrame(public_ips_4624_summary).T

In [None]:
# Display the 4624 summary table (one row per public IP that had at least one 4624 event).
public_ips_4624_summary

Dans la partie suivante, on applique l'heuristique décrite par l'analyste dans son document, c'est-à-dire l'analyse sur les regroupements d'événements rares uniquement sur les 4624.

### 1.1.2

In [None]:
# One analyser per public IP that has 4624 authentications, keyed by its IP asset.
tuples_analysers = {}

for ip in tqdm(public_ips_4624_summary.index):
    ata = AnalystTuplesAnalyser(
        public_ips_4624_authentications[ip],
        exploratory_fields=[
            'computertype',
            'authenticationpackagename',
            'logonprocessname',
            'workstationname',
            'logontype',
            'processname',
            'host',
        ],
    )
    ata.run()
    tuples_analysers[IP(ip)] = ata

In [None]:
# Convert the analysers' findings into faits notables and add them to the shared accumulator.
cfnfta = ComputeFaitNotablesFromAnalystTuplesAnalysers(tuples_analysers=tuples_analysers)
cfnfta.run()
faits_notables += cfnfta.faits_notables

Si des groupes d'événements rares sont repérés parmi les IP, on les considère comme faits notables

-----------------------

### 1.2

#### 1.2.1

Look for abnormal behavior of a private IP (which could be an entry point or a transit point for an attacker)

* Retrieve all private IP data and compute a summary table from it. Also compute three indicators over time (`nb_computers_reached`, `nb_authentications` and `nb_new_computers`).

In [None]:
def _is_private_ip_logon(row):
    """Keep rows whose IP address starts with 'priv_' and whose host is not '?'."""
    return row['ipaddress'].startswith('priv_') and row['host'] != '?'


def _source_ip(row):
    """Build the IP asset from the row's `ipaddress` field."""
    return IP(row['ipaddress'])


def _target_machine(row):
    """Build the Machine asset from the first two dot-separated parts of the row's `host` field."""
    host_parts = row['host'].split('.')
    # NOTE(review): a host without any '.' raises IndexError here — confirm hosts always carry a domain.
    return Machine(name=host_parts[0], domain=host_parts[1])


# Rule linking an IP (asset_1) to a machine (asset_2) for 4624 events.
private_ip_logon_condition = {
    'pre_filters': {'eventid': 4624},
    'filter_function': _is_private_ip_logon,
    'asset_1': _source_ip,
    'asset_2': _target_machine,
}

rule = Rule(
    relation=Relation(link=Link.SE_CONNECTE_SUR, probability=Probability.CERTAIN),
    conditions=[private_ip_logon_condition],
)

In [None]:
# Compute the three indicators for the rule and report how long the run took (in seconds).
# perf_counter() is a monotonic clock, so the elapsed value is not affected by
# system clock adjustments the way time.time() differences can be.
start = time.perf_counter()
ci = ComputeIndicators(
    table=table,
    rule=rule,
    indicator_objects=[
        Indicators.NB_AUTHENTICATIONS.value,
        Indicators.NB_ASSETS_REACHED.value,
        Indicators.NB_NEW_ASSETS_REACHED.value,
    ],
)
ci.run()
print(time.perf_counter() - start)

#### 1.2.2

In [None]:
# Convert the computed indicators into faits notables and add them to the shared accumulator.
cfnfi = ComputeFaitNotablesFromIndicators(ci.indicators)
cfnfi.run()
faits_notables += cfnfi.faits_notables

#### 1.2.3

Look for abnormal names on private IPs (which could directly be the IP of an attacker). We apply clustering to group IPs by sub-network in order to isolate outlier names.

In [None]:
# Cluster the private IPv4 addresses (no explicit min_samples here, unlike the public-IP clustering)
# and plot the clusters.
private_ipc = IPClustering(ipp.private_ipv4s)
private_ipc.run()
# Trailing ';' suppresses the echoed return value of the last expression in the cell.
private_ipc.plot_clusters();

### 2. Faits notables

In [None]:
# Derive anomalous assets from every fait notable accumulated by the cells above.
# NOTE(review): if an upstream cell was run more than once, `faits_notables` holds duplicates — confirm
# whether ComputeAnomalousAssets tolerates that.
caa = ComputeAnomalousAssets(faits_notables)
caa.run()

In [None]:
# Show the first 20 rows of the anomalous-asset summary.
caa.get_summary().head(20)

In [None]:
# Render the first five anomalous assets.
for aa in caa.anomalous_assets[:5]:
    aa.display()