- DBSCAN、HDBSCAN等方法將資料分群
- 觀察分群參數及結果
- 進一步的資料標記

In [1]:
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import joblib
from sklearn.metrics import euclidean_distances
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import hdbscan
from sortedcontainers import SortedDict

from upsetplot import UpSet, from_indicators

from utils.DefaultValue import *

pd.set_option('display.max_columns', None)

## Load Data

### Load from Pickle Object

In [2]:
df_flow1 = pd.read_pickle("../dataframe/tag_20230101_20230131.pkl")
df_flow2 = pd.read_pickle("../dataframe/tag_20230201_20230228.pkl")
df_flow = pd.concat([df_flow1, df_flow2])

df_flow

Unnamed: 0,packets_sent,bytes_sent,packets_recv,bytes_recv,packets_sent_to_target,bytes_sent_to_target,avg_packets_sent_size,packets_recv_from_target,bytes_recv_from_target,avg_packets_recv_size,packets_sent_to_port,bytes_sent_to_port,packets_recv_from_port,bytes_recv_from_port,distinct_to,distinct_ports,n_entries_to_target,distinct_ports_to_target,n_entries_to_port,Src_IP,Dst_IP,Src_Port,Dst_Port,Protocol,Duration,First_Seen,Last_Seen,botnets,exploits,tor,phishing,ransomware,malware,spam,cryptomining,scanner
0,2,92,2,92,2,92,46.000000,1,60,60.000000,2,92,1,60,1,1,1,1,1,193.163.125.121,140.123.105.255,54378,443.0,6,0.0,2022-12-31 16:10:47,2022-12-31 16:10:47,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1,60,1,60,1,60,60.000000,2,92,46.000000,1,60,2,92,1,1,1,1,1,140.123.105.255,193.163.125.121,443,54378.0,6,0.0,2022-12-31 16:10:47,2022-12-31 16:10:47,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,98,4805,56,2744,56,2744,49.000000,48,3448,71.833333,98,4805,78,5740,2,1,8,1,8,141.98.11.22,140.123.101.180,51755,25.0,6,2.0,2022-12-31 16:11:13,2022-12-31 16:11:15,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,98,4805,56,2744,56,2744,49.000000,48,3448,71.833333,98,4805,78,5740,2,1,8,1,8,141.98.11.22,140.123.101.180,51755,25.0,6,0.0,2022-12-31 16:11:13,2022-12-31 16:11:13,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,98,4805,56,2744,56,2744,49.000000,48,3448,71.833333,98,4805,78,5740,2,1,8,1,8,141.98.11.22,140.123.101.180,60103,25.0,6,0.0,2022-12-31 16:26:50,2022-12-31 16:26:50,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87870,18,1213,18,1213,18,1213,67.388889,16,4444,277.750000,18,1213,16,4444,1,1,11,1,11,140.123.105.234,204.11.56.48,5014,80.0,6,0.0,2023-02-28 15:41:31,2023-02-28 15:41:31,,,,,,,,,
87871,18,1213,18,1213,18,1213,67.388889,16,4444,277.750000,18,1213,16,4444,1,1,11,1,11,140.123.105.234,204.11.56.48,5014,80.0,6,0.0,2023-02-28 15:42:16,2023-02-28 15:42:16,,,,,,,,,
87872,18,1213,18,1213,18,1213,67.388889,16,4444,277.750000,18,1213,16,4444,1,1,11,1,11,140.123.105.234,204.11.56.48,5014,80.0,6,0.0,2023-02-28 15:43:00,2023-02-28 15:43:00,,,,,,,,,
87873,18,1213,18,1213,18,1213,67.388889,16,4444,277.750000,18,1213,16,4444,1,1,11,1,11,140.123.105.234,204.11.56.48,5014,80.0,6,0.0,2023-02-28 15:43:45,2023-02-28 15:43:45,,,,,,,,,


In [3]:
for tag in MALICIOUS_TAG:
    print(tag, df_flow[tag].sum())

botnets 146261.0
exploits 75973.0
tor 3894.0
phishing 106113.0
ransomware 0.0
malware 160420.0
spam 9287.0
cryptomining 261.0
scanner 153876.0


### Sample

In [4]:
# tag_name = 'exploits'

# df_flow = df_flow[df_flow[tag_name] == 1].dropna()

if df_flow.shape[0] > 75000:
    df_flow = df_flow.sample(75000, random_state=42)
else:
    df_flow = df_flow

df_flow

Unnamed: 0,packets_sent,bytes_sent,packets_recv,bytes_recv,packets_sent_to_target,bytes_sent_to_target,avg_packets_sent_size,packets_recv_from_target,bytes_recv_from_target,avg_packets_recv_size,packets_sent_to_port,bytes_sent_to_port,packets_recv_from_port,bytes_recv_from_port,distinct_to,distinct_ports,n_entries_to_target,distinct_ports_to_target,n_entries_to_port,Src_IP,Dst_IP,Src_Port,Dst_Port,Protocol,Duration,First_Seen,Last_Seen,botnets,exploits,tor,phishing,ransomware,malware,spam,cryptomining,scanner
108956,6,276,1,46,1,46,46.00000,0,0,0.000000,5,230,1,46,5,2,1,1,1,167.248.133.134,140.123.104.97,61788,443.0,6,0.0,2023-01-30 14:29:26,2023-01-30 14:29:26,,,,,,,,,
46296,321,14766,1,46,1,46,46.00000,0,0,0.000000,321,14766,34,1564,321,1,1,1,1,185.94.111.1,140.123.101.158,0,0.0,1,0.0,2023-02-15 21:55:01,2023-02-15 21:55:01,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
18850,1431,65826,1,46,1,46,46.00000,0,0,0.000000,1431,65826,548,27092,1431,1,1,1,1,146.88.240.4,140.123.105.251,54201,443.0,6,0.0,2023-01-04 18:09:11,2023-01-04 18:09:11,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3382,1463,68016,1,46,1,46,46.00000,0,0,0.000000,1463,68016,556,27502,1459,1,1,1,1,146.88.240.4,140.123.255.215,57475,443.0,6,0.0,2023-01-01 18:00:59,2023-01-01 18:00:59,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
12162,4,184,1,46,1,46,46.00000,1,46,46.000000,1,46,1,46,4,2,1,1,1,167.248.133.135,140.123.102.108,64087,8085.0,6,0.0,2023-01-03 10:12:39,2023-01-03 10:12:39,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39581,443,20378,1,46,1,46,46.00000,0,0,0.000000,443,20378,54,2582,417,1,1,1,1,154.89.5.94,140.123.103.134,58914,443.0,6,0.0,2023-02-14 00:11:09,2023-02-14 00:11:09,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
31467,81,3976,42,2057,42,2057,48.97619,30,2292,76.400000,81,3976,73,5275,2,1,6,1,6,141.98.11.22,140.123.101.188,56154,25.0,6,7.0,2023-02-12 18:55:17,2023-02-12 18:55:24,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
96170,1,46,44,2024,1,46,46.00000,1,46,46.000000,1,46,1,46,1,1,1,1,44,140.123.101.32,185.94.111.1,0,0.0,1,0.0,2023-01-26 10:07:02,2023-01-26 10:07:02,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
83574,1,46,1,46,1,46,46.00000,1,46,46.000000,1,46,1,46,1,1,1,1,1,140.123.102.108,154.89.5.94,8085,58914.0,6,0.0,2023-02-27 05:27:16,2023-02-27 05:27:16,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


## DBSCAN

### DBSCAN Setting

In [5]:
db = DBSCAN(
    eps=10000,
    min_samples=max(30, int(df_flow.shape[0] * 0.004)),
    metric='precomputed',
    algorithm='brute',
    leaf_size=30,
    n_jobs=10
)

In [6]:
db.get_params()

{'algorithm': 'brute',
 'eps': 10000,
 'leaf_size': 30,
 'metric': 'precomputed',
 'metric_params': None,
 'min_samples': 300,
 'n_jobs': 10,
 'p': None}

### Fit

In [7]:
D = euclidean_distances(
    df_flow[FEATURE_LIST].to_numpy(),
    df_flow[FEATURE_LIST].to_numpy()
)
#print(df_flow[NEW_FEATURE_LIST])
print(FEATURE_LIST)
#print(D.shape)

['packets_sent', 'bytes_sent', 'packets_recv', 'bytes_recv', 'packets_sent_to_target', 'bytes_sent_to_target', 'avg_packets_sent_size', 'packets_recv_from_target', 'bytes_recv_from_target', 'avg_packets_recv_size', 'packets_sent_to_port', 'bytes_sent_to_port', 'packets_recv_from_port', 'bytes_recv_from_port', 'distinct_to', 'distinct_ports', 'n_entries_to_target', 'distinct_ports_to_target', 'n_entries_to_port']


In [8]:
db_labels = db.fit_predict(D)
print(db_labels)
# -1 代表outlier

MemoryError: Unable to allocate 39.1 GiB for an array with shape (69884, 75000) and data type float64

### Analysis

#### Cluster數量、大小

In [None]:
print(f'Number of labels: {max(db_labels) + 1}')
print(f'Outlier: {list(db_labels).count(-1) / len(db_labels) * 100:.02f}%')
for i in range(0, max(db_labels) + 1):
    print(f'Cluster {i}: {len(db_labels[db_labels==i]) / len(db_labels) * 100:.02f}%')

### Label分類

In [None]:

# out_dir = f'cluster/'
# if not os.path.isdir(out_dir):
#     os.makedirs(out_dir)
# dict_df_flow = df_flow.to_dict('split')

# cluster_dict = {}
# for idx, label in enumerate(db_labels):
#     if label not in cluster_dict:
#         cluster_dict[label] = []
#     cluster_dict[label].append(dict_df_flow['data'][idx])

# result_1 = pd.DataFrame([cluster_dict[-1]])
# result_2 = pd.DataFrame([cluster_dict[0]])

# for i in cluster_dict:
#     df = pd.DataFrame(cluster_dict[i], columns = ['packets_sent', 'bytes_sent', 'packets_recv', 'bytes_recv', 'packets_sent_to_target', 'bytes_sent_to_target', 'avg_packets_sent_size',
#                                      'packets_recv_from_target', 'bytes_recv_from_target', 'avg_packets_recv_size', 'packets_sent_to_port', 'bytes_sent_to_port',
#                                      'packets_recv_from_port', 'bytes_recv_from_port', 'distinct_to', 'distinct_ports', 'n_entries_to_target', 'distinct_ports_to_target',
#                                      'n_entries_to_port', 'Src_IP', 'Dst_IP', 'Src_Port', 'Dst_Port', 'Protocol', 'Duration', 'First_Seen', 'Last_Seen',
#                                      'botnets', 'exploits',	'tor', 'phishing', 'ransomware', 'malware', 'spam', 'cryptomining', 'scanner'])
#     df.drop(['botnets', 'exploits', 'tor', 'phishing', 'ransomware', 'malware', 'spam', 'cryptomining', 'scanner'], axis=1, inplace = True)
#     df.to_csv(f"{out_dir}/cluster{i}.csv", encoding='utf-8', index=False)



In [None]:
out_dir = f'cluster2/'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
dict_df_flow = df_flow.to_dict('split')

cluster_dict = {}
for idx, label in enumerate(db_labels):
    if label not in cluster_dict:
        cluster_dict[label] = []
    cluster_dict[label].append(dict_df_flow['data'][idx])

#result_1 = pd.DataFrame([cluster_dict[-1]])
#result_2 = pd.DataFrame([cluster_dict[0]])
index=False

#create output_folder
out_folder = f"{BASE_PATH}same_ip_diff_behavior2"
if not os.path.isdir(out_folder):
    os.mkdir(out_folder)


## outer dicr, key is distinct ip
ip_dict = SortedDict()

csv_out_folder = f"{BASE_PATH}cluster2"
if not os.path.isdir(csv_out_folder):
    os.mkdir(csv_out_folder)
for i in cluster_dict: #i is dictionary number
    #malicious_ip = set()
    index=False
    df = pd.DataFrame(cluster_dict[i], columns = ['packets_sent', 'bytes_sent', 'packets_recv', 'bytes_recv', 'packets_sent_to_target', 'bytes_sent_to_target', 'avg_packets_sent_size',
                                     'packets_recv_from_target', 'bytes_recv_from_target', 'avg_packets_recv_size', 'packets_sent_to_port', 'bytes_sent_to_port',
                                     'packets_recv_from_port', 'bytes_recv_from_port', 'distinct_to', 'distinct_ports', 'n_entries_to_target', 'distinct_ports_to_target',
                                     'n_entries_to_port', 'Src_IP', 'Dst_IP', 'Src_Port', 'Dst_Port', 'Protocol', 'Duration', 'First_Seen', 'Last_Seen',
                                     'botnets', 'exploits', 'tor', 'phishing', 'ransomware', 'malware', 'spam', 'cryptomining', 'scanner'])
    df.drop(['botnets', 'exploits', 'tor', 'phishing', 'ransomware', 'malware', 'spam', 'cryptomining', 'scanner'], axis=1, inplace = True)
    # transform pandas to .csv and store it. 
    df.to_csv(f"{csv_out_folder}/cluster{i}.csv", encoding='utf-8')


    #build dict for distinct ip
    #print(df[["Src_IP", "First_Seen"]])
    dict_num = i
    #columns_of_interest = ['First_Seen']
    for index, row in df.iterrows():
        ip = row['Src_IP']
        time = row['First_Seen']
        #print(row['Src_IP'], row['First_Seen'])
        prefix = ip.split('.')
        prefix = ".".join(prefix[:2])
        if(prefix == "140.123"):
            continue

        if not ip_dict.__contains__(ip):
            ip_dict[ip] = SortedDict();
            ip_dict[ip].update({time:dict_num})
        else:
            ip_dict[ip].update({time:dict_num})


    #columns_of_interest = ['First_Seen']
    for index, row in df[["Dst_IP", "First_Seen"]].iterrows():
        ip = row['Dst_IP']
        time = row['First_Seen']
        prefix = ip.split('.')
        prefix = ".".join(prefix[:2])
        if(prefix == "140.123"):
            continue

        if not ip_dict.__contains__(ip):
            ip_dict[ip] = SortedDict();
            ip_dict[ip].update({time:dict_num})
        else:
            ip_dict[ip].update({time:dict_num})


#write out
for ip, cluster_dict in ip_dict.items():
    diff_value_map = {};
    for time, cluster_id in cluster_dict.items():
        if cluster_id not in diff_value_map:
            diff_value_map[cluster_id] = 1
    if len(diff_value_map) == 1:
        continue;
    with open(f"{out_folder}/{ip}.txt", 'w') as out_file:
        for time, cluster_id in cluster_dict.items():
            print(f'{time}: {cluster_id}', file=out_file)

del ip_dict


#### 每個cluster的tag狀況

In [None]:
db_tags = dict()
for tag in MALICIOUS_TAG:
    db_tags[tag] = list()

for i in range(0, max(db_labels) + 1):
    df = df_flow[db_labels==i]

    for tag in MALICIOUS_TAG:
        db_tags[tag].append(df[tag].sum())

    print(f"Cluster {i} total: {df[MALICIOUS_TAG].shape[0]}")
    print(df[MALICIOUS_TAG].sum())
    print()

#### 每個tag的資料，曾經出現在哪幾個cluster

In [None]:
for tag in MALICIOUS_TAG:
    print(f"{tag}: {[index for index, value in enumerate(db_tags[tag]) if value > 0]}")

#### outlier狀況

In [None]:
for tag in MALICIOUS_TAG:
    print(f"Tag {tag} total: {df_flow[tag].sum()}")
    print(df_flow[db_labels==-1][tag].sum())
    print((df_flow[db_labels==-1][tag].sum()/df_flow[tag].sum()) * 100 )

#### 出現哪些IP

In [None]:
df_flow[(db_labels==1)].Src_IP.unique()

#### Cluster採樣

In [None]:
df_flow[(db_labels==2) & (df_flow.malware==1) & (df_flow.phishing==1) & (df_flow.scanner==1)]

In [None]:
# sample = X_train[(~X_train.Src_Port.isin(p)) & (~X_train.Dst_Port.isin(p)) & (db_labels==4)]
# sample = X_train[~((X_train.Dst_IP=='151.139.128.11') | (X_train.Src_IP=='151.139.128.11'))]
sample = df_flow[(db_labels==2)]
sample