# Import libraries

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch
from elasticsearch_dsl import Q
from datetime import datetime
from datetime import timedelta
import pandas as pd
import numpy as np
import json
from collections import Counter
from itertools import product

# Create functions

In [3]:
def connect_ES(host='192.168.70.102', port=9200, timeout=60):
    """Create an Elasticsearch client.

    Parameters
    ----------
    host : str
        Host name or IP address of the Elasticsearch node. Defaults to the
        address this notebook has always used.
    port : int
        Port passed to the client (default 9200).
    timeout : int or float
        Timeout value passed to the client (default 60).

    Returns
    -------
    Elasticsearch
        A client object; constructing it does not by itself prove the node
        is reachable (the first request does).
    """
    return Elasticsearch(hosts=host, port=port, timeout=timeout)

In [4]:
def _alert_index_names(now, days=14):
    """Return the monthly alert index names that overlap ``[now - days, now]``.

    Index names have the form ``ai-alert-internet-YYYY-MM`` and
    ``ai-alert-intranet-YYYY-MM``. All internet indices come first, then all
    intranet indices, each group ordered from the newest month to the oldest.
    """
    start = now - timedelta(days)
    # Walk month by month from the start of the window up to the current month.
    months = []
    year, month = start.year, start.month
    while (year, month) <= (now.year, now.month):
        months.append(f'{year:04d}-{month:02d}')
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
    months.reverse()  # newest month first
    prefixes = ("ai-alert-internet-", "ai-alert-intranet-")
    return [prefix + month_tag for prefix in prefixes for month_tag in months]


# This function is used to get the data from the past 14 days
def get_exdata_pass_14_days():
    """Fetch the alerts ingested during the past 14 days from Elasticsearch.

    Returns
    -------
    pandas.DataFrame
        Columns ``alert_id``, ``src_ip`` and ``dst_ip``. Every ``dst_ip``
        entry is a list; scalar values are wrapped in a one-element list.
        An empty frame with these three columns is returned when the query
        matches nothing.

    Raises
    ------
    KeyError
        If hits are returned but none of them carries one of the three
        expected fields.
    """
    # Choose every monthly index the 14-day window can touch.
    # NOTE(review): the window is computed from local time while the range
    # filter below is evaluated by Elasticsearch -- confirm both use the same
    # time zone around month boundaries.
    index_list = _alert_index_names(datetime.today(), days=14)

    # Start grabbing data
    es = connect_ES()  # connect to elasticsearch
    # NOTE(review): size=10000 caps the result; hits beyond that are not
    # fetched by this query.
    search = Search(using=es, index=index_list) \
        .filter("range", ingest_timestamp={"gt": "now-14d/d", "lt": "now"}) \
        .extra(from_=0, size=10000)
    response = search.execute()  # Execute the query

    # Transform the hits into a dataframe so the needed fields are easy to pick
    data_list = [hit["_source"].to_dict() for hit in response.hits.hits]
    wanted_columns = ['alert_id', 'src_ip', 'dst_ip']
    if not data_list:
        return pd.DataFrame(columns=wanted_columns)

    # .copy() so the assignment below writes to an independent frame instead
    # of a slice (avoids SettingWithCopyWarning).
    df_clean = pd.DataFrame(data_list)[wanted_columns].copy()
    df_clean['dst_ip'] = df_clean['dst_ip'].apply(
        lambda s: s if isinstance(s, list) else [s])
    return df_clean

In [5]:
def dst_ip_dataframe(df):
    """Spread a collection of dst_ip lists into one column per list position.

    Parameters
    ----------
    df : iterable of list
        Typically the ``dst_ip`` column, where every entry is a list of IPs.

    Returns
    -------
    pandas.DataFrame
        One row per input entry and columns ``d1`` .. ``dN``, where N is the
        length of the longest list; shorter rows are padded with missing
        values. The result gets a fresh default index, not the input's index.
        An empty input yields an empty frame.
    """
    rows = list(df)
    # default=0 keeps an empty input from raising ValueError in max().
    width = max((len(row) for row in rows), default=0)
    d_columns = [f'd{position}' for position in range(1, width + 1)]
    return pd.DataFrame(rows, columns=d_columns)

In [12]:
def _sankey_links(df, source_col, target_col):
    """Build one sankey link per (source, target) pair that occurs in ``df``.

    The link value is the number of rows carrying that pair.
    """
    pair_counts = df.groupby([source_col, target_col]).size()
    # int() turns the numpy integer into a plain Python int.
    return [{'source': source, 'target': target, 'value': int(count)}
            for (source, target), count in pair_counts.items()]


def cross_search(order, choice):
    """Filter the last 14 days of alerts and format them for a sankey chart.

    Parameters
    ----------
    order : sequence of str
        Column names (``alert_id``, ``src_ip``, ``dst_ip``) in the order the
        filters are applied.
    choice : sequence
        One entry per column in ``order``. A value keeps only the rows whose
        column equals ``str(value)``; ``None`` keeps the rows belonging to
        the 10 most frequent values of that column at that point.

    Returns
    -------
    dict
        ``{'data': [{'name': ...}, ...], 'links': [[{'source', 'target',
        'value'}, ...]]}``.
    """
    # Fetch the data of the past 14 days
    df = get_exdata_pass_14_days()

    # Pre-processing: expand dst_ip so each row holds a single destination IP
    span_data = [(row[1], row[2], dst) for row in df.itertuples() for dst in row[3]]
    df = pd.DataFrame(span_data, columns=df.columns)
    unfiltered = df.copy()  # snapshot taken before any filtering
    df = df.sort_values(list(order))

    # Apply the filters in the requested order
    for o, c in zip(order, choice):
        if c is not None:  # a value was supplied
            # A boolean mask instead of a string-built query: the value is
            # compared as data and never parsed as an expression.
            df = df[df[o] == str(c)]
        else:  # no value supplied: keep the top 10
            top10 = df[o].value_counts()[:10].index
            df = df[df[o].isin(top10)]

    # Build the sankey output
    # NOTE(review): links always follow the frame's column order, not
    # `order` -- confirm that is intended.
    first_col, second_col, third_col = df.columns[:3]
    first_two = _sankey_links(df, first_col, second_col)   # column 1 -> column 2
    last_two = _sankey_links(df, second_col, third_col)    # column 2 -> column 3

    # NOTE(review): 'data' lists every value of the unfiltered frame, and
    # 'links' is a list wrapped in another list; both are kept as-is so
    # existing consumers see the same shape.
    output = {'data': [{'name': str(i)} for i in np.unique(unfiltered.values)],
              'links': [first_two + last_two]}
    return output

## TEST

In [13]:
# Smoke test: every choice is None, so each column in `order` is narrowed to its 10 most frequent values.
cross_search(order=['alert_id','src_ip','dst_ip'],choice=[None,None,None])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{'data': [{'name': '192.168.70.70'},
  {'name': '192.168.70.71'},
  {'name': '192.168.70.72'},
  {'name': '192.168.70.73'},
  {'name': '192.168.70.74'},
  {'name': '192.168.70.75'},
  {'name': '192.168.70.76'},
  {'name': '192.168.70.77'},
  {'name': '192.168.70.78'},
  {'name': '192.168.70.79'},
  {'name': '192.168.70.80'},
  {'name': '192.168.70.81'},
  {'name': '192.168.70.82'},
  {'name': '192.168.70.83'},
  {'name': '192.168.70.84'},
  {'name': '192.168.70.85'},
  {'name': '192.168.70.86'},
  {'name': '192.168.70.87'},
  {'name': '192.168.70.88'},
  {'name': '192.168.70.89'},
  {'name': '192.168.70.90'},
  {'name': '192.168.70.91'},
  {'name': '192.168.70.92'},
  {'name': '192.168.70.93'},
  {'name': '192.168.70.94'},
  {'name': '192.168.70.95'},
  {'name': '192.168.70.96'},
  {'name': '192.168.70.97'},
  {'name': '192.168.70.98'},
  {'name': '192.168.70.99'},
  {'name': '8.8.8.0'},
  {'name': '8.8.8.1'},
  {'name': '8.8.8.2'},
  {'name': '8.8.8.3'},
  {'name': '8.8.8.4'},
  {'nam

In [319]:
a = [1,2,3,4]  # NOTE(review): `a` is not used in this cell
# For every index entry, print the number of rows selected by its first index level.
# Presumably ai_si_df carries the (alert_id, num) MultiIndex built in the filtering cell -- confirm.
for i in ai_si_df.index:
    print(len(ai_si_df.loc[i[0]]))

7
7
7
7
7
7
7
4
4
4
4
4
4
4
4
6
6
6
6
6
6
4
4
4
4
5
5
5
5
5
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4


### Get the data from the past 14 days

In [8]:
# Load the alerts of the past 14 days; this needs a reachable Elasticsearch node.
cs_df = get_exdata_pass_14_days()
# cs_df.sort_values(['alert_id','src_ip'],ignore_index=True)
cs_df

ConnectionError: ConnectionError(<urllib3.connection.HTTPConnection object at 0x000001463F5683C8>: Failed to establish a new connection: [WinError 10060] 連線嘗試失敗，因為連線對象有一段時間並未正確回應，或是連線建立失敗，因為連線的主機無法回應。) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x000001463F5683C8>: Failed to establish a new connection: [WinError 10060] 連線嘗試失敗，因為連線對象有一段時間並未正確回應，或是連線建立失敗，因為連線的主機無法回應。)

### Data preprocessing

In [336]:
# Unpack the list in the dst_ip column: emit one (column 1, column 2, element) tuple
# for every element of the third column's list.
span_data = [(i[1],i[2],col_3) for i in cs_df.itertuples() for col_3 in i[3]] 
span_data

[('T0912', '192.168.70.74', '8.8.8.7'),
 ('T0936', '192.168.70.76', '8.8.8.2'),
 ('T0922', '192.168.70.74', '8.8.8.7'),
 ('T0974', '192.168.70.70', '8.8.8.3'),
 ('T0948', '192.168.70.77', '8.8.8.3'),
 ('T0953', '192.168.70.78', '8.8.8.7'),
 ('T0995', '192.168.70.79', '8.8.8.5'),
 ('T0942', '192.168.70.72', '8.8.8.8'),
 ('T0976', '192.168.70.74', '8.8.8.7'),
 ('T0941', '192.168.70.75', '8.8.8.3'),
 ('T0989', '192.168.70.79', '8.8.8.6'),
 ('T0972', '192.168.70.79', '8.8.8.9'),
 ('T0935', '192.168.70.71', '8.8.8.1'),
 ('T0998', '192.168.70.78', '8.8.8.6'),
 ('T0990', '192.168.70.78', '8.8.8.2'),
 ('T0931', '192.168.70.77', '8.8.8.1'),
 ('T0917', '192.168.70.74', '8.8.8.3'),
 ('T0905', '192.168.70.78', '8.8.8.0'),
 ('T0940', '192.168.70.75', '8.8.8.0'),
 ('T0958', '192.168.70.75', '8.8.8.4'),
 ('T0988', '192.168.70.78', '8.8.8.8'),
 ('T0969', '192.168.70.78', '8.8.8.4'),
 ('T0956', '192.168.70.75', '8.8.8.6'),
 ('T0964', '192.168.70.74', '8.8.8.5'),
 ('T0929', '192.168.70.72', '8.8.8.9'),


In [337]:
# Rebuild cs_df from the flattened tuples, reusing the original column names.
# NOTE(review): this overwrites cs_df; re-running the flattening cell afterwards
# would iterate over the characters of each dst_ip string.
cs_df = pd.DataFrame(span_data, columns=cs_df.columns)

In [300]:
# Sort by alert_id, then src_ip, and renumber the rows from 0.
ai_si_df = cs_df.sort_values(['alert_id','src_ip'],ignore_index=True)
ai_si_df

Unnamed: 0,alert_id,src_ip,dst_ip
0,T0800,192.168.70.95,"[192.168.70.87, 192.168.70.87, 192.168.70.86, ..."
1,T0801,192.168.70.97,"[192.168.70.87, 192.168.70.87, 192.168.70.87, ..."
2,T0802,192.168.70.95,"[192.168.70.87, 192.168.70.81, 192.168.70.86, ..."
3,T0802,192.168.70.99,"[192.168.70.82, 192.168.70.88, 192.168.70.82, ..."
4,T0803,192.168.70.93,"[192.168.70.85, 192.168.70.85, 192.168.70.89, ..."
5,T0803,192.168.70.95,"[192.168.70.89, 192.168.70.88, 192.168.70.81, ..."
6,T0803,192.168.70.95,"[192.168.70.80, 192.168.70.87, 192.168.70.80, ..."
7,T0803,192.168.70.96,"[192.168.70.89, 192.168.70.84, 192.168.70.80, ..."
8,T0806,192.168.70.92,"[192.168.70.89, 192.168.70.88, 192.168.70.81, ..."
9,T0808,192.168.70.99,"[192.168.70.86, 192.168.70.80, 192.168.70.83, ..."


In [190]:
# Preview without the alert_id column; the result is not assigned, so ai_si_df is unchanged.
ai_si_df.drop("alert_id",axis=1)

Unnamed: 0,src_ip,dst_ip
0,192.168.70.95,"[192.168.70.89, 192.168.70.89, 192.168.70.87, ..."
1,192.168.70.93,"[192.168.70.85, 192.168.70.86, 192.168.70.86, ..."
2,192.168.70.98,"[192.168.70.85, 192.168.70.86, 192.168.70.88, ..."
3,192.168.70.91,"[192.168.70.86, 192.168.70.86, 192.168.70.87, ..."
4,192.168.70.93,"[192.168.70.80, 192.168.70.81, 192.168.70.89, ..."
5,192.168.70.95,"[192.168.70.88, 192.168.70.86, 192.168.70.80, ..."
6,192.168.70.91,"[192.168.70.86, 192.168.70.83, 192.168.70.89, ..."
7,192.168.70.98,"[192.168.70.82, 192.168.70.84, 192.168.70.89, ..."
8,192.168.70.90,"[192.168.70.87, 192.168.70.82, 192.168.70.85, ..."
9,192.168.70.95,"[192.168.70.82, 192.168.70.82, 192.168.70.82, ..."


In [28]:
# Create 3 dataframes: 1) alert_id + src_ip, 2) alert_id + dst_ip, 3) src_ip + dst_ip

# ai_si_df = test_df[['alert_id','src_ip']]
# ai_di_df = test_df[['alert_id','dst_ip']]
# di_si_df = test_df[['src_ip','dst_ip']]

In [301]:
# Start doing condition filtering

# Case 1 : alert_id -> src_ip -> dst_ip
# ai_si_df
# ai_si_df = ai_si_df.sort_values(['alert_id','src_ip'])
# Keep only the rows whose alert_id is among the 10 most frequent alert_ids.
top10ai_list = [i for i in ai_si_df['alert_id'].value_counts()[:10].index]
ai_si_df = ai_si_df[ai_si_df['alert_id'].isin(top10ai_list)]
# Move alert_id into the index, then append a 1-based running number within each
# alert_id as a second index level.
# NOTE(review): ai_si_df is reassigned here, so re-running this cell fails
# because alert_id is no longer a column.
ai_si_df = ai_si_df.set_index(['alert_id'],)
ai_si_df['num'] = ai_si_df.groupby(level=0).cumcount()+1
ai_si_df.set_index('num', append=True,inplace=True)
ai_si_df

Unnamed: 0_level_0,Unnamed: 1_level_0,src_ip,dst_ip
alert_id,num,Unnamed: 2_level_1,Unnamed: 3_level_1
T0814,1,192.168.70.90,"[192.168.70.85, 192.168.70.85, 192.168.70.89, ..."
T0814,2,192.168.70.93,"[192.168.70.80, 192.168.70.85, 192.168.70.81, ..."
T0814,3,192.168.70.94,"[192.168.70.88, 192.168.70.89, 192.168.70.85, ..."
T0814,4,192.168.70.95,"[192.168.70.81, 192.168.70.80, 192.168.70.88, ..."
T0814,5,192.168.70.96,"[192.168.70.86, 192.168.70.89, 192.168.70.85, ..."
T0814,6,192.168.70.97,"[192.168.70.85, 192.168.70.80, 192.168.70.87, ..."
T0814,7,192.168.70.97,"[192.168.70.87, 192.168.70.84, 192.168.70.87, ..."
T0817,1,192.168.70.93,"[192.168.70.82, 192.168.70.81, 192.168.70.83, ..."
T0817,2,192.168.70.95,"[192.168.70.84, 192.168.70.81, 192.168.70.83, ..."
T0817,3,192.168.70.98,"[192.168.70.83, 192.168.70.84, 192.168.70.89, ..."


In [305]:
# Inspect the index of ai_si_df.
ai_si_df.index

('T0814', 2)

In [134]:
# Narrow ai_si_df to the rows whose src_ip is among its 10 most frequent src_ip values.
top10si_list = [i for i in ai_si_df["src_ip"].value_counts()[:10].index]
ai_si_df = ai_si_df[ai_si_df["src_ip"].isin(top10si_list)]
ai_si_df

Unnamed: 0_level_0,Unnamed: 1_level_0,src_ip,dst_ip
alert_id,num,Unnamed: 2_level_1,Unnamed: 3_level_1
T0874,1,192.168.70.93,"[192.168.70.80, 192.168.70.85, 192.168.70.83, ..."
T0874,2,192.168.70.98,"[192.168.70.86, 192.168.70.88, 192.168.70.87, ..."
T0877,1,192.168.70.90,"[192.168.70.85, 192.168.70.89, 192.168.70.87, ..."
T0877,2,192.168.70.99,"[192.168.70.89, 192.168.70.85, 192.168.70.80, ..."
T0879,1,192.168.70.93,"[192.168.70.84, 192.168.70.83, 192.168.70.84, ..."
T0883,1,192.168.70.90,"[192.168.70.84, 192.168.70.81, 192.168.70.83, ..."
T0887,1,192.168.70.95,"[192.168.70.81, 192.168.70.83, 192.168.70.88, ..."
T0895,1,192.168.70.92,"[192.168.70.81, 192.168.70.82, 192.168.70.85, ..."
T0910,1,192.168.70.70,[8.8.8.5]
T0910,2,192.168.70.73,[8.8.8.2]


In [143]:
# si_di_df
top10si_list = list(ai_si_df['src_ip'].value_counts()[:10].index) # grab top 10 src_ip on the condition of top 10 alert_id
si_di_df = ai_si_df[ai_si_df['src_ip'].isin(top10si_list)]
si_di_df = si_di_df.sort_values(['src_ip'])
# Replace the index with src_ip, then append a 1-based running number within
# each src_ip as a second index level.
si_di_df.set_index(['src_ip'],inplace=True)
si_di_df['num'] = si_di_df.groupby(level=0).cumcount()+1
si_di_df.set_index('num', append=True,inplace=True)
si_di_df

Unnamed: 0_level_0,Unnamed: 1_level_0,dst_ip
src_ip,num,Unnamed: 2_level_1
192.168.70.70,1,[8.8.8.5]
192.168.70.73,1,[8.8.8.2]
192.168.70.76,1,[8.8.8.8]
192.168.70.79,1,[8.8.8.8]
192.168.70.79,2,[8.8.8.8]
192.168.70.90,1,"[192.168.70.85, 192.168.70.89, 192.168.70.87, ..."
192.168.70.90,2,"[192.168.70.84, 192.168.70.81, 192.168.70.83, ..."
192.168.70.92,1,"[192.168.70.81, 192.168.70.82, 192.168.70.85, ..."
192.168.70.93,1,"[192.168.70.80, 192.168.70.85, 192.168.70.83, ..."
192.168.70.93,2,"[192.168.70.84, 192.168.70.83, 192.168.70.84, ..."


In [181]:
# Expand every dst_ip list into positional columns and attach them to a copy of cs_df.
n_df = dst_ip_dataframe(cs_df.dst_ip)
n_cs_df = cs_df.copy()
for col_name, col_values in n_df.items():
    # Assign by position (plain list) rather than by index label.
    n_cs_df[col_name] = list(col_values)
n_cs_df

Unnamed: 0,alert_id,src_ip,dst_ip,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
0,T0910,192.168.70.70,[8.8.8.5],8.8.8.5,,,,,,,,,
1,T0919,192.168.70.78,[8.8.8.2],8.8.8.2,,,,,,,,,
2,T0928,192.168.70.78,[8.8.8.6],8.8.8.6,,,,,,,,,
3,T0989,192.168.70.71,[8.8.8.9],8.8.8.9,,,,,,,,,
4,T0999,192.168.70.79,[8.8.8.8],8.8.8.8,,,,,,,,,
5,T0944,192.168.70.71,[8.8.8.6],8.8.8.6,,,,,,,,,
6,T0934,192.168.70.76,[8.8.8.9],8.8.8.9,,,,,,,,,
7,T0958,192.168.70.71,[8.8.8.5],8.8.8.5,,,,,,,,,
8,T0963,192.168.70.72,[8.8.8.9],8.8.8.9,,,,,,,,,
9,T0910,192.168.70.73,[8.8.8.2],8.8.8.2,,,,,,,,,


In [144]:
# Expand every dst_ip list into positional columns and add them to si_di_df itself.
n_df = dst_ip_dataframe(si_di_df.dst_ip)
for col_name, col_values in n_df.items():
    # Assign by position (plain list) because n_df does not share si_di_df's index.
    si_di_df[col_name] = list(col_values)

si_di_df

Unnamed: 0_level_0,Unnamed: 1_level_0,dst_ip,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
src_ip,num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
192.168.70.70,1,[8.8.8.5],8.8.8.5,,,,,,,,,
192.168.70.73,1,[8.8.8.2],8.8.8.2,,,,,,,,,
192.168.70.76,1,[8.8.8.8],8.8.8.8,,,,,,,,,
192.168.70.79,1,[8.8.8.8],8.8.8.8,,,,,,,,,
192.168.70.79,2,[8.8.8.8],8.8.8.8,,,,,,,,,
192.168.70.90,1,"[192.168.70.85, 192.168.70.89, 192.168.70.87, ...",192.168.70.85,192.168.70.89,192.168.70.87,192.168.70.87,192.168.70.80,192.168.70.89,192.168.70.85,192.168.70.80,192.168.70.80,192.168.70.81
192.168.70.90,2,"[192.168.70.84, 192.168.70.81, 192.168.70.83, ...",192.168.70.84,192.168.70.81,192.168.70.83,192.168.70.89,192.168.70.88,192.168.70.86,192.168.70.80,192.168.70.82,192.168.70.85,192.168.70.81
192.168.70.92,1,"[192.168.70.81, 192.168.70.82, 192.168.70.85, ...",192.168.70.81,192.168.70.82,192.168.70.85,192.168.70.81,192.168.70.82,192.168.70.83,192.168.70.87,192.168.70.84,192.168.70.83,192.168.70.81
192.168.70.93,1,"[192.168.70.80, 192.168.70.85, 192.168.70.83, ...",192.168.70.80,192.168.70.85,192.168.70.83,192.168.70.82,192.168.70.81,192.168.70.82,192.168.70.82,192.168.70.85,192.168.70.85,192.168.70.80
192.168.70.93,2,"[192.168.70.84, 192.168.70.83, 192.168.70.84, ...",192.168.70.84,192.168.70.83,192.168.70.84,192.168.70.88,192.168.70.81,192.168.70.88,192.168.70.81,192.168.70.87,192.168.70.84,192.168.70.83


In [118]:
# NOTE(review): value_counts() on the expanded DataFrame counts identical rows,
# so each entry below is a whole tuple of IPs rather than a single dst_ip.
top10di_list = list(dst_ip_dataframe(si_di_df["dst_ip"]).value_counts()[:10].index)
top10di_list

[('192.168.70.80',
  '192.168.70.85',
  '192.168.70.83',
  '192.168.70.82',
  '192.168.70.81',
  '192.168.70.82',
  '192.168.70.82',
  '192.168.70.85',
  '192.168.70.85',
  '192.168.70.80'),
 ('192.168.70.81',
  '192.168.70.82',
  '192.168.70.85',
  '192.168.70.81',
  '192.168.70.82',
  '192.168.70.83',
  '192.168.70.87',
  '192.168.70.84',
  '192.168.70.83',
  '192.168.70.81'),
 ('192.168.70.81',
  '192.168.70.83',
  '192.168.70.88',
  '192.168.70.83',
  '192.168.70.81',
  '192.168.70.82',
  '192.168.70.86',
  '192.168.70.81',
  '192.168.70.83',
  '192.168.70.88'),
 ('192.168.70.84',
  '192.168.70.81',
  '192.168.70.83',
  '192.168.70.89',
  '192.168.70.88',
  '192.168.70.86',
  '192.168.70.80',
  '192.168.70.82',
  '192.168.70.85',
  '192.168.70.81'),
 ('192.168.70.84',
  '192.168.70.83',
  '192.168.70.84',
  '192.168.70.88',
  '192.168.70.81',
  '192.168.70.88',
  '192.168.70.81',
  '192.168.70.87',
  '192.168.70.84',
  '192.168.70.83'),
 ('192.168.70.85',
  '192.168.70.89',
  '192.

In [119]:
# Preview the dst_ip lists of si_di_df spread into positional columns.
dst_ip_dataframe(si_di_df["dst_ip"])

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
0,8.8.8.5,,,,,,,,,
1,8.8.8.2,,,,,,,,,
2,8.8.8.8,,,,,,,,,
3,8.8.8.8,,,,,,,,,
4,8.8.8.8,,,,,,,,,
5,192.168.70.85,192.168.70.89,192.168.70.87,192.168.70.87,192.168.70.80,192.168.70.89,192.168.70.85,192.168.70.80,192.168.70.80,192.168.70.81
6,192.168.70.84,192.168.70.81,192.168.70.83,192.168.70.89,192.168.70.88,192.168.70.86,192.168.70.80,192.168.70.82,192.168.70.85,192.168.70.81
7,192.168.70.81,192.168.70.82,192.168.70.85,192.168.70.81,192.168.70.82,192.168.70.83,192.168.70.87,192.168.70.84,192.168.70.83,192.168.70.81
8,192.168.70.80,192.168.70.85,192.168.70.83,192.168.70.82,192.168.70.81,192.168.70.82,192.168.70.82,192.168.70.85,192.168.70.85,192.168.70.80
9,192.168.70.84,192.168.70.83,192.168.70.84,192.168.70.88,192.168.70.81,192.168.70.88,192.168.70.81,192.168.70.87,192.168.70.84,192.168.70.83


In [106]:
# Work out the top 10 dst_ip values

# ds_ip_1 = dst_ip_dataframe(si_di_df["dst_ip"])
# Flatten all dst_ip lists into one list of IPs.
di_list = [ip for ip_group in si_di_df["dst_ip"] for ip in ip_group]
print(di_list)
# Occurrences per IP, most frequent first.
dict(Counter(di_list).most_common())

['8.8.8.5', '8.8.8.2', '8.8.8.8', '8.8.8.8', '8.8.8.8', '192.168.70.85', '192.168.70.89', '192.168.70.87', '192.168.70.87', '192.168.70.80', '192.168.70.89', '192.168.70.85', '192.168.70.80', '192.168.70.80', '192.168.70.81', '192.168.70.84', '192.168.70.81', '192.168.70.83', '192.168.70.89', '192.168.70.88', '192.168.70.86', '192.168.70.80', '192.168.70.82', '192.168.70.85', '192.168.70.81', '192.168.70.81', '192.168.70.82', '192.168.70.85', '192.168.70.81', '192.168.70.82', '192.168.70.83', '192.168.70.87', '192.168.70.84', '192.168.70.83', '192.168.70.81', '192.168.70.80', '192.168.70.85', '192.168.70.83', '192.168.70.82', '192.168.70.81', '192.168.70.82', '192.168.70.82', '192.168.70.85', '192.168.70.85', '192.168.70.80', '192.168.70.84', '192.168.70.83', '192.168.70.84', '192.168.70.88', '192.168.70.81', '192.168.70.88', '192.168.70.81', '192.168.70.87', '192.168.70.84', '192.168.70.83', '192.168.70.81', '192.168.70.83', '192.168.70.88', '192.168.70.83', '192.168.70.81', '192.168.

{'192.168.70.81': 12,
 '192.168.70.83': 11,
 '192.168.70.85': 9,
 '192.168.70.88': 9,
 '192.168.70.87': 8,
 '192.168.70.80': 8,
 '192.168.70.82': 8,
 '192.168.70.84': 7,
 '192.168.70.89': 5,
 '8.8.8.8': 3,
 '192.168.70.86': 3,
 '8.8.8.5': 1,
 '8.8.8.2': 1}

In [112]:
# The 10 most frequent dst_ip values together with their counts.
Counter(di_list).most_common(10)

[('192.168.70.81', 12),
 ('192.168.70.83', 11),
 ('192.168.70.85', 9),
 ('192.168.70.88', 9),
 ('192.168.70.87', 8),
 ('192.168.70.80', 8),
 ('192.168.70.82', 8),
 ('192.168.70.84', 7),
 ('192.168.70.89', 5),
 ('8.8.8.8', 3)]

In [111]:
# Only the IPs of the 10 most frequent dst_ip values, without their counts.
[ip for ip, _count in Counter(di_list).most_common(10)]

['192.168.70.81',
 '192.168.70.83',
 '192.168.70.85',
 '192.168.70.88',
 '192.168.70.87',
 '192.168.70.80',
 '192.168.70.82',
 '192.168.70.84',
 '192.168.70.89',
 '8.8.8.8']

In [87]:
# Scratch check of sorting a dict by value.
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
# NOTE(review): this sorted dict is neither assigned nor displayed; only the
# last expression of the cell is shown.
dict(sorted(x.items(), key=lambda item: item[1]))
x.items()

dict_items([(1, 2), (3, 4), (4, 3), (2, 1), (0, 0)])

In [89]:
# Show the (value, count) pairs of count_item.
# NOTE(review): count_item is not defined in any visible cell.
dict(Counter(count_item)).items()

dict_items([('8.8.8.5', 1), ('8.8.8.2', 1), ('8.8.8.8', 3), ('192.168.70.85', 3), ('192.168.70.84', 2), ('192.168.70.81', 3), ('192.168.70.80', 1), ('192.168.70.86', 1), ('192.168.70.89', 2), (None, 5), ('192.168.70.82', 1), ('192.168.70.83', 2), ('192.168.70.88', 1)])

In [244]:
# Counts of the 10 most frequent src_ip values in ai_si_df.
ai_si_df['src_ip'].value_counts()[:10]

192.168.70.95    2
192.168.70.96    2
192.168.70.99    2
192.168.70.90    2
192.168.70.94    1
192.168.70.92    1
192.168.70.72    1
192.168.70.75    1
192.168.70.70    1
192.168.70.76    1
Name: src_ip, dtype: int64

In [197]:
# ai_si_df['src_ip'].value_counts()[:10]
# Boolean mask: True where the row's src_ip is one of the 10 most frequent src_ip values.
criterion = ai_si_df['src_ip'].isin(list(ai_si_df['src_ip'].value_counts()[:10].index))
ai_si_df[criterion]

Unnamed: 0_level_0,Unnamed: 1_level_0,src_ip
alert_id,subindex,Unnamed: 2_level_1
T0802,1,192.168.70.95
T0825,1,192.168.70.95
T0825,2,192.168.70.96
T0825,3,192.168.70.99
T0842,1,192.168.70.94
T0842,2,192.168.70.96
T0876,1,192.168.70.90
T0876,2,192.168.70.99
T0883,1,192.168.70.90
T0895,1,192.168.70.92


In [176]:
# Select the rows whose index label is 'T0825'.
ai_si_df.loc['T0825']

Unnamed: 0_level_0,src_ip
subindex,Unnamed: 1_level_1
1,192.168.70.95
2,192.168.70.96
3,192.168.70.99


In [22]:
# Wrap every non-list dst_ip value in a one-element list, then materialise the column as a plain list.
# NOTE(review): test_df is not defined in any visible cell.
dst_ip = test_df['dst_ip'].apply(lambda s : [s] if type(s)!= type(list()) else s)
dlist = [i for i in dst_ip]
dlist

[['192.168.70.89',
  '192.168.70.89',
  '192.168.70.87',
  '192.168.70.85',
  '192.168.70.82',
  '192.168.70.80',
  '192.168.70.82',
  '192.168.70.89',
  '192.168.70.80',
  '192.168.70.83'],
 ['192.168.70.85',
  '192.168.70.86',
  '192.168.70.86',
  '192.168.70.89',
  '192.168.70.87',
  '192.168.70.89',
  '192.168.70.80',
  '192.168.70.84',
  '192.168.70.87',
  '192.168.70.87'],
 ['192.168.70.85',
  '192.168.70.86',
  '192.168.70.88',
  '192.168.70.89',
  '192.168.70.81',
  '192.168.70.88',
  '192.168.70.88',
  '192.168.70.80',
  '192.168.70.88',
  '192.168.70.87'],
 ['192.168.70.86',
  '192.168.70.86',
  '192.168.70.87',
  '192.168.70.83',
  '192.168.70.87',
  '192.168.70.89',
  '192.168.70.85',
  '192.168.70.81',
  '192.168.70.87',
  '192.168.70.84'],
 ['192.168.70.80',
  '192.168.70.81',
  '192.168.70.89',
  '192.168.70.87',
  '192.168.70.84',
  '192.168.70.81',
  '192.168.70.89',
  '192.168.70.80',
  '192.168.70.81',
  '192.168.70.81'],
 ['192.168.70.88',
  '192.168.70.86',
  '192.

In [23]:
# dst_ip = cs_df['dst_ip'].apply(if_list)
# dlist = [i for i in dst_ip]
# dlist
# Same steps as dst_ip_dataframe, run inline on dlist: one column d1..dN per list
# position, where N is the length of the longest list.
d_columns = [f'd{i}' for i in range(1,max([len(i) for i in dlist])+1)]
di_df = pd.DataFrame(dlist,columns=d_columns)
di_df

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10
0,192.168.70.89,192.168.70.89,192.168.70.87,192.168.70.85,192.168.70.82,192.168.70.80,192.168.70.82,192.168.70.89,192.168.70.80,192.168.70.83
1,192.168.70.85,192.168.70.86,192.168.70.86,192.168.70.89,192.168.70.87,192.168.70.89,192.168.70.80,192.168.70.84,192.168.70.87,192.168.70.87
2,192.168.70.85,192.168.70.86,192.168.70.88,192.168.70.89,192.168.70.81,192.168.70.88,192.168.70.88,192.168.70.80,192.168.70.88,192.168.70.87
3,192.168.70.86,192.168.70.86,192.168.70.87,192.168.70.83,192.168.70.87,192.168.70.89,192.168.70.85,192.168.70.81,192.168.70.87,192.168.70.84
4,192.168.70.80,192.168.70.81,192.168.70.89,192.168.70.87,192.168.70.84,192.168.70.81,192.168.70.89,192.168.70.80,192.168.70.81,192.168.70.81
5,192.168.70.88,192.168.70.86,192.168.70.80,192.168.70.85,192.168.70.81,192.168.70.87,192.168.70.87,192.168.70.83,192.168.70.84,192.168.70.86
6,192.168.70.86,192.168.70.83,192.168.70.89,192.168.70.87,192.168.70.87,192.168.70.84,192.168.70.80,192.168.70.81,192.168.70.86,192.168.70.84
7,192.168.70.82,192.168.70.84,192.168.70.89,192.168.70.85,192.168.70.89,192.168.70.80,192.168.70.84,192.168.70.80,192.168.70.85,192.168.70.85
8,192.168.70.87,192.168.70.82,192.168.70.85,192.168.70.80,192.168.70.88,192.168.70.87,192.168.70.83,192.168.70.88,192.168.70.81,192.168.70.84
9,192.168.70.82,192.168.70.82,192.168.70.82,192.168.70.84,192.168.70.83,192.168.70.82,192.168.70.89,192.168.70.80,192.168.70.87,192.168.70.86


In [184]:
# Check the type of a single cell: row position 2, then item 2 of that row.
type(di_df.iloc[2][2])

NoneType

In [76]:

# alert_id and src_ip of the rows whose src_ip is among the 10 most frequent src_ip values of cs_df.
cs_df[cs_df['src_ip'].isin([i for i in cs_df['src_ip'].value_counts()[:10].index])][['alert_id','src_ip']]

Unnamed: 0,alert_id,src_ip
0,T0905,192.168.70.75
1,T0901,192.168.70.72
2,T0990,192.168.70.72
4,T0996,192.168.70.75
12,T0811,192.168.70.98
13,T0837,192.168.70.90
14,T0860,192.168.70.97
15,T0848,192.168.70.95
16,T0825,192.168.70.95
17,T0879,192.168.70.93


In [75]:
# The 10 most frequent src_ip values of cs_df, most frequent first.
list(cs_df['src_ip'].value_counts()[:10].index)

['192.168.70.90',
 '192.168.70.99',
 '192.168.70.95',
 '192.168.70.97',
 '192.168.70.93',
 '192.168.70.98',
 '192.168.70.91',
 '192.168.70.96',
 '192.168.70.75',
 '192.168.70.72']

In [None]:
output = {series: { type: 'sankey', layout: 'none', emphasis: { focus: 'adjacency' },\
            data: [],\
           links: [{source : , target: , value: }]}

option = {
  series: {
    type: 'sankey',
    layout: 'none',
    emphasis: {
      focus: 'adjacency'
    },
    data: [
      {
        name: 'a'
      },
      {
        name: 'b'
      },
      {
        name: 'a1'
      },
      {
        name: 'a2'
      },
      {
        name: 'b1'
      },
      {
        name: 'c'
      }
    ],
    links: [
      {
        source: 'a',
        target: 'a1',
        value: 5
      },
      {
        source: 'a',
        target: 'a2',
        value: 3
      },
      {
        source: 'b',
        target: 'b1',
        value: 8
      },
      {
        source: 'a',
        target: 'b1',
        value: 3
      },
      {
        source: 'b1',
        target: 'a1',
        value: 1
      },
      {
        source: 'b1',
        target: 'c',
        value: 2
      }
    ]
  }
};
![image.png](attachment:image.png)