# Introduction

**The goal of this project is to grab the data from elasticsearch(from 192.168.70.102), and produce an specified format output for echart so that we can show it on our service. The main function is the cross_search fuction. In that function, we use get_exdata_pass_14_days() to grab the data in pass 14 days, then filter the data based on the parameters that consumers choose, and finally, make it as the format we want.**

# Import libraries

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch
from elasticsearch_dsl import Q
from datetime import datetime
from datetime import timedelta
import pandas as pd
import numpy as np
import json
from itertools import product

# Create fuctions

In [2]:
def connect_ES():
    return Elasticsearch(hosts='192.168.70.102', port=9200,timeout=60)

In [3]:
# This fuction is used to get the data in pass 14 days
def get_exdata_pass_14_days():
    
    # To specify the time range for choosing index
    now = datetime.today()
    now_strft = datetime.strftime(now,"%Y-%m-%d")
    before_strft = datetime.strftime(now-timedelta(14),"%Y-%m-%d")
    now_year = now_strft[2:4]
    before_year = before_strft[2:4]
    now_month = now_strft[5:7]
    before_month = before_strft[5:7]
    inter_index = "ai-alert-internet-20"
    intra_index = "ai-alert-intranet-20"
    if now_year==before_year and now_month==before_month: 
        # If the pass time and today are in the same year and same month
        index_list = [inter_index+f'{now_year}-{now_month}',intra_index+f'{now_year}-{now_month}']
    elif now_year==before_year and now_month!=before_month: 
        # If the pass time and today are in the same year but in different month
        index_list = [inter_index+f'{now_year}-{now_month}',inter_index+f'{now_year}-{before_month}', \
                      intra_index+f'{now_year}-{before_month}',intra_index+f'{now_year}-{before_month}']
    elif now_year!=before_year and now_month!=before_month:
        # If the pass time and today are in both differnent year and different month
        index_list = [inter_index+f'{now_year}-{now_month}',inter_index+f'{before_year}-{before_month}', \
                      intra_index+f'{now_year}-{now_month}',intra_index+f'{before_year}-{before_month}']                
    else:
        print("There are some errors, please try again!")
        
    # Start grabing data
    es = connect_ES() # connect to elasticsearch
    search = Search(using=es, index=index_list) \
            .filter("range", ingest_timestamp = {"gt":f"now-14d/d", "lt":"now"}) \
            .extra(from_=0, size=10000)
    response = search.execute() #Execute the query
    response_list = response.hits.hits
    # Transform those data to dataframe so that we can get the data we need quickly
    data_list = []
    for i in response_list:
        data_list.append(i["_source"].to_dict())
    df = pd.DataFrame(data_list)
    df_clean = df[['alert_id','src_ip','dst_ip']]
    df_clean['dst_ip'] = df_clean['dst_ip'].apply(lambda s : [s] if type(s)!= type(list()) else s)
    return df_clean

In [4]:
def cross_search(order,choice):
    
    #取出前14天的資料
    df = get_exdata_pass_14_days()
    
    #前處理->將dst_ip展開，並drop掉原本dst_ip的column
    span_data = [(i[1],i[2],col_3) for i in df.itertuples() for col_3 in i[3]]
    df = pd.DataFrame(span_data,columns=df.columns)
    df = df[order]
    df.sort_values(order,inplace=True)
    
    #開始根據使用者輸入的參數進行filter->如果沒選擇特定值，則用出現頻率的前10名進行filter
    for o,c in zip(order,choice):
        if c!=None:  #有輸入值的狀況
            df = df.query(f"{o}=='{c}'")
        else:  #沒有輸入值的狀況(取前10)
            top10_lst = [i for i in df[o].value_counts()[:10].index]
            df = df[df[o].isin(top10_lst)]
        
        
    # 開始處理河流圖output
    
    # 分別取出每一欄的值(不重複)並做成list，以便跑回圈使用
    col1_unique = list(set(df[df.columns[0]]))
    col2_unique = list(set(df[df.columns[1]]))
    col3_unique = list(set(df[df.columns[2]]))
    # 處理前兩個column的links->分為source、target及value等3部分，用前兩欄位的所有值組合去跑回圈
    first_two = [{'source' : s, 'target': t, 'value': \
                 len(df.query(f"{df.columns[0]}=='{s}' and {df.columns[1]}=='{t}'"))} \
                 for s,t in product(col1_unique,col2_unique) \
                 #如果有值(有match到)的話在塞進first_two中
                 if df.query(f"{df.columns[0]}=='{s}' and {df.columns[1]}=='{t}'").empty==False]
    # 處理後兩個column的links->分為source、target及value等3部分，用後兩欄位的所有值組合去跑回圈
    last_two = [{'source' : s, 'target': t, 'value': \
                 len(df.query(f"{df.columns[1]}=='{s}' and {df.columns[2]}=='{t}'"))} \
                 for s,t in product(col2_unique,col3_unique) \
                #如果有值(有match到)的話在塞進first_two中
                 if df.query(f"{df.columns[1]}=='{s}' and {df.columns[2]}=='{t}'").empty==False]
    # 組出output : 1.name->依序取出dataframe中所有欄位的值(不重複) 2.links->將兩組links組合
    output = {'data': [{'name':str(i)} for i in list(df.unstack().unique())],\
           'links': first_two+last_two}
    return output

## Function test

In [5]:
cross_search(order=['alert_id','src_ip','dst_ip'],choice=[None,None,None])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{'data': [{'name': 'T0803'},
  {'name': 'T0814'},
  {'name': 'T0817'},
  {'name': 'T0832'},
  {'name': 'T0852'},
  {'name': 'T0859'},
  {'name': 'T0861'},
  {'name': 'T0875'},
  {'name': 'T0885'},
  {'name': 'T0891'},
  {'name': '192.168.70.93'},
  {'name': '192.168.70.95'},
  {'name': '192.168.70.96'},
  {'name': '192.168.70.90'},
  {'name': '192.168.70.94'},
  {'name': '192.168.70.97'},
  {'name': '192.168.70.98'},
  {'name': '192.168.70.99'},
  {'name': '192.168.70.92'},
  {'name': '192.168.70.91'},
  {'name': '192.168.70.81'},
  {'name': '192.168.70.82'},
  {'name': '192.168.70.85'},
  {'name': '192.168.70.87'},
  {'name': '192.168.70.88'},
  {'name': '192.168.70.89'},
  {'name': '192.168.70.80'},
  {'name': '192.168.70.83'},
  {'name': '192.168.70.84'},
  {'name': '192.168.70.86'}],
 'links': [{'source': 'T0861', 'target': '192.168.70.93', 'value': 20},
  {'source': 'T0861', 'target': '192.168.70.90', 'value': 10},
  {'source': 'T0891', 'target': '192.168.70.99', 'value': 10},
  {

# Example : use the output to plot an echart

![image.png](attachment:image.png)