In [176]:
import networkx as nx
import csv
from collections import deque


In [177]:
ImportanceNodes = {
    'Domain': 3,
    'IP': 3,
    'Cert': 3,
    'Whois_Name': 2,
    'Whois_Phone': 2,
    'Whois_Email': 2,
    'IP_C': 1,
    'ASN': 1,
}
ImportanceLinks = {
    'r_cert': 4,
    'r_subdomain': 4,
    'r_request_jump': 4,
    'r_dns_a': 4,
    'r_whois_name': 3,
    'r_whois_email': 3,
    'r_whois_phone': 3,
    'r_cert_chain': 2,
    'r_cname': 2,
    'r_asn': 1,
    'r_cidr': 1,
}


def read_in_all_graph():  # 用函数包裹一下，避免可能的变量污染
    graph_all = nx.Graph()  # undirected graph
    # graph_all = nx.DiGraph()  # directed graph
    with open('data/Node.csv', 'r', encoding='utf-8') as nodes:
        nodes.readline()  # get rid of first line
        reader = csv.reader(nodes)
        for data in reader:
            graph_all.add_node(
                data[0],
                name=data[1],
                type=data[2],
                industry=data[3],
                classified=False,  # 是否已经分到一个group里
                is_core_node=False,  # 是否是核心节点
                importance=ImportanceNodes[data[2]],  # 重要性，1-3
            )

    with open('data/Link.csv', 'r', encoding='utf-8') as links:
        links.readline()  # get rid of first line
        reader = csv.reader(links)
        for data in reader:
            graph_all.add_edge(
                data[1],  # source
                data[2],  # target
                relation=data[0],
                importance=ImportanceLinks[data[0]],
            )

    return graph_all


In [178]:
graph_all = read_in_all_graph()
print("graph_all loading complete!")


graph_all loading complete!


### First group

In [179]:
init_id = [
    'Domain_c58c149eec59bb14b0c102a0f303d4c20366926b5c3206555d2937474124beb9',
    'Domain_f3554b666038baffa5814c319d3053ee2c2eb30d31d0ef509a1a463386b69845',
]

subgraph_id = set()
mining_depth = 2

min_num_edges = 7  # judge core node
max_num_neighbors = 30  # visualize network


In [180]:
for id in init_id:
    subgraph_id.add(id)

    # BFS search
    stack = deque([id])
    level = 0  # level < mining_depth
    while stack:
        for _ in range(len(stack)):
            node_id = stack.popleft()

            # get all neighbors
            neighbors = list(graph_all.neighbors(node_id))  # 迭代器转为list，重复遍历
            # judge if is core node
            if graph_all.nodes[node_id]['importance'] == 3:
                if len(neighbors) >= min_num_edges:
                    weak_edges = 0  # 50%以上的邻边关联强度较弱的网络资产不被认为是核心网络资产
                    num_ip = 0  # 同时关联2个以上IP地址的Domain网络资产不被认为是核心网络资产
                    for neighbor in neighbors:
                        if graph_all.nodes[neighbor]['type'] == 'IP':
                            num_ip += 1
                        if graph_all.edges[node_id, neighbor]['importance'] == 1:
                            weak_edges += 1

                    if num_ip < 2 and weak_edges / len(neighbors) < 0.5:  # 核心资产
                        graph_all.nodes[node_id]['is_core_node'] = True

            # visualize the subgraph
            num_neighbors = 0  # if num > max_num_neighbors, stop
            for neighbor in neighbors:
                if neighbor not in subgraph_id:
                    subgraph_id.add(neighbor)
                    graph_all.nodes[neighbor]['classified'] = True  # 分到一个group里
                    stack.append(neighbor)
                    num_neighbors += 1

                if num_neighbors > max_num_neighbors:
                    break

        # level order
        level += 1
        if level >= mining_depth:
            break

len(subgraph_id)


181

In [181]:
for id in subgraph_id:
    if graph_all.nodes[id]['is_core_node']:
        print(id)
        # graph_all.nodes[id]['is_core_node'] = False


Domain_807a8a1a5282607c2063a5b0365268b25f0f62d61741a3e25244840595f10dc3
Cert_fe794a69eacd63b21245bf4eda826222fc6c5862bebf77aa05459cb308cfd063
Domain_c58c149eec59bb14b0c102a0f303d4c20366926b5c3206555d2937474124beb9
IP_94fb4d47d3920b6a5b74a8ce9e304377460fdffdf6582eca97eda2037bbe0b47
Domain_61befc7014010dffed9239b8a133396e1a88fa822703b677b123268eb16be3f7


### Second group

In [182]:
init_id = [
    'IP_400c19e584976ff2a35950659d4d148a3d146f1b71692468132b849b0eb8702c',
    'Domain_b10f98a9b53806ccd3a5ee45676c7c09366545c5b12aa96955cde3953e7ad058',
]

subgraph_id = set()
mining_depth = 3

min_num_edges = 10  # judge core node
max_num_neighbors = 40  # visualize network


In [183]:
for id in init_id:
    subgraph_id.add(id)

    # BFS search
    stack = deque([id])
    level = 0  # level < mining_depth
    while stack:
        for _ in range(len(stack)):
            node_id = stack.popleft()

            # get all neighbors
            neighbors = list(graph_all.neighbors(node_id))  # 迭代器转为list，重复遍历
            # judge if is core node
            if graph_all.nodes[node_id]['importance'] == 3:
                if len(neighbors) >= min_num_edges:
                    weak_edges = 0  # 50%以上的邻边关联强度较弱的网络资产不被认为是核心网络资产
                    num_ip = 0  # 同时关联2个以上IP地址的Domain网络资产不被认为是核心网络资产
                    for neighbor in neighbors:
                        if graph_all.nodes[neighbor]['type'] == 'IP':
                            num_ip += 1
                        if graph_all.edges[node_id, neighbor]['importance'] == 1:
                            weak_edges += 1

                    if num_ip < 2 and weak_edges / len(neighbors) < 0.5:  # 核心资产
                        graph_all.nodes[node_id]['is_core_node'] = True

            # visualize the subgraph
            num_neighbors = 0  # if num > max_num_neighbors, stop
            for neighbor in neighbors:
                if neighbor not in subgraph_id:
                    subgraph_id.add(neighbor)
                    graph_all.nodes[neighbor]['classified'] = True  # 分到一个group里
                    stack.append(neighbor)
                    num_neighbors += 1

                if num_neighbors > max_num_neighbors:
                    break

        # level order
        level += 1
        if level >= mining_depth:
            break

len(subgraph_id)


735

In [184]:
for id in subgraph_id:
    if graph_all.nodes[id]['is_core_node']:
        print(id)
        # graph_all.nodes[id]['is_core_node'] = False


IP_8c2235f7cb908ecc15ac1f0e612710ad8023f043bd5849b43368775c51473c1a
IP_2493d10250d8cfb277bb6217bd48e2863d4a1bf81fb6cc9e755f63a88a349f32
Cert_c992a7d7f01fae6098d8f1ba358002074db1b977cceafc07c04b40e657ec0425
Domain_e01c17f42e1a41199b7811c6effb0caed00040c008ac6e0b644501b254da767a
IP_f9b588fa3410ab89fa0e50b011c9ac8ddfa4a3125ea3df13fa4598faa5e15f8a
IP_dcf3630d109e25e31f4ec590e83509d90a95874ed53e32914e0eb2f4b56e6639
IP_543d6eb907353987e07d47452da5fdad313cc8adb8cf03943274664607f3c828
IP_36b2ba5b0800d154ef3add5672b7561af9535edd92d2c3323c64880498b45a05
IP_13d40ae506b3217b4f51db8809664bc5b01eae4ea696ed85821ebe9b8a9d1117
Domain_8659e9de39a88dc208eae9c4eab0791afd0406142fd7220cac3e7793dc802a43
IP_80f8de0a43fddb5a51f3d395fc56dd1b50f78cc1d853449918d200fbe9baaea0
IP_87221ece371f1ef485addd26120c644808f4fb84f1088124d4a40bcc36468afb
IP_f36d5f4bcff7228569d004693e08d93cf9a7214bbf163b42c98f62de5c62febe
IP_c75ff0eef6197b346ed7b7a12b20cf8f526ee56eda23c19281186349f3004128
IP_d8445ec5260aaece527bc20643af2ef4cb5

### Third group

In [185]:
init_id = [
    'IP_7e730b193c2496fc908086e8c44fc2dbbf7766e599fabde86a4bcb6afdaad66e',
    'Cert_6724539e5c0851f37dcf91b7ac85cb35fcd9f8ba4df0107332c308aa53d63bdb',
]

subgraph_id = set()
mining_depth = 3


In [186]:

min_num_edges = 10  # judge core node
max_num_neighbors = 30  # visualize network


In [187]:
for id in init_id:
    subgraph_id.add(id)

    # BFS search
    stack = deque([id])
    level = 0  # level < mining_depth
    while stack:
        for _ in range(len(stack)):
            node_id = stack.popleft()

            # get all neighbors
            neighbors = list(graph_all.neighbors(node_id))  # 迭代器转为list，重复遍历
            # judge if is core node
            if graph_all.nodes[node_id]['importance'] == 3:
                if len(neighbors) >= min_num_edges:
                    weak_edges = 0  # 50%以上的邻边关联强度较弱的网络资产不被认为是核心网络资产
                    num_ip = 0  # 同时关联2个以上IP地址的Domain网络资产不被认为是核心网络资产
                    for neighbor in neighbors:
                        if graph_all.nodes[neighbor]['type'] == 'IP':
                            num_ip += 1
                        if graph_all.edges[node_id, neighbor]['importance'] == 1:
                            weak_edges += 1

                    if num_ip < 2 and weak_edges / len(neighbors) < 0.5:  # 核心资产
                        graph_all.nodes[node_id]['is_core_node'] = True

            # visualize the subgraph
            num_neighbors = 0  # if num > max_num_neighbors, stop
            for neighbor in neighbors:
                if neighbor not in subgraph_id:
                    subgraph_id.add(neighbor)
                    graph_all.nodes[neighbor]['classified'] = True  # 分到一个group里
                    stack.append(neighbor)
                    num_neighbors += 1

                if num_neighbors > max_num_neighbors:
                    break

        # level order
        level += 1
        if level >= mining_depth:
            break

len(subgraph_id)


2617

In [188]:
for id in subgraph_id:
    if graph_all.nodes[id]['is_core_node']:
        print(id)
        # graph_all.nodes[id]['is_core_node'] = False


IP_e3ad5a4d92371d8a7131a64cbc99eac605e747eeb1f9768d3406dbb8ae05dccb
IP_6fb775ff1d42248cec2de73b3c0469e81bd1c11751f9801b7b69a0bcc12deec8
Domain_bef7711a775534636a7cdae48a0f9d7604c5dadcf31f32f6117d6db4b00555ac
IP_ac1ccc9721107587b330cae0ef1446269a02ce708189dccca916e4d6a01bb4d1
IP_93a303ba531ddbe61085d9cfb4d404e874c0634939a1dd5a8435646c8b82cf1f
IP_7b9954624b78d63b870d3d03daea2c89620ce866901c3affc07554fde1aa5f7f
IP_e194cdba3af1ca142453788c2fa3a6ceb2ab920bbcdf670e4399d97b798d0545
IP_fd274aab417beb2bbd649cce02bd98f38483b40fbc781747cdeac4a83bb0a58c
IP_4ea0cbe0ae9dc770e66a956fbe088134aa68b55b43ad9b1f60a7c66ae661b763
IP_ba509acc37b1a4ac6687b2f79ea6188b8dfcc09c43e3d03c0209bf7a35d74569
IP_c8c1b9572d1097792254051a91fc2b667411b1dbefd721a603c6fa99bbe1d593
IP_48cc8fb4cbae8e075772452280942726e4005062f2006eba0f8a5e97307a2e44
IP_207185a77262f6b5c76ac215b948550372574b92a3effcfe05ed5ef3c214eace
IP_ea400d44363c5f7bf0eeea84877353fbaf386305c0336a0367ffd9069b2982ad
IP_a2ebd3c022a5320f6ee011c272706c6fc4e702898