In [1]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import random
from datetime import datetime

In [2]:
# Read the crawler output into `data`.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (which can mis-decode non-ASCII characters on some systems).
with open('data/crawler_results.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
def create_nx_relationships(data):
    """Build an undirected graph linking each business name to its related entities.

    For every item, the business name is the first line of
    ``item['business'][1]['Business Info']['TITLE'][0]``. For each of the entity
    keys listed below, the first line of
    ``item['additional_information']['DRAWER_DETAIL_LIST'][key]`` (when present)
    is connected to that business name by an edge.

    Args:
        data: Iterable of crawler result items (indexable as described above).
            It is traversed exactly once, so one-shot iterables are supported.

    Returns:
        An ``nx.Graph`` whose edges are ``(business name, entity value)`` pairs.

    Raises:
        KeyError, IndexError, TypeError, AttributeError: if an item's business
            title cannot be read. Only a missing or malformed *entity* value is
            treated as "no relationship" and skipped.
    """
    graph = nx.Graph()

    # Keys of DRAWER_DETAIL_LIST whose values become nodes linked to the business.
    entities = ('Owners', 'Owner Name',
                'Commercial Registered Agent', 'Registered Agent')

    # The business name does not depend on the entity key, so resolve it once
    # per item instead of once per (entity, item) pair.
    named_items = [
        (item['business'][1]['Business Info']['TITLE'][0].split('\n')[0], item)
        for item in data
    ]

    # Entity-major iteration keeps the edge insertion order of the graph stable.
    for entity in entities:
        for business_name, item in named_items:
            try:
                info = item['additional_information']['DRAWER_DETAIL_LIST'][entity].split('\n')[0]
            except (KeyError, IndexError, TypeError, AttributeError):
                # This item has no usable value for this entity key; skip it.
                continue
            graph.add_edge(business_name, info)

    return graph

# Build the business <-> entity graph from the loaded crawler results.
graph = create_nx_relationships(data)

## Node with Greatest Number of Connections

In [4]:
"""
Find the node that has the most connections.

Source: https://stackoverflow.com/questions/44532952/find-number-of-connected-edges-to-a-node-and-node-with-max-connected-edges
graph.degree() is a 'DegreeView' object that yields (node, degree) pairs,
so the pair with the largest second element is the node with the most edges.
"""
max(graph.degree(), key=lambda node_and_degree: node_and_degree[1])

('CORPORATION SERVICE COMPANY', 18)

## Nodes with Connections Greater than N

In [13]:
# Print every (node, degree) pair whose degree is strictly greater than N.
N = 2
for node, degree in graph.degree():
    if degree > N:
        print((node, degree))

('X-RAY LIMA TANGO, LLC', 4)
('REGISTERED AGENT SOLUTIONS, INC.', 6)
('C T CORPORATION SYSTEM', 14)
('CORPORATION SERVICE COMPANY', 18)
('INCORP SERVICES, INC.', 10)
('NORTHWEST REGISTERED AGENT SERVICE, INC.', 3)
('TANNER  COLLETTE', 3)


## Lists of Nodes in Each Connected Component with More Than N Nodes

In [12]:
# Size threshold (exclusive): keep connected components with more than N nodes.
N = 3

# One list of node names per connected component that has more than N nodes.
# Nothing is drawn here, so no layout, date stamp, or random seed is needed.
nodes = [
    list(component.nodes())
    for component in (graph.subgraph(c) for c in nx.connected_components(graph))
    if len(component.nodes()) > N
]

nodes

[['XLT Pack and Ship',
  'XLT Transport',
  'XLT TAXI',
  'X-RAY LIMA TANGO, LLC',
  'CORY  KUNERTH'],
 ['XPT Specialty',
  'XCHANGE BENEFITS, LLC',
  '3H AGENT SERVICES, INC.',
  'XPT Partners, LLC'],
 ['Xcel Holdings LLC',
  'Xcel Automation',
  'TANNER  COLLETTE',
  'Xcel Aviation LLC',
  'Xcel Automation & Electric LLC'],
 ['XCLOUD, LLC',
  'X S BROKERS INSURANCE AGENCY, INC.',
  'Xencom Energy Management, LLC',
  'XN FINANCIAL SERVICES INC.',
  'REGISTERED AGENT SOLUTIONS, INC.',
  'XPO Logistics Freight, Inc.',
  'XETA TECHNOLOGIES, INC.'],
 ['XCEL ERECTORS, INC.',
  'XCED AVIATION SERVICES, LLC',
  'XCL Marketing, LLC',
  'XYLEM WATER SOLUTIONS U.S.A., INC.',
  'XCL AssetCo, LLC',
  'Xelix Distribution, Inc.',
  'XCL Resources, LLC',
  'Xeris Pharmaceuticals, Inc.',
  'C T CORPORATION SYSTEM',
  'XYLEM DEWATERING SOLUTIONS, INC.',
  'XCL RoyaltyCo, LLC',
  'XO COMMUNICATIONS SERVICES, LLC',
  'Xigent Solutions, LLC',
  'XLHome, P.C. (dba Home Health Assessments of North Dakota, 

In [8]:
def create_connection_graph(graph, node_count: int, labels=False):
    """Draw the larger connected components of ``graph`` and save them as a PNG.

    Every connected component with strictly more than ``node_count`` nodes is
    drawn, with all nodes of a component sharing one colour. The figure is
    titled with today's date and the number of components drawn, written to
    ``entity_connections_graph_{node_count}_node_minimum_with_labels_{labels}.png``
    in the current working directory, and then closed.

    Args:
        graph: The networkx graph to visualise.
        node_count: Size threshold (exclusive); components with this many nodes
            or fewer are not drawn.
        labels: When true, node labels are drawn and a larger canvas is used.

    Returns:
        None. The result is the image file written to disk.
    """
    date_today = datetime.today().strftime('%Y-%m-%d')
    random.seed(10)  # fixed seed so the colour sequence is the same on every run

    # Labelled plots use a much larger canvas than unlabelled ones.
    figure_size = (35, 35) if labels else (10, 10)
    figure = plt.figure(1, figsize=figure_size)

    try:
        # X,Y coordinates for every node, computed once for the whole graph.
        pos = nx.nx_agraph.graphviz_layout(graph, prog="neato")

        drawn_components = 0
        for component_nodes in nx.connected_components(graph):
            component = graph.subgraph(component_nodes)
            # A colour value is generated for every component, including those
            # skipped below, so the seeded colour sequence does not shift when
            # node_count changes.
            colours = [random.random()] * nx.number_of_nodes(component)
            if len(component.nodes()) > node_count:
                nx.draw(component, pos, node_size=100, node_color=colours, vmin=0.0, vmax=1.0,
                        with_labels=labels, bbox=dict(facecolor="whitesmoke"))
                drawn_components += 1

        # The counter tracks drawn connected components, not edges.
        plt.title(
            f'Graph Generated on {date_today}. \n There are {drawn_components} Connected Components with a Node Count Greater than {node_count}.')
        plt.savefig(
            f'entity_connections_graph_{node_count}_node_minimum_with_labels_{labels}.png',
            bbox_inches='tight')
    finally:
        # Always release the figure, even if layout, drawing, or saving fails.
        plt.close(figure)

In [9]:
# Rebuild the graph, then render one image per (minimum node count, labels) setting.
graph = create_nx_relationships(data)
for min_nodes, show_labels in ((1, False), (2, False), (2, True)):
    create_connection_graph(graph, min_nodes, labels=show_labels)