In [7]:
import networkx as nx
from pathlib import Path
from utils import read_json
import pickle

# Load the JSON data

In [2]:
# Locations of the two raw exports, relative to the notebook's working directory.
entities_path = Path("../data/littlesis-entities.json")
relationships_path = Path("../data/littlesis-relationships.json")

# Parse both files. Later cells iterate over these collections and index each
# record by "id" / "attributes".
entities = read_json(entities_path)
relationships = read_json(relationships_path)

# Helper functions

In [3]:
def print_graph_info(G: nx.Graph) -> None:
    """Print a short structural summary of a graph.

    The report contains, in order:

    * the number of nodes and edges,
    * the number of connected components and, for up to the three largest
      ones, their size as a percentage of all nodes,
    * the ten nodes with the highest PageRank (default ``nx.pagerank``
      parameters), each with its ``name`` and ``primary_ext`` node attributes.

    Args:
        G: Graph to summarise. The connected-component functions used here are
            defined by NetworkX for undirected graphs. Nodes are expected to
            carry ``name`` and ``primary_ext`` attributes; a node without them
            (e.g. one created implicitly by ``add_edge``) is printed as "n/a"
            instead of raising ``KeyError``.

    Returns:
        None. Everything is written to stdout.
    """
    num_nodes = G.number_of_nodes()
    print("Number of nodes: ", num_nodes)
    print("Number of edges: ", G.number_of_edges())
    print()

    # Enumerate the components once and reuse the list for both the count and
    # the sizes; each enumeration is a full traversal of the graph.
    connected_components = sorted(nx.connected_components(G), key=len, reverse=True)
    print("Number of connected components: ", len(connected_components))
    three_largest_components = connected_components[0:3]
    for i, component in enumerate(three_largest_components):
        print(
            f"{i+1}. Size of the connected component: {(len(component)/num_nodes)*100:.2f}%"
        )

    print()

    # Top ten nodes based on PageRank. sorted() is stable, so nodes with equal
    # scores keep the order in which pagerank returned them.
    pr = nx.pagerank(G)
    top_ten_pagerank = sorted(pr, key=pr.get, reverse=True)[:10]
    print("Top 10 nodes based on pagerank:")
    for i, node in enumerate(top_ten_pagerank):
        # Nodes added implicitly through an edge have no attributes at all.
        attrs = G.nodes[node]
        print(
            f"#{i+1}\t: nodeid ({node}), pagerank ({pr[node]:.6f}) -> {attrs.get('name', 'n/a')} ({attrs.get('primary_ext', 'n/a')})"
        )

# Original littlesis graph

In [4]:
# Assemble one undirected graph: every entity becomes a node and every
# relationship becomes an edge between its two entities.
#
# NOTE(review): nodes are keyed by the top-level entity["id"], whereas edges
# refer to attributes["entity1_id"] / attributes["entity2_id"]. This assumes
# both carry the same value and type -- TODO confirm against the data.
# NOTE: nx.Graph stores a single edge per node pair, so when several
# relationships link the same two entities only the attributes of the last one
# processed are kept.
G = nx.Graph()

# Nodes: (key, attribute-dict) pairs.
G.add_nodes_from(
    (
        entity["id"],
        {key: entity["attributes"][key] for key in ("id", "name", "primary_ext")},
    )
    for entity in entities
)

# Edges: (endpoint, endpoint, attribute-dict) triples.
G.add_edges_from(
    (
        relationship["attributes"]["entity1_id"],
        relationship["attributes"]["entity2_id"],
        {key: relationship["attributes"][key] for key in ("category_id", "amount")},
    )
    for relationship in relationships
)

In [5]:
# Report size, connectivity and the PageRank leaders of the unfiltered graph.
summary_title = "Graph with all entities and relationships:"
print(summary_title)
print_graph_info(G)

Graph with all entities and relationships:
Number of nodes:  413057
Number of edges:  1685094

Number of connected components:  51414
1. Size of the connected component: 85.83%
2. Size of the connected component: 0.02%
3. Size of the connected component: 0.02%

Top 10 nodes based on pagerank:
#1	: nodeid (13503), pagerank (0.002785) -> Barack Obama (Person)
#2	: nodeid (28862), pagerank (0.001540) -> Democratic National Committee (Org)
#3	: nodeid (13191), pagerank (0.001480) -> Hillary Clinton (Person)
#4	: nodeid (34136), pagerank (0.001431) -> Mitt Romney (Person)
#5	: nodeid (28776), pagerank (0.001403) -> George W Bush (Person)
#6	: nodeid (28778), pagerank (0.001264) -> Republican National Committee (Org)
#7	: nodeid (13377), pagerank (0.001216) -> John Kerry (Person)
#8	: nodeid (13443), pagerank (0.001096) -> John S. McCain III (Person)
#9	: nodeid (88818), pagerank (0.000956) -> Eni SpA (Org)
#10	: nodeid (12884), pagerank (0.000879) -> US House of Representatives (Org)


# Only inspecting the biggest connected component

In [6]:
# Create a subgraph with the largest connected component.
# Components are ordered from largest to smallest, so index 0 is the biggest.
connected_components = sorted(nx.connected_components(G), key=len, reverse=True)
largest_component: set = connected_components[0]
# G.subgraph() only returns a read-only *view* that keeps a reference to the
# whole of G and filters it on every access. .copy() materialises an
# independent graph, so anything that later analyses or serialises G_largest
# works on (and stores) just the largest component.
G_largest = G.subgraph(largest_component).copy()
print("Created the full graph that has the following information:")
print_graph_info(G_largest)

Created the full graph that has the following information:
Number of nodes:  354538
Number of edges:  1677720

Number of connected components:  1
1. Size of the connected component: 100.00%

Top 10 nodes based on pagerank:
#1	: nodeid (13503), pagerank (0.004063) -> Barack Obama (Person)
#2	: nodeid (88818), pagerank (0.002455) -> Eni SpA (Org)
#3	: nodeid (28862), pagerank (0.002351) -> Democratic National Committee (Org)
#4	: nodeid (13191), pagerank (0.002178) -> Hillary Clinton (Person)
#5	: nodeid (34136), pagerank (0.002136) -> Mitt Romney (Person)
#6	: nodeid (28776), pagerank (0.002134) -> George W Bush (Person)
#7	: nodeid (28778), pagerank (0.001885) -> Republican National Committee (Org)
#8	: nodeid (13377), pagerank (0.001865) -> John Kerry (Person)
#9	: nodeid (13443), pagerank (0.001673) -> John S. McCain III (Person)
#10	: nodeid (28856), pagerank (0.001297) -> Democratic Senatorial Campaign Committee (Org)


# Save the largest connected component as a pickle

In [8]:
# Write the graph inside a context manager so the file handle is flushed and
# closed deterministically, even if pickling raises part-way through.
with open("../data/littlesis-largest-component.pickle", "wb") as pickle_file:
    pickle.dump(G_largest, pickle_file)