# Create Network Graph


Basic setup


In [1]:
import pandas as pd
import numpy as np
from pyvis.network import Network
import networkx as nx
import seaborn as sns

In [4]:
import os

# Print current working directory
print("Current working directory:", os.getcwd())

# Check if directory exists
print("Directory exists:", os.path.exists("data_output/MediumArticles"))

# List contents of data_output directory
print("\nContents of data_output:")
print(os.listdir("data_output"))

# If MediumArticles exists, list its contents
if os.path.exists("data_output/MediumArticles"):
    print("\nContents of MediumArticles:")
    print(os.listdir("data_output/MediumArticles"))

Current working directory: /home/azureuser/knowledge_graph/old_notebooks
Directory exists: False

Contents of data_output:


FileNotFoundError: [Errno 2] No such file or directory: 'data_output'

Loading previously calculated dataframes


In [5]:
dataframe_dir = "MediumArticles"
# df = pd.read_csv(f"./data_output/{dataframe_dir}/chunks.csv", sep="|")
df_concepts = pd.read_csv(f"../data_output/{dataframe_dir}/concepts.csv", sep="|")
print(df_concepts.shape)
df_concepts.head()

(359, 5)


Unnamed: 0,entity,importance,category,chunk_id,type
0,interoperability,5,concept,bb894d4a987241c6b7827d4acabac6dc,concept
1,introduction,3,event,bb894d4a987241c6b7827d4acabac6dc,concept
2,unique charging port design,4,misc,ebcfb9727a42482d986fa2b0a64fa8ac,concept
3,usb c,5,object,ebcfb9727a42482d986fa2b0a64fa8ac,concept
4,mobile phone model,3,object,ebcfb9727a42482d986fa2b0a64fa8ac,concept


## Calculate Graph Dataframe


Graph dataframe is a dataframe where every row is a connection between two nodes.

It is basically an inner self join of the nodes dataframe


In [6]:
dfne_join = pd.merge(
    df_concepts, df_concepts, how="inner", on="chunk_id", suffixes=("_L", "_R")
)

## Remove self Loops
self_loops_drop = dfne_join[dfne_join["entity_L"] == dfne_join["entity_R"]].index
dfg = dfne_join.drop(index=self_loops_drop).reset_index()

## This is our graph dataframe
print("Total number of nodes = ", dfg.shape[0])
dfg.head()

Total number of nodes =  2570


Unnamed: 0,index,entity_L,importance_L,category_L,chunk_id,type_L,entity_R,importance_R,category_R,type_R
0,1,interoperability,5,concept,bb894d4a987241c6b7827d4acabac6dc,concept,introduction,3,event,concept
1,2,introduction,3,event,bb894d4a987241c6b7827d4acabac6dc,concept,interoperability,5,concept,concept
2,5,unique charging port design,4,misc,ebcfb9727a42482d986fa2b0a64fa8ac,concept,usb c,5,object,concept
3,6,unique charging port design,4,misc,ebcfb9727a42482d986fa2b0a64fa8ac,concept,mobile phone model,3,object,concept
4,7,unique charging port design,4,misc,ebcfb9727a42482d986fa2b0a64fa8ac,concept,phone companies,4,organisation,concept


#### Clean the graph dataframe


The original graph dataframe is too big to visualise. So we will make another dataframe for visualisation purpose.

-   remove the less important nodes
-   remove less important edges


In [7]:
## Drop nodes which are less important
less_important_nodes = dfg[(dfg["importance_L"] < 2)].index
## Drop edges where both the nodes are less important than 5
less_important_edges = dfg[(dfg["importance_L"] < 2) & (dfg["importance_R"] < 2)].index
drops = less_important_nodes.union(less_important_edges)

print(
    "Less important Nodes = ",
    less_important_nodes.shape[0],
    "\nLess Important Edges = ",
    less_important_edges.shape[0],
)

## Remove these rows from the graph dataframe
dfg_vis = dfg.drop(index=drops).reset_index()


Less important Nodes =  29 
Less Important Edges =  2


### Combine similar edges

Group the edges between the same nodes and combine them into single edge with its weight equal to the count. 

In [8]:

## Group and aggregate edges.
dfg_vis = (
    dfg_vis.groupby(["entity_L", "entity_R"])
    .agg(
        {
            "importance_L": "mean",
            "importance_R": "mean",
            "chunk_id": [",".join, "count"],
        }
    )
    .reset_index()
)
dfg_vis.columns = [
    "entity_L",
    "entity_R",
    "importance_L",
    "importance_R",
    "chunks",
    "count",
]

print("Final Number of Edges in the Visualisation Graph = ", dfg_vis.shape[0])
dfg_vis.head()

Final Number of Edges in the Visualisation Graph =  2481


Unnamed: 0,entity_L,entity_R,importance_L,importance_R,chunks,count
0,10000 w 100th ave,35200,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
1,10000 w 100th ave,35209,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
2,10000 w 100th ave,99def,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
3,10000 w 100th ave,adt_a01,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
4,10000 w 100th ave,al,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1


### Removing overconnected nodes

These are featured in the header and the footer of the pdf file, so they are a little too connected.


In [9]:
ind = dfg_vis[
    dfg_vis["entity_L"].isin(
        ["Pathways to Health Equity for the G20", "Accelerating Global Health"]
    )
    | dfg_vis["entity_R"].isin(
        ["Pathways to Health Equity for the G20", "Accelerating Global Health"]
    )
].index
dfg_vis.drop(index=ind, axis=1, inplace=True)
print("Final Number of Edges  = ", dfg_vis.shape[0], "\nDropped edges:", len(ind))
dfg_vis.head()

Final Number of Edges  =  2481 
Dropped edges: 0


Unnamed: 0,entity_L,entity_R,importance_L,importance_R,chunks,count
0,10000 w 100th ave,35200,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
1,10000 w 100th ave,35209,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
2,10000 w 100th ave,99def,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
3,10000 w 100th ave,adt_a01,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1
4,10000 w 100th ave,al,2.0,2.0,45a6201014f84cc7bcb7aea24a7f7af6,1


## Creating a NetworkX Graph


Calculate nodes

Here I am grouping the graph dataframe by left node and calculating the mean importance. This way we will end up with only the unique nodes from the graph dataframe along with their weights.


In [10]:
# nodes = df_graph["entity_L"].unique()
nodes = dfg_vis.groupby(["entity_L"]).agg({"importance_L": "mean"}).reset_index()
nodes.head()

Unnamed: 0,entity_L,importance_L
0,10000 w 100th ave,2.0
1,2010,4.0
2,30-year-old standard,4.0
3,35200,2.0
4,35209,2.0


Build a NetworkX object with nodes and edges


In [11]:
G = nx.Graph()
for index, row in nodes.iterrows():
    G.add_node(row["entity_L"])

for index, row in dfg_vis.iterrows():
    G.add_edge(str(row["entity_L"]), str(row["entity_R"]))

### Community Detection


Detect communities using the Girvan Newman algorithm


In [12]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))

Number of Communities =  27


Add colors to nodes based on community


In [13]:
palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"entity_L": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)

df_nodes_colors = pd.merge(
    nodes, colors, how="left", on="entity_L", suffixes=("_N", "_C")
)
# nodes.head()
df_nodes_colors.head()

Unnamed: 0,entity_L,importance_L,color,group
0,10000 w 100th ave,2.0,#db576c,1
1,2010,4.0,#db578a,2
2,30-year-old standard,4.0,#db57a7,3
3,35200,2.0,#db576c,1
4,35209,2.0,#db576c,1


So now we have a nodes dataframe with colors and sizes of each node.

lets recreate our graph.


In [14]:
G = nx.Graph()
node_size_multiple = 6

for index, row in df_nodes_colors.iterrows():
    G.add_node(
        row["entity_L"],
        size=row["importance_L"] * node_size_multiple,
        title=row["entity_L"],
        color=row["color"],
    )

for index, row in dfg_vis.iterrows():
    G.add_edge(
        str(row["entity_L"]),
        str(row["entity_R"]),
        weight=row["count"],
        name=row["chunks"],
    )

## Visualisation


In [17]:
from pyvis.network import Network
import networkx as nx
import os

# Create a directory to store the output if it doesn't exist
output_dir = "data_output/graph_visualization"
os.makedirs(output_dir, exist_ok=True)

# Setup network
net = Network(
    bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    font_color="#cccccc",
)

# Add the graph
net.from_nx(G)

# Set physics options
net.repulsion(node_distance=150, spring_length=400)
net.show_buttons(filter_=["physics"])

# Save the graph to the output directory
graph_file = os.path.join(output_dir, "knowledge_graph.html")
net.save_graph(graph_file)

print(f"Graph has been saved to: {graph_file}")

# Optional: Save the graph data for later use
import json
graph_data = {
    'nodes': [{'id': node, 'label': node} for node in G.nodes()],
    'edges': [{'from': u, 'to': v} for u, v in G.edges()]
}
json_file = os.path.join(output_dir, "graph_data.json")
with open(json_file, 'w') as f:
    json.dump(graph_data, f)

print(f"Graph data has been saved to: {json_file}")

Graph has been saved to: data_output/graph_visualization/knowledge_graph.html
Graph data has been saved to: data_output/graph_visualization/graph_data.json


Unable to connect to VS Code server: Error in request.
Error: connect ECONNREFUSED /run/user/1000/vscode-ipc-760f02fd-a576-4679-95e5-cf22f068d87d.sock
[90m    at PipeConnectWrap.afterConnect [as oncomplete] (node:net:1607:16)[39m {
  errno: [33m-111[39m,
  code: [32m'ECONNREFUSED'[39m,
  syscall: [32m'connect'[39m,
  address: [32m'/run/user/1000/vscode-ipc-760f02fd-a576-4679-95e5-cf22f068d87d.sock'[39m
}
