# Create Network Graph


Basic setup


In [1]:
import pandas as pd
import numpy as np
from pyvis.network import Network
import networkx as nx
import seaborn as sns

Loading previously calculated dataframes


In [2]:
dataframe_dir = "MediumArticles"
# df = pd.read_csv(f"./data_output/{dataframe_dir}/chunks.csv", sep="|")
df_concepts = pd.read_csv(f"./data_output/concepts.csv", sep="|")
print(df_concepts.shape)
df_concepts.head()

(814, 5)


Unnamed: 0,entity,importance,category,chunk_id,type
0,project management institute,4.0,organisation,ce38c53d7b4440198db4d6f8ed8a2555,concept
1,practice standard for project risk management,5.0,document,ce38c53d7b4440198db4d6f8ed8a2555,concept
2,identify risks process,4.0,process,ce38c53d7b4440198db4d6f8ed8a2555,concept
3,early identification,3.0,concept,ce38c53d7b4440198db4d6f8ed8a2555,concept
4,iterative identification,3.0,concept,ce38c53d7b4440198db4d6f8ed8a2555,concept


## Calculate Graph Dataframe


Graph dataframe is a dataframe where every row is a connection between two nodes.

It is basically an inner self join of the nodes dataframe


In [3]:
dfne_join = pd.merge(
    df_concepts, df_concepts, how="inner", on="chunk_id", suffixes=("_L", "_R")
)

## Remove self Loops
self_loops_drop = dfne_join[dfne_join["entity_L"] == dfne_join["entity_R"]].index
dfg = dfne_join.drop(index=self_loops_drop).reset_index()

## This is our graph dataframe
print("Total number of nodes = ", dfg.shape[0])
dfg.head()

Total number of nodes =  7540


Unnamed: 0,index,entity_L,importance_L,category_L,chunk_id,type_L,entity_R,importance_R,category_R,type_R
0,1,project management institute,4.0,organisation,ce38c53d7b4440198db4d6f8ed8a2555,concept,practice standard for project risk management,5.0,document,concept
1,2,project management institute,4.0,organisation,ce38c53d7b4440198db4d6f8ed8a2555,concept,identify risks process,4.0,process,concept
2,3,project management institute,4.0,organisation,ce38c53d7b4440198db4d6f8ed8a2555,concept,early identification,3.0,concept,concept
3,4,project management institute,4.0,organisation,ce38c53d7b4440198db4d6f8ed8a2555,concept,iterative identification,3.0,concept,concept
4,5,project management institute,4.0,organisation,ce38c53d7b4440198db4d6f8ed8a2555,concept,emergent identification,3.0,concept,concept


#### Clean the graph dataframe


The original graph dataframe is too big to visualise. So we will make another dataframe for visualisation purpose.

-   remove the less important nodes
-   remove less important edges


In [4]:
## Drop nodes which are less important
less_important_nodes = dfg[(dfg["importance_L"] < 2)].index
## Drop edges where both the nodes are less important than 5
less_important_edges = dfg[(dfg["importance_L"] < 2) & (dfg["importance_R"] < 2)].index
drops = less_important_nodes.union(less_important_edges)

print(
    "Less important Nodes = ",
    less_important_nodes.shape[0],
    "\nLess Important Edges = ",
    less_important_edges.shape[0],
)

## Remove these rows from the graph dataframe
dfg_vis = dfg.drop(index=drops).reset_index()


Less important Nodes =  4 
Less Important Edges =  0


### Combine similar edges

Group the edges between the same nodes and combine them into single edge with its weight equal to the count. 

In [5]:

## Group and aggregate edges.
dfg_vis = (
    dfg_vis.groupby(["entity_L", "entity_R"])
    .agg(
        {
            "importance_L": "mean",
            "importance_R": "mean",
            "chunk_id": [",".join, "count"],
        }
    )
    .reset_index()
)
dfg_vis.columns = [
    "entity_L",
    "entity_R",
    "importance_L",
    "importance_R",
    "chunks",
    "count",
]

print("Final Number of Edges in the Visualisation Graph = ", dfg_vis.shape[0])
dfg_vis.head()

Final Number of Edges in the Visualisation Graph =  7274


Unnamed: 0,entity_L,entity_R,importance_L,importance_R,chunks,count
0,4.3 tools and techniques for the plan risk man...,4.3.1 planning sessions,5.0,4.0,0994a79d38794524b766b6c120c06a3b,1
1,4.3 tools and techniques for the plan risk man...,corporate governance processes,5.0,3.0,0994a79d38794524b766b6c120c06a3b,1
2,4.3 tools and techniques for the plan risk man...,elaboration of the risk management plan,5.0,3.0,0994a79d38794524b766b6c120c06a3b,1
3,4.3 tools and techniques for the plan risk man...,enterprise environmental factors,5.0,4.0,0994a79d38794524b766b6c120c06a3b,1
4,4.3 tools and techniques for the plan risk man...,organizational procedures,5.0,4.0,0994a79d38794524b766b6c120c06a3b,1


### Removing overconnected nodes

These are featured in the header and the footer of the pdf file, so they are a little too connected.


In [6]:
ind = dfg_vis[
    dfg_vis["entity_L"].isin(
        ["Pathways to Health Equity for the G20", "Accelerating Global Health"]
    )
    | dfg_vis["entity_R"].isin(
        ["Pathways to Health Equity for the G20", "Accelerating Global Health"]
    )
].index
dfg_vis.drop(index=ind, axis=1, inplace=True)
print("Final Number of Edges  = ", dfg_vis.shape[0], "\nDropped edges:", len(ind))
dfg_vis.head()

Final Number of Edges  =  7274 
Dropped edges: 0


Unnamed: 0,entity_L,entity_R,importance_L,importance_R,chunks,count
0,4.3 tools and techniques for the plan risk man...,4.3.1 planning sessions,5.0,4.0,0994a79d38794524b766b6c120c06a3b,1
1,4.3 tools and techniques for the plan risk man...,corporate governance processes,5.0,3.0,0994a79d38794524b766b6c120c06a3b,1
2,4.3 tools and techniques for the plan risk man...,elaboration of the risk management plan,5.0,3.0,0994a79d38794524b766b6c120c06a3b,1
3,4.3 tools and techniques for the plan risk man...,enterprise environmental factors,5.0,4.0,0994a79d38794524b766b6c120c06a3b,1
4,4.3 tools and techniques for the plan risk man...,organizational procedures,5.0,4.0,0994a79d38794524b766b6c120c06a3b,1


## Creating a NetworkX Graph


Calculate nodes

Here I am grouping the graph dataframe by left node and calculating the mean importance. This way we will end up with only the unique nodes from the graph dataframe along with their weights.


In [7]:
# nodes = df_graph["entity_L"].unique()
nodes = dfg_vis.groupby(["entity_L"]).agg({"importance_L": "mean"}).reset_index()
nodes.head()

Unnamed: 0,entity_L,importance_L
0,4.3 tools and techniques for the plan risk man...,5.0
1,4.3.1 planning sessions,4.0
2,a guide to the project management body of know...,4.0
3,acceptable level of risk,5.0
4,acceptance by stakeholders,5.0


Build a NetworkX object with nodes and edges


In [8]:
G = nx.Graph()
for index, row in nodes.iterrows():
    G.add_node(row["entity_L"])

for index, row in dfg_vis.iterrows():
    G.add_edge(str(row["entity_L"]), str(row["entity_R"]))

### Community Detection


Detect communities using the Girvan Newman algorithm


In [9]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))

Number of Communities =  10


Add colors to nodes based on community


In [14]:
palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"entity_L": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)

df_nodes_colors = pd.merge(
    nodes, colors, how="left", on="entity_L", suffixes=("_N", "_C")
)
# nodes.head()
df_nodes_colors.head()

Unnamed: 0,entity_L,importance_L,color,group
0,comprehensive identification,3.0,#5f57db,1
1,early identification,4.0,#57db5f,2
2,emergent identification,3.0,#db5f57,3
3,explicit identification of opportunities,3.0,#db5f57,3
4,identify risks process,5.0,#db5f57,3


So now we have a nodes dataframe with colors and sizes of each node.

lets recreate our graph.


In [15]:
G = nx.Graph()
node_size_multiple = 6

for index, row in df_nodes_colors.iterrows():
    G.add_node(
        row["entity_L"],
        size=row["importance_L"] * node_size_multiple,
        title=row["entity_L"],
        color=row["color"],
    )

for index, row in dfg_vis.iterrows():
    G.add_edge(
        str(row["entity_L"]),
        str(row["entity_R"]),
        weight=row["count"],
        name=row["chunks"],
    )

## Visualisation


In [19]:
graph_output_directory = "docs/index.html"

net = Network(
    notebook=False,
    bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    font_color="#cccccc",
    # filter_menu=True,
)

net.from_nx(G)
net.repulsion(node_distance=150, spring_length=400)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

docs/index.html
