## Setup

In [25]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory
data_dir = "adhd_2"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [2]:
# Dir PDF Loader
loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader(Path(f"./data_input/test.pdf"))
# loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
# print(pages[3].page_content)


Overwriting cache for 0 988
Overwriting cache for 0 388
Overwriting cache for 0 115
Overwriting cache for 0 336


Number of chunks =  521


## Create a dataframe of all the chunks

In [3]:
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(521, 4)


Unnamed: 0,text,source,page,chunk_id
0,Practitioner Review: Emotional dysregulation i...,data_input/adhd_2/Faraone 2018 Emotional Dysre...,0,6a39139b493549ff8e048e1247d0cb24
1,with mood disorders. Methods: We sought to cla...,data_input/adhd_2/Faraone 2018 Emotional Dysre...,0,2a71bce1b84d4d178461bb61f6c2caa4
2,DESR which demarcates them from irritability a...,data_input/adhd_2/Faraone 2018 Emotional Dysre...,0,553bc87a53be4191ac697d82e6bdc4a0
3,with ADHD have signiﬁcant impairments that ste...,data_input/adhd_2/Faraone 2018 Emotional Dysre...,0,7d9284724aee49928a1c92f4d101646c
4,models of ADHD that emphasize the inadequacy o...,data_input/adhd_2/Faraone 2018 Emotional Dysre...,1,b68e2ca1203648488658bf36f8d99ada


## Extract Concepts

In [23]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [5]:
import ollama.client as client
import json

# m = client.generate(model_name='llama3.1:8b', system=None, prompt="Hello how are you? return message in JSON. {'message': YOUR_MESSAGE}")[0]
m, _ = client.chat(model_name='llama3.1:8b', messages=[
    {
      "role": "system",
      "content": "You are a helpful assistant. Respond with only one sentence to questions. Return your answer in JSON format: {'answer': YOUR_ANSWER}"
    },
    {
      "role": "user",
      "content": "why is the sky blue?"
    }
  ])

#check if m is valid json
try:
    json.loads(m)
    print("Valid JSON")
except:
    print("Invalid JSON")


# client.list()

{"answer": "The sky appears blue because of a phenomenon called scattering, where sunlight interacts with the tiny molecules of gases in the atmosphere."}Valid JSON


In [6]:
# import time 
# time.sleep(60 * 60 * 0.5)

In [7]:
## To regenerate the graph with LLM, set this to True
regenerate = False

if regenerate:
    concepts_list = df2Graph(df, model='llama3.1:8b')
    # concepts_list = df2Graph(df, model='adrienbrault/nous-hermes2pro-llama3-8b:q8_0') 
    # concepts_list = df2Graph(df, model='llama3:70b')
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1 = dfg1[["node_1", "node_1_type", "node_2", "node_2_type", 'edge', 'chunk_id']]
dfg1.dropna(subset=["node_1", "node_1_type", "node_2", "node_2_type", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)

{
"edges": [
    {
        "node_1": "Emotional dysregulation",
        "node_1_type": "concept",
        "node_2": "Attention-deficit/hyperactivity disorder (ADHD)",
        "node_2_type": "condition",
        "edge": "Emotional dysregulation is a core feature of ADHD"
    },
    {
        "node_1": "Morbidity",
        "node_1_type": "concept",
        "node_2": "Emotional symptoms in ADHD patients",
        "node_2_type": "documents|service",
        "edge": "Morbidity is associated with emotional symptoms in ADHD patients"
    },
    {
        "node_1": "Diagnostic criteria",
        "node_1_type": "concept",
        "node_2": "Emotional symptoms in ADHD",
        "node_2_type": "documents|service",
        "edge": "Emotional symptoms are too nonspecific for use as diagnostic criteria"
    },
    {
        "node_1": "Mood disorders",
        "node_1_type": "condition",
        "node_2": "Emotional symptoms in ADHD",
        "node_2_type": "documents|service",
        "edge": "There

In [8]:
for index, row in dfg1.iterrows():
    if '|' in row['node_1_type']:
        dfg1.at[index, 'node_1_type'] = row['node_1_type'].split('|')[0]

    if '|' in row['node_2_type']:
        dfg1.at[index, 'node_2_type'] = row['node_2_type'].split('|')[0]

for index, row in dfg1.iterrows():
    if row['node_1_type'] not in ['object', 'entity', 'location', 'organization', 'person', 'condition', 'documents', 'service', 'concept', 'date']:
        dfg1.at[index, 'node_1_type'] = 'other'

    if row['node_2_type'] not in ['object', 'entity', 'location', 'organization', 'person', 'condition', 'documents', 'service', 'concept', 'date']:
        dfg1.at[index, 'node_2_type'] = 'other'

dfg1['edge_type'] = 'relation'
dfg1.head()

Unnamed: 0,node_1,node_1_type,node_2,node_2_type,edge,chunk_id,count,edge_type
0,emotional dysregulation,concept,attention-deficit/hyperactivity disorder (adhd),condition,Emotional dysregulation is a core feature of ADHD,6a39139b493549ff8e048e1247d0cb24,4,relation
1,morbidity,concept,emotional symptoms in adhd patients,documents,Morbidity is associated with emotional symptom...,6a39139b493549ff8e048e1247d0cb24,4,relation
2,diagnostic criteria,concept,emotional symptoms in adhd,documents,Emotional symptoms are too nonspecific for use...,6a39139b493549ff8e048e1247d0cb24,4,relation
3,mood disorders,condition,emotional symptoms in adhd,documents,There is overlap between mood disorders and em...,6a39139b493549ff8e048e1247d0cb24,4,relation
4,stephen v. faraone,person,"departments of psychiatry, neuroscience and ph...",organization,Stephen V. Faraone is affiliated with Departme...,6a39139b493549ff8e048e1247d0cb24,4,relation


In [9]:
df_node_types = dfg1[['node_1','node_1_type']].rename(columns={'node_1': 'node', 'node_1_type': 'node_type'})
df_node_types = df_node_types._append(dfg1[['node_2','node_2_type']].rename(columns={'node_2': 'node', 'node_2_type': 'node_type'}))
df_node_types.drop_duplicates(subset=['node'], inplace=True)

df_node_types

Unnamed: 0,node,node_type
0,emotional dysregulation,concept
1,morbidity,concept
2,diagnostic criteria,concept
3,mood disorders,condition
4,stephen v. faraone,person
...,...,...
3920,chancellors award for excellence in scholarshi...,other
3921,distinguished profe ssor,concept
3922,lifetime ach ievement award,concept
3923,pau l hoch award,concept


## Calculating contextual proximity

In [10]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )

    dfg_long.drop(columns=["variable"], inplace=True)

    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))

    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)

    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )

    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)

    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge_type"] = "contextual proximity"
    dfg2["edge"] = "exists is same context"
    dfg2["node_1_type"] = dfg2['node_1'].apply(lambda x: df_node_types[df_node_types['node'] == x]['node_type'].iloc[0])
    dfg2["node_2_type"] = dfg2['node_2'].apply(lambda x: df_node_types[df_node_types['node'] == x]['node_type'].iloc[0])

    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge_type,edge,node_1_type,node_2_type
56285,"€304 (du rietz et al., 2020)",adhd,"aa571a687c484808b18970f2c9bf3ae0,aa571a687c484...",8,contextual proximity,exists is same context,other,condition
56291,"€4,000",adhd,"2339bf2fce3c4d6bb293bf62ba645212,2339bf2fce3c4...",11,contextual proximity,exists is same context,documents,condition
56302,"€5,500",adhd,"2339bf2fce3c4d6bb293bf62ba645212,2339bf2fce3c4...",11,contextual proximity,exists is same context,documents,condition
56313,"€8,000",adhd,"2339bf2fce3c4d6bb293bf62ba645212,2339bf2fce3c4...",11,contextual proximity,exists is same context,documents,condition
56325,"€9,860 to €14,483 per patient per year",adhd-related costs,"2c6d8501722746a6a4ab2df9d928e1e0,2c6d850172274...",4,contextual proximity,exists is same context,concept,documents


### Merge both the dataframes

In [11]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum', 'edge_type': 'first', 'node_1_type': 'first', 'node_2_type': 'first'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count,edge_type,node_1_type,node_2_type
0,$47.55 million,adhd,"aa571a687c484808b18970f2c9bf3ae0,aa571a687c484...",exists is same context,8,contextual proximity,other,condition
1,"$47.55 million (hong et al., 2020)",adhd,"aa571a687c484808b18970f2c9bf3ae0,aa571a687c484...",exists is same context,8,contextual proximity,other,condition
2,$77.5 to $115.9 billion,adult adhd,"cc14ef2eeea84b87a16e391c92c389e7,cc14ef2eeea84...",exists is same context,2,contextual proximity,documents,condition
3,$77.5 to $115.9 billion,substantial impairments,"cc14ef2eeea84b87a16e391c92c389e7,cc14ef2eeea84...",exists is same context,2,contextual proximity,documents,condition
4,$77.5 to $115.9 billion,traffic accidents,"cc14ef2eeea84b87a16e391c92c389e7,cc14ef2eeea84...",exists is same context,2,contextual proximity,documents,documents
...,...,...,...,...,...,...,...,...
18924,"€304 (du rietz et al., 2020)",adhd,"aa571a687c484808b18970f2c9bf3ae0,aa571a687c484...",exists is same context,8,contextual proximity,other,condition
18925,"€4,000",adhd,"2339bf2fce3c4d6bb293bf62ba645212,2339bf2fce3c4...",exists is same context,11,contextual proximity,documents,condition
18926,"€5,500",adhd,"2339bf2fce3c4d6bb293bf62ba645212,2339bf2fce3c4...",exists is same context,11,contextual proximity,documents,condition
18927,"€8,000",adhd,"2339bf2fce3c4d6bb293bf62ba645212,2339bf2fce3c4...",exists is same context,11,contextual proximity,documents,condition


## Generate the NetworkX Graph

In [12]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(3365,)

In [13]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node),
        node_type=df_node_types[df_node_types['node'] == node]['node_type'].iloc[0]
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        edge_title=row["edge_type"],
        edge_details=row["edge"],
        weight=row['count']/4,
        ref=row["chunk_id"]
    )

## Eliminate Redundant Nodes using Embedding Space

In [14]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer
import networkx as nx
from tqdm import tqdm
from sklearn.cluster import DBSCAN

# Load the pre-trained language model and tokenizer
model_name = "Alibaba-NLP/gte-base-en-v1.5"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Define a function to compute the node embeddings
def compute_node_embeddings(nodes):
    embeddings = []
    for node in tqdm(nodes):
        inputs = tokenizer(node, return_tensors="pt", max_length=128, truncation=True)
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :]  # extract the embedding of the [CLS] token
        embeddings.append(embedding.detach().numpy())
    return np.array(embeddings)

# Compute the node embeddings
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
node_embeddings = compute_node_embeddings(nodes)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 3365/3365 [01:07<00:00, 49.57it/s]


In [15]:
# Define a function to cluster similar nodes
def cluster_nodes(node_embeddings):
    cluster_labels = DBSCAN(eps=0.1, min_samples=1, metric="cosine").fit_predict(node_embeddings.squeeze(axis=1))
    # print(set(cluster_labels))
    cluster_to_nodes = {}
    for node, label in zip(nodes, cluster_labels):
        if label not in cluster_to_nodes:
            cluster_to_nodes[label] = []
        cluster_to_nodes[label].append(node)
    return cluster_labels, cluster_to_nodes

# Cluster similar nodes
labels, cluster_to_nodes = cluster_nodes(node_embeddings)

# Print the nodes that clustered together
for label, nodes_in_cluster in cluster_to_nodes.items():
    if len(nodes_in_cluster) > 1:
        print(f"Cluster {label}: {nodes_in_cluster}")

# Create a new graph with merged nodes
G_merged = nx.Graph()
node_to_label = {node: label for node, label in zip(nodes, labels)}
label_to_node = {}

for label, nodes_in_cluster in cluster_to_nodes.items():
    node_counts = {}
    for node in nodes_in_cluster:
        node_counts[node] = list(nodes).count(node)
    most_common_node = max(node_counts, key=node_counts.get)
    label_to_node[label] = most_common_node


## Add nodes to the graph
for node in nodes:
    label = node_to_label[node]
    new_node = label_to_node[label]

    if new_node not in G_merged.nodes:
        G_merged.add_node(
            str(new_node),
            node_label=str(new_node),
            node_type=df_node_types[df_node_types['node'] == node]['node_type'].iloc[0]
        )

## Add edges to the graph
for index, row in dfg.iterrows():
    n1 = label_to_node[node_to_label[str(row["node_1"])]]
    n2 = label_to_node[node_to_label[str(row["node_2"])]]
    if n1 != n2:
        G_merged.add_edge(
            str(n1),
            str(n2),
            edge_title=row["edge_type"],
            edge_details=row["edge"],
            weight=row['count']/4,
            ref=row["chunk_id"]
        )

print('Previous Number of nodes: ', G.number_of_nodes())
print('Updated Number of nodes: ', G_merged.number_of_nodes())

G = G_merged
nodes = G.nodes()

Cluster 3: ['% of patients medicated for adhd ', '% of patients with adhd ', '% patients prescribed medication for adhd ', '% patients treated for adhd ', '% patients treated with adhd medication', '% patients treated with adhd medications', '% patients with adhd ', '% patients with adhd referred ', '% patients with adhd where ']
Cluster 10: ['% patients stabilized on an adhd medication ', 'patients stabilized on an adhd medication', '% patients stabilized on an adhd medication seen at least once per year']
Cluster 17: ['2–4 hours', '4–6 hours', '6–8 hours', '8–12 hours']
Cluster 31: ['abdominal pain', 'stomach discomfort', 'stomach pain', 'stomachache', 'stomachaches']
Cluster 36: ['abuse', 'abuse potential']
Cluster 39: ['abuse, misuse and diversion', 'abuse, misuse, and diversion']
Cluster 41: ['academic difficulties', 'academic struggles']
Cluster 46: ['academic outcomes', 'academic performance']
Cluster 49: ['accidental injuries', 'accidental injury']
Cluster 57: ['activation of m

In [16]:
df_node_types = pd.DataFrame()
for n in G.nodes(data=True):
    df_node_types = df_node_types._append({'node': n[0], 'node_type': n[1]['node_type']}, ignore_index=True)

## Calculate communities for coloring the nodes

In [17]:
# communities_generator = nx.community.girvan_newman(G) # Girvan-Newman Community Detection

# next_level_communities = next(communities_generator)
# communities = sorted(map(sorted, next_level_communities))

# while len(communities[0]) > 256:
#     print("Number of nodes in the largest community: ", len(communities[0]))
#     next_level_communities = next(communities_generator)
#     communities = sorted(map(sorted, next_level_communities))

# print("Number of nodes in the largest community: ", len(communities[0]))
# print("Number of Communities = ", len(communities))
# print("Number of Nodes = ", len(G.nodes()))
# print(communities)

### Create a dataframe for community colors

In [18]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors

def colors2NodeTypes(nodes) -> pd.DataFrame:

    node_types = df_node_types['node_type'].unique()

    p = sns.color_palette(palette, len(node_types)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0

    for node_type in node_types:
        color = p.pop()
        group += 1
        for node in nodes:
            if df_node_types[df_node_types['node'] == node]['node_type'].iloc[0] == node_type:
                rows += [{"node": node, "color": color, "group": group}]

    df_colors = pd.DataFrame(rows)
    return df_colors

# colors = colors2Community(communities)
colors = colors2NodeTypes(list(G.nodes()))
colors

Unnamed: 0,node,color,group
0,$47.55 million,#dba757,1
1,"$47.55 million (hong et al., 2020)",#dba757,1
2,% patients with adhd with documentation of imp...,#dba757,1
3,a death in the family,#dba757,1
4,abnormalities of chromosomes,#dba757,1
...,...,...,...
2909,us college-aged women,#c7db57,11
2910,white-majority countries,#c7db57,11
2911,world,#c7db57,11
2912,youth and adults worldwide,#c7db57,11


## Export graph data to csv

In [19]:
dfg_temp = dfg[['node_1']].rename(columns={'node_1': 'node'})
dfg_temp = dfg_temp._append(dfg[['node_2']].rename(columns={'node_2': 'node'}))
dfg_temp.drop_duplicates(subset=['node'], inplace=True)

In [20]:
## Add nodes to the graph
for node in list(dfg_temp['node']):
    label = node_to_label[node]
    new_node = label_to_node[label]

    for index, row in dfg[dfg['node_1'] == node].iterrows():
        dfg.at[index, 'node_1'] = new_node

    for index, row in dfg[dfg['node_2'] == node].iterrows():
        dfg.at[index, 'node_2'] = new_node


final_df = dfg.copy()[['node_1', 'node_2', 'edge', 'edge_type', 'count']]
final_df.rename(columns={'node_1': 'source', 'node_2': 'target', 'count': 'value'}, inplace=True)
final_df['color'] = final_df['edge_type'].apply(lambda x: '#808080' if x == 'contextual proximity' else '#22dd22')
final_df.to_csv(outputdirectory/"finalgraph.csv", sep=",", index=False)
final_df.head()

Unnamed: 0,source,target,edge,edge_type,value,color
0,$47.55 million,adhd,exists is same context,contextual proximity,8,#808080
1,"$47.55 million (hong et al., 2020)",adhd,exists is same context,contextual proximity,8,#808080
2,$77.5 to $115.9 billion,adolescents and young adults with adhd,exists is same context,contextual proximity,2,#808080
3,$77.5 to $115.9 billion,substantial impairments,exists is same context,contextual proximity,2,#808080
4,$77.5 to $115.9 billion,traffic accidents,exists is same context,contextual proximity,2,#808080


In [21]:
metadata = df_node_types.copy()
metadata.rename(columns={'node': 'id', 'node_type': 'type'}, inplace=True)

category_list, categories = pd.factorize(df_node_types['node_type'])

metadata['type_id'] = category_list

metadata.to_csv(outputdirectory/"metadata.csv", sep=",", index=False)
metadata.head()

Unnamed: 0,id,type,type_id
0,$47.55 million,other,0
1,"$47.55 million (hong et al., 2020)",other,0
2,$77.5 to $115.9 billion,documents,1
3,% of patients medicated for adhd,documents,1
4,% patients evaluated with parent vanderbilt ra...,service,2


## Add colors to the graph & Visualize

In [19]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

for edge in G.edges(data=True):
    G.edges[edge[0], edge[1]]['color'] = ('#22dd22' if edge[2]['edge_title'] == 'relation' else '#808080')

In [20]:
from pyvis.network import Network

graph_output_directory = "./docs/index.html"

net = Network(
    notebook=False,
    bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    font_color="#cccccc",
    filter_menu=False,
)


net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])
# net.toggle_physics(False)

net.show(graph_output_directory, notebook=False)

./docs/index.html


In [22]:
# nx.write_graphml(G, './data_output/KG.graphml', named_key_ids=True)

In [23]:
import leidenalg as la
import igraph as ig

iG = ig.Graph.from_networkx(G)
partition = la.find_partition(iG, la.CPMVertexPartition, resolution_parameter = 0.001) # Leiden Community Detection

In [None]:
from matplotlib import pyplot as plt

fig, ax = plt.subplots(figsize=(16, 16))

# ig.plot(partition, target=ax)
num_communities = len(partition)
palette1 = ig.RainbowPalette(n=num_communities)

# iG.vs["label"] = ["\n\n" + label for label in iG.vs["node_label"]]

def hex_to_rgb(hex):
  return [int(hex[i:i+2], 16)/255 for i in (1, 3, 5)]

ig.plot(
    partition,
    target=ax,
    mark_groups=True,
    # layout="fruchterman_reingold",
    # layout="kamada_kawai",
    palette=palette1,
    vertex_size=15,
    vertex_color=[hex_to_rgb(c) for c in iG.vs["color"]],
    edge_width=0.5,
)

# add legend to figure (node color to node type)
type_color_legend = set()

for t, c in zip(iG.vs["node_type"], iG.vs["color"]):
    type_color_legend.add((t, c))

ax.legend(
    handles=[plt.Rectangle((0, 0), 1, 1, color=c) for t, c in type_color_legend], 
    labels=[t for t, c in type_color_legend], 
    loc='upper left'
)

fig.set_size_inches(20, 20)

## Metrics

In [56]:
final_df = pd.read_csv(f"./data_output/{out_dir}/finalgraph.csv")
metadata = pd.read_csv(f"./data_output/{out_dir}/metadata.csv")


G = nx.from_pandas_edgelist(final_df, source="source", target="target", edge_attr=["edge_type","color"])
nx.set_node_attributes(G, metadata.set_index('id')['type'].to_dict(), 'node_type')
# nx.set_edge_attributes(G, final_df['color'].to_list(), 'color')
# nx.set_edge_attributes(G, final_df['value'].to_list(), 'weight')

# filter nodes with degree 1
G = G.subgraph([n for n in G.nodes() if G.degree(n) > 1])


adj_matrix = nx.to_numpy_array(G)

G.nodes(data=True)


NodeDataView({'adhd': {'node_type': 'condition'}, '$77.5 to $115.9 billion': {'node_type': 'documents'}, 'adolescents and young adults with adhd': {'node_type': 'person'}, 'substantial impairments': {'node_type': 'condition'}, 'traffic accidents': {'node_type': 'documents'}, '% of patients medicated for adhd ': {'node_type': 'documents'}, 'prescription refills are coordinated with other prescribers': {'node_type': 'concept'}, 'transition care coordinated for vacations and other times away': {'node_type': 'service'}, '% patients evaluated with parent vanderbilt rating scales': {'node_type': 'service'}, '% patients meeting dsm criteria for adhd': {'node_type': 'condition'}, '% patients treated with psychostimulant medication': {'node_type': 'documents'}, '% patients evaluated with teacher vanderbilt rating scales': {'node_type': 'service'}, '% patients on medication with at least 2 follow-up visits a year in record': {'node_type': 'service'}, '% patients seen within 30 days medication in

In [83]:
import leidenalg as la
import igraph as ig
import community as community_louvain



# Leiden Community Detection
iG = ig.Graph.from_networkx(G)
leiden_partitions = la.find_partition(iG, la.ModularityVertexPartition) 
leiden_modularity = leiden_partitions.modularity
print(leiden_modularity)

# Louvain Community Detection
louvain_partitions = community_louvain.best_partition(G)
louvain_modularity = community_louvain.modularity(louvain_partitions, G)
print(louvain_modularity)

# Girvan-Newman Community Detection
def girvan_newman_community_detection(G, max_iter=100):
    girvan_newman_generator = nx.community.girvan_newman(G)
    girvan_newman_partitions = next(girvan_newman_generator)
    girvan_newman_modularity = nx.community.quality.modularity(G, girvan_newman_partitions)
    best_modularity = girvan_newman_modularity
    best_partitions = sorted(map(sorted, girvan_newman_partitions))

    for _ in tqdm(range(max_iter)):
        try:
            girvan_newman_partitions = next(girvan_newman_generator)
            girvan_newman_modularity = nx.community.quality.modularity(G, girvan_newman_partitions)
            if girvan_newman_modularity > best_modularity:
                best_modularity = girvan_newman_modularity
                best_partitions = sorted(map(sorted, girvan_newman_partitions))
        except StopIteration:
            break

    return best_modularity, best_partitions, len(best_partitions)


girvan_newman_modularity, girvan_newman_partitions, girvan_newman_num_communities = girvan_newman_community_detection(G)
print(girvan_newman_modularity)

0.6195394880656903
0.6157166576390495


100%|██████████| 100/100 [6:44:49<00:00, 242.90s/it]   

0.5876393198282849





In [84]:
from sklearn.metrics.pairwise import cosine_similarity

network_metrics = {}


# Define the metrics to be calculated
metrics = [
    ('nodes', lambda G: G.number_of_nodes()),
    ('edges', lambda G: G.number_of_edges()),
    ('density', lambda G: nx.density(G)),
    # ('diameter', lambda G: nx.diameter(G)),
    ('triadic_closure', lambda G: nx.transitivity(G)),
    ('degree', lambda G: nx.degree(G)),
    ('avg_degree', lambda G: nx.average_degree_connectivity(G)),
    ('degree_centrality', lambda G: nx.degree_centrality(G)),
    ('betweenness_centrality', lambda G: nx.betweenness_centrality(G)),
    ('closeness_centrality', lambda G: nx.closeness_centrality(G)),
    ('eigenvector_centrality', lambda G: nx.eigenvector_centrality(G)),
    ('avg_clustering_coeff', lambda G: nx.average_clustering(G)),
    # ('average_shortest_path_length', lambda G: nx.average_shortest_path_length(G)),
    ('page_rank', lambda G: nx.pagerank(G)),
    ('avg_node_similarity', lambda G: np.mean(cosine_similarity(adj_matrix))),
    ('leiden_modularity', lambda G: leiden_modularity),
    ('leiden_num_communities', lambda G: len(set(leiden_partitions.membership))),
    ('louvain_modularity', lambda G: louvain_modularity),
    ('louvain_num_communities', lambda G: len(set(louvain_partitions.values()))),
    ('girvan_newman_modularity', lambda G: girvan_newman_modularity),
    ('girvan_newman_num_communities', lambda G: girvan_newman_num_communities),
]

# Calculate metrics with progress bar
for metric_name, metric_func in tqdm(metrics, desc="Calculating network metrics"):
    network_metrics[metric_name] = metric_func(G)

print(network_metrics)
# save output to a file
with open(outputdirectory/'network_metics.json', 'w') as f:
    f.write(str(network_metrics))

Calculating network metrics: 100%|██████████| 19/19 [00:42<00:00,  2.24s/it]

{'nodes': 2347, 'edges': 8655, 'density': 0.0031438076796084023, 'triadic_closure': 0.050460292145380975, 'degree': DegreeView({'adhd': 1130, '$77.5 to $115.9 billion': 3, 'adolescents and young adults with adhd': 80, 'substantial impairments': 8, 'traffic accidents': 8, '% of patients medicated for adhd ': 14, 'prescription refills are coordinated with other prescribers': 3, 'transition care coordinated for vacations and other times away': 3, '% patients evaluated with parent vanderbilt rating scales': 2, '% patients meeting dsm criteria for adhd': 7, '% patients treated with psychostimulant medication': 7, '% patients evaluated with teacher vanderbilt rating scales': 2, '% patients on medication with at least 2 follow-up visits a year in record': 2, '% patients seen within 30 days medication initiation': 2, '% patients whose medical record contains documentation of dsm criteria': 2, 'confirmation of impairment in 2+ settings': 2, 'seen within one month of first prescription': 3, 'scr




## Community Summaries

In [None]:
import ollama.client as client


def communitySummaryPrompt(nodes: str, edges: str, metadata={}, model="mistral-openorca:latest"):
    if model == None:
        model = "mistral-openorca:latest"

    SYS_PROMPT = """
        Your task is to generate a comprehensive summary of a knowledge graph community. 
        The context of the input includes nodes and relationships between the nodes. 
        Analyze and create a summary of the community's overall structure, how its entities are related to each other, and significant keypoints associated with its entities.
        Dont just list the nodes or edges, summarize the knowledge exists between these entities in one long paragraph.
        
        # Example Input
        
        ## Context:

        ### Entities:
        ABILA CITY PARK
        POK RALLY
        POK
        POKRALLY
        CENTRAL BULLETIN

        ### Relationships (node1,node2,relation,weight):
        ABILA CITY PARK,POK RALLY,Abila City Park is the location of the POK rally,1
        ABILA CITY PARK,POK,POK is holding a rally in Abila City Park,2
        ABILA CITY PARK,POKRALLY,The POKRally is taking place at Abila City Park,1
        ABILA CITY PARK,CENTRAL BULLETIN,Central Bulletin is reporting on the POK rally taking place in Abila City Park,1.5

        ## Output:

        The knowledge graph centers around the POK rally event. Abila City Park is the location where the POK rally is taking place. There seems to be a Pokemon (POK) involved in the rally, possibly the organizer. Central Bulletin is reporting on the event.

    """
    
    USER_PROMPT = """

        # Real Data

        Use the following context for your answer. 
        Dont make anything up in your answer. 

        ## Context:

        ### Entities:
        {nodes}

        ### Relationships (node1,node2,relation,weight):
        {edges}

        ##Output:
    """.format(nodes=nodes, edges=edges)

    response, _ = client.generate(model_name=model, system=SYS_PROMPT, prompt=USER_PROMPT)

    return response

In [None]:
# from helpers.prompts import communitySummaryPrompt
import csv

for community in tqdm(communities):
    # print(community)
    edges_list = []
    nodes_list = []
    print("Number of nodes under community", len(community))

    for node in community:
        nodes_list.append(node)
        edges_list += G.edges(node, data=True)


    nodes = "\n".join(nodes_list)
    edges = ""

    for edge in edges_list:
        # print(edge)
        edges += f"{edge[0]},{edge[1]},{edge[2]['edge_title']},{edge[2]['width']}\n"
    
    sum = communitySummaryPrompt(nodes, edges, model='adrienbrault/nous-hermes2pro-llama3-8b:q8_0')
    with open(outputdirectory/'community_summaries.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([sum])

    break
    

# edges, nodes

  0%|          | 0/437 [00:00<?, ?it/s]

Number of nodes under community 34

m.,faraone,contextual proximity,1.0
m.,garcia-argibay,contextual proximity,1.0
m.,haavik,contextual proximity,1.0
m.,hegvik,contextual proximity,1.0
m.,hess,contextual proximity,1.0
m.,hoogman,contextual proximity,1.0
m.,hou,contextual proximity,1.0
m.,j.,contextual proximity,2.0
m.,l.j.,contextual proximity,1.0
m.,j. t.,contextual proximity,1.0
m.,kroesbergen,contextual proximity,1.0
m.,m. a.,contextual proximity,1.0
m.,malik,contextual proximity,1.0
m.,michoel,contextual proximity,1.0
m.,mol. psychiatry,contextual proximity,1.0
m.,nigg,contextual proximity,1.0
m.,psychiatric genomics consortium,contextual proximity,1.0
m.,s.v.,contextual proximity,1.0
m.,stolte,Stolte and M. collaborate on a study examining creativity and ADHD, focusing on behavioral studies, psychostimulant effects, and neural underpinnings.,contextual proximity,2.0
m.,t.,contextual proximity,1.0
m.,t. a.,contextual proximity,1.0
m.,transl. psychiatry,contextual proximity,1.0
m. a

  0%|          | 0/437 [00:20<?, ?it/s]
