## Setup

In [None]:
!pip install langchain

In [None]:
!pip install -U langchain-community

In [None]:
!pip install unstructured

In [None]:
!pip install libmagic

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the corpus folder; the same name is reused under the input and output roots.
data_dir = "cureus"
out_dir = data_dir

## Source documents are read from this directory
inputdirectory = Path(f"./data_input/{data_dir}")
## The output csv files are written to this directory
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [2]:
## Alternative loaders kept for reference:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")

## Load every file found under the input directory.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into overlapping chunks; sizes are measured with len().
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview one chunk as a sanity check. Index 3 is used when it exists; for very
## small inputs fall back to the last chunk instead of raising IndexError.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


  0%|          | 0/1 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 1/1 [00:02<00:00,  2.01s/it]

Number of chunks =  23
policy research, consisting of readying one's materials, extracting the data, and analyzing it to distill the findings. An extensive literature search was performed, and 56 articles published in peer-reviewed journals between 2005 and 2021 were selected and analyzed. The corresponding authors' experiential knowledge served as the foundation for the analysis.





## Create a dataframe of all the chunks

In [None]:
!pip install yachalk

In [3]:
from helpers.df_helpers import documents2Dataframe
## Turn the list of chunks into a dataframe and preview it.
## NOTE(review): the recorded output shows one row per chunk with columns
## text / source / chunk_id -- confirm against helpers/df_helpers.py.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(23, 3)


Unnamed: 0,text,source,chunk_id
0,Abstract India’s health indicators have improv...,data_input\cureus\cureus-0015-00000040274.txt,5f490ea1f9f34632a819815c56a0e21d
1,"Categories: Public Health, Epidemiology/Public...",data_input\cureus\cureus-0015-00000040274.txt,08ec3fd889ee49de99d90a7f9954eae1
2,Introduction And Background India’s health ind...,data_input\cureus\cureus-0015-00000040274.txt,40617619930b48adb78cb41d2eb26919
3,"policy research, consisting of readying one's ...",data_input\cureus\cureus-0015-00000040274.txt,0c9d45a719af411681da6f9296c50560
4,Review Overview of the public and private heal...,data_input\cureus\cureus-0015-00000040274.txt,dc410d30cc3746a480d294c29c64e5fa


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both are written to disk in CSV format so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the dataframe of edges is read back from the output directory.

In [5]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Extract the concept pairs from the chunks and turn them into an edge table.
    ## NOTE(review): df2Graph presumably calls the LLM once per chunk -- confirm in helpers.
    concepts_list = df2Graph(df, model='gpt-4.1-mini')
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True creates the directory if needed without a separate exists() check.
    os.makedirs(outputdirectory, exist_ok=True)

    ## Cache both tables so later runs can skip the regeneration step.
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Treat empty strings as missing, drop incomplete edges, and set the weight.
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(342, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,india,health indicators,India's health indicators have improved in rec...,5f490ea1f9f34632a819815c56a0e21d,4
1,india,population,India is a country with a population of 1.3 bi...,5f490ea1f9f34632a819815c56a0e21d,4
2,active health workers,doctors,Active health workers density includes doctors.,5f490ea1f9f34632a819815c56a0e21d,4
3,active health workers,nurses/midwives,Active health workers density includes nurses/...,5f490ea1f9f34632a819815c56a0e21d,4
4,doctors and nurses/midwives density,"5.0 and 6.0 respectively per 10,000 persons",The density of doctors and nurses/midwives in ...,5f490ea1f9f34632a819815c56a0e21d,4


## Calculating contextual proximity

In [6]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that occur in the same text chunk.

    Parameters
    ----------
    df : pd.DataFrame
        Edge table with at least the columns ``node_1``, ``node_2`` and
        ``chunk_id``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per ordered node pair with the columns ``node_1``, ``node_2``,
        ``chunk_id`` (comma-joined ids of the chunks the pair was found in),
        ``count`` (number of co-occurrences) and ``edge`` (always the string
        ``"contextual proximity"``). Pairs seen only once are left out. Both
        orderings of a pair, (a, b) and (b, a), are present.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows.
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # Drop self loops.
    dfg_pairs = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    dfg_counts = (
        dfg_pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg_counts.columns = ["node_1", "node_2", "chunk_id", "count"]
    # Build the result as a chain of new frames. Assigning the "edge" column onto a
    # boolean-filtered slice raised SettingWithCopyWarning on pandas < 3.
    return (
        dfg_counts.replace("", np.nan)
        .dropna(subset=["node_1", "node_2"])
        # Drop edges with 1 count
        .loc[lambda pairs: pairs["count"] != 1]
        .assign(edge="contextual proximity")
    )


## Derive the co-occurrence edges from the extracted edge table and show the last rows.
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
12031,young and early career doctors,performance targets and practice constraints,"3af80fd9ec0042a8931dbf169083d0a7,3af80fd9ec004...",2,contextual proximity
12034,young and early career doctors,private sector,"3af80fd9ec0042a8931dbf169083d0a7,3af80fd9ec004...",3,contextual proximity
12037,émigré physician workforce,ayushman bharat,"db669c8a039e4adab1000ba092b6dceb,db669c8a039e4...",5,contextual proximity
12039,émigré physician workforce,doctors,"db669c8a039e4adab1000ba092b6dceb,db669c8a039e4...",2,contextual proximity
12052,émigré physician workforce,similar schemes,"db669c8a039e4adab1000ba092b6dceb,db669c8a039e4...",2,contextual proximity


### Merge both the dataframes

In [7]:
## Stack the extracted edges and the proximity edges, then collapse the rows that
## share the same (node_1, node_2) pair: chunk ids and edge texts are comma-joined
## and the counts are summed.
dfg = (
    pd.concat([dfg1, dfg2])
    .groupby(["node_1", "node_2"], as_index=False)
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,"0.576 physicians per 1,000 population in 2000",for-profit private health sector,"dc410d30cc3746a480d294c29c64e5fa,dc410d30cc374...",contextual proximity,4
1,"0.576 physicians per 1,000 population in 2000",government-funded health sector,"dc410d30cc3746a480d294c29c64e5fa,dc410d30cc374...",contextual proximity,3
2,"0.576 physicians per 1,000 population in 2000",india,"dc410d30cc3746a480d294c29c64e5fa,dc410d30cc374...",contextual proximity,2
3,"0.576 physicians per 1,000 population in 2000",national health mission (nhm),"dc410d30cc3746a480d294c29c64e5fa,dc410d30cc374...",contextual proximity,2
4,"0.7 public hospital beds per 100,000 people",for-profit private health sector,"dc410d30cc3746a480d294c29c64e5fa,dc410d30cc374...",contextual proximity,4
...,...,...,...,...,...
4707,young and early career doctors,private sector,"3af80fd9ec0042a8931dbf169083d0a7,3af80fd9ec004...",contextual proximity,3
4708,young and early career doctors,status and opportunity,3af80fd9ec0042a8931dbf169083d0a7,Most young and early career doctors face erosi...,4
4709,émigré physician workforce,ayushman bharat,"db669c8a039e4adab1000ba092b6dceb,db669c8a039e4...",contextual proximity,5
4710,émigré physician workforce,doctors,"db669c8a039e4adab1000ba092b6dceb,db669c8a039e4...",contextual proximity,2


## Calculate the NetworkX Graph

In [8]:
## Distinct node labels appearing on either end of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(450,)

In [9]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph
## G is undirected, but dfg can hold a row for (a, b) and another for (b, a).
## add_edge() on an existing edge overwrites its attributes, so a later
## proximity-only row used to replace the title and weight of an edge that also
## carried an extracted relation. Keep the heavier of the two rows instead.
for source, target, relation, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    source_node, target_node = str(source), str(target)
    ## Extracted relations are given count 4 upstream, so dividing by 4 gives them weight 1.
    weight = count / 4
    if G.has_edge(source_node, target_node) and G[source_node][target_node]["weight"] >= weight:
        continue
    G.add_edge(source_node, target_node, title=relation, weight=weight)

### Calculate communities for coloring the nodes

In [10]:
## Community detection on G; the result is consumed step by step with next().
## NOTE(review): per the variable names each next() call moves to a finer split
## of the graph -- confirm against the networkx girvan_newman documentation.
communities_generator = nx.community.girvan_newman(G)
## First result; not referenced again in this part of the notebook.
top_level_communities = next(communities_generator)
## Second result; this is the one used for colouring.
next_level_communities = next(communities_generator)
## Sort the nodes inside each community, then sort the communities themselves.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  7
[['0.576 physicians per 1,000 population in 2000', '0.7 public hospital beds per 100,000 people', '1% of the gdp', '279 medical colleges', '37% had any health insurance coverage in 2018', '44.5 doctors, nurses, and midwives per 10,000 population', '5.0 and 6.0 respectively per 10,000 persons', 'accredit health facilities', 'accurate data about manpower quantity and geospatial location', 'active health workers', 'active health workers’ density', 'amplify care reach', 'article', 'auxiliary nurse midwives', 'better health for all indians', 'better living conditions', 'booming medical tourism industry and medical schools', 'broad overarching policy', 'care delivery and monitoring', 'central government', 'clinical and social skills', 'cognitive platform', 'competency-based training', 'complex multifactorial issue', 'consumes 5.1% of the gdp', 'contractual government posts', 'control of non-communicable disease risk factors', 'corporate hospitals', 'corporate house

### Create a dataframe for community colors

In [11]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [12]:
import seaborn as sns
## Name of the seaborn colour palette that colors2Community draws the community colours from.
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one colour and one group number to every node of every community.

    ``communities`` is a sized iterable of node collections. One hex colour per
    community is drawn from the module-level ``palette``, the colours are
    shuffled with ``random.shuffle``, and each community takes the last
    remaining colour. Group numbers start at 1 and follow the order of
    ``communities``.

    Returns a dataframe with one row per node and the columns ``node``,
    ``color`` and ``group``.
    """
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    records = []
    for group_id, members in enumerate(communities, start=1):
        shade = hex_colors.pop()
        records.extend(
            {"node": member, "color": shade, "group": group_id} for member in members
        )
    return pd.DataFrame(records)


## Node -> colour / group lookup table for the detected communities.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,"0.576 physicians per 1,000 population in 2000",#57dbaa,1
1,"0.7 public hospital beds per 100,000 people",#57dbaa,1
2,1% of the gdp,#57dbaa,1
3,279 medical colleges,#57dbaa,1
4,37% had any health insurance coverage in 2018,#57dbaa,1
...,...,...,...
445,scale of such initiatives,#579bdb,7
446,supportive care in health facilities and homes,#579bdb,7
447,systemic integration pathways,#579bdb,7
448,three-year diploma course,#579bdb,7


### Add colors to the graph

In [13]:
## Copy the community group and colour onto each graph node and size the node by its degree.
for node_label, group_id, hex_color in zip(colors["node"], colors["group"], colors["color"]):
    node_attrs = G.nodes[node_label]
    node_attrs["group"] = group_id
    node_attrs["color"] = hex_color
    node_attrs["size"] = G.degree[node_label]

In [15]:
!pip install pyvis

Defaulting to user installation because normal site-packages is not writeable
Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-4.1.1-py3-none-any.whl.metadata (8.1 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
   ---------------------------------------- 0.0/756.0 kB ? eta -:--:--
   --------------------------- ------------ 524.3/756.0 kB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 756.0/756.0 kB 3.1 MB/s eta 0:00:00
Downloading jsonpickle-4.1.1-py3-none-any.whl (47 kB)
Installing collected packages: jsonpickle, pyvis

   ---------------------------------------- 0/2 [jsonpickle]
   -------------------- ------------------- 1/2 [pyvis]
   -------------------- ------------------- 1/2 [pyvis]
   -------------------- ------------------- 1/2 [pyvis]
   -------------------- ------------------- 1/2 [pyvis]
   -------------------- ------------------- 1/2 [pyvis]
   -------

In [18]:
import os
from pyvis.network import Network

## The interactive graph is written as an HTML page under ./docs.
os.makedirs('./docs', exist_ok=True)
graph_output_directory = "./docs/index.html"

## Optional dark theme: bgcolor="#1a1a1a", font_color="#cccccc"
net = Network(
    height="900px",
    width="100%",
    notebook=False,
    cdn_resources="remote",
    select_menu=True,
    filter_menu=False,
)

## Import the NetworkX graph together with its node and edge attributes.
net.from_nx(G)
## Other layouts that were tried:
##   net.repulsion(node_distance=150, spring_length=400)
##   net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
