## Setup

In [None]:
!pip install langchain

In [None]:
!pip install -U langchain-community

In [None]:
!pip install unstructured

In [None]:
!pip install libmagic

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder that is read from ./data_input
data_dir = "cureus"
inputdirectory = Path("./data_input") / data_dir
## The output csv files are written to a folder of the same name under ./data_output
out_dir = data_dir
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [2]:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
## Generic loader: reads the files found under inputdirectory and shows a progress bar.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into chunks of at most 1500 characters (measured with len),
## with 150 characters of overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview one chunk. The index is clamped so that small inputs producing fewer
## than 4 chunks do not raise IndexError; nothing is printed when there are no chunks.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


  0%|          | 0/1 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 1/1 [00:17<00:00, 17.26s/it]

Number of chunks =  23
policy research, consisting of readying one's materials, extracting the data, and analyzing it to distill the findings. An extensive literature search was performed, and 56 articles published in peer-reviewed journals between 2005 and 2021 were selected and analyzed. The corresponding authors' experiential knowledge served as the foundation for the analysis.





## Create a dataframe of all the chunks

In [None]:
!pip install yachalk

In [3]:
## documents2Dataframe is a project helper from helpers/df_helpers. Judging by the
## displayed result it yields one row per chunk with the columns text, source and chunk_id.
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
## Show the (rows, columns) shape, then the first rows of the chunk dataframe.
print(df.shape)
df.head()

(23, 3)


Unnamed: 0,text,source,chunk_id
0,Abstract India’s health indicators have improv...,data_input\cureus\cureus-0015-00000040274.txt,bdc817b6959847fbb40743f9ff16998b
1,"Categories: Public Health, Epidemiology/Public...",data_input\cureus\cureus-0015-00000040274.txt,c11f8819566243cab7df152d63051847
2,Introduction And Background India’s health ind...,data_input\cureus\cureus-0015-00000040274.txt,7e741fc9e6354dac96570afb1f709dd2
3,"policy research, consisting of readying one's ...",data_input\cureus\cureus-0015-00000040274.txt,c549bcf39b33469ea817dc1acc0db5a9
4,Review Overview of the public and private heal...,data_input\cureus\cureus-0015-00000040274.txt,5ce58c8d0a21413398ea4128bd39e902


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, both dataframes are regenerated and written to the output directory as CSV files, so that they do not have to be computed again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe is read back from the output directory.

In [5]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Extract concept pairs from the chunks with the given model (project helper),
    ## then convert the result into an edge dataframe.
    concepts_list = df2Graph(df, model='gpt-4.1-mini')
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
    os.makedirs(outputdirectory, exist_ok=True)

    ## Cache both frames so that a later run can set regenerate = False.
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Treat empty strings as missing, drop incomplete edges and give every
## LLM-extracted relation a weight of 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
## One chained expression (no inplace=True) so the cleanup never writes into a frame
## that another variable may still reference.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(334, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,india,health indicators,India's health indicators have improved but st...,bdc817b6959847fbb40743f9ff16998b,4
1,india,population of 1.3 billion,India is a country with a population of 1.3 bi...,bdc817b6959847fbb40743f9ff16998b,4
2,active health workers density,doctors and nurses/midwives,Active health workers density in India include...,bdc817b6959847fbb40743f9ff16998b,4
3,active health workers density,"5.0 and 6.0 respectively per 10,000 persons",The density of doctors and nurses/midwives is ...,bdc817b6959847fbb40743f9ff16998b,4
4,india,"who threshold of 44.5 doctors, nurses, and mid...",India's active health workers density is much ...,bdc817b6959847fbb40743f9ff16998b,4


## Calculating contextual proximity

In [6]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that are mentioned in the same text chunk.

    Parameters
    ----------
    df : pd.DataFrame
        Edge list with at least the columns ``node_1``, ``node_2`` and ``chunk_id``.

    Returns
    -------
    pd.DataFrame
        One row per ordered node pair that co-occurs more than once, with the columns
        ``node_1``, ``node_2``, ``chunk_id`` (comma-joined ids of the chunks in which
        the pair co-occurs, repeated once per co-occurrence), ``count`` (number of
        co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        The row index is the one left over from the group-by, i.e. it has gaps.
    """
    ## Melt the dataframe into a list of nodes: one row per (chunk_id, node) mention.
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    grouped = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    grouped.columns = ["node_1", "node_2", "chunk_id", "count"]
    ## Empty-string node names count as missing and are discarded.
    grouped = grouped.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. `.assign` returns a new frame, so the label is not
    # written into a filtered slice (which raised SettingWithCopyWarning).
    return grouped[grouped["count"] != 1].assign(edge="contextual proximity")


## Derive the co-occurrence ("contextual proximity") edges from the extracted edge list.
dfg2 = contextual_proximity(dfg1)
## Display only the last few rows.
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
11053,world-class health facilities,world health organization,"5ce58c8d0a21413398ea4128bd39e902,5ce58c8d0a214...",2,contextual proximity
11064,young and early career doctors,india,"7cdeca8fa8ee4a8ab309fcc6de187fda,7cdeca8fa8ee4...",2,contextual proximity
11066,young and early career doctors,medical colleges,"7cdeca8fa8ee4a8ab309fcc6de187fda,7cdeca8fa8ee4...",2,contextual proximity
11067,young and early career doctors,performance targets and practice constraints,"7cdeca8fa8ee4a8ab309fcc6de187fda,7cdeca8fa8ee4...",2,contextual proximity
11068,young and early career doctors,private sector,"7cdeca8fa8ee4a8ab309fcc6de187fda,7cdeca8fa8ee4...",3,contextual proximity


### Merge both the dataframes

In [7]:
## Stack the extracted edges and the contextual-proximity edges, then collapse duplicate
## (node_1, node_2) pairs: chunk ids and edge labels are comma-joined, counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"], as_index=False)
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,"0.576 physicians per 1,000 population in 2000",5.1% of the gdp,"5ce58c8d0a21413398ea4128bd39e902,5ce58c8d0a214...",contextual proximity,2
1,"0.576 physicians per 1,000 population in 2000",for-profit private health sector,"5ce58c8d0a21413398ea4128bd39e902,5ce58c8d0a214...",contextual proximity,3
2,"0.576 physicians per 1,000 population in 2000",government-funded health sector,"5ce58c8d0a21413398ea4128bd39e902,5ce58c8d0a214...",contextual proximity,3
3,"0.576 physicians per 1,000 population in 2000",india,"5ce58c8d0a21413398ea4128bd39e902,5ce58c8d0a214...",contextual proximity,2
4,"0.576 physicians per 1,000 population in 2000",individual informal provider clinics,"5ce58c8d0a21413398ea4128bd39e902,5ce58c8d0a214...",contextual proximity,2
...,...,...,...,...,...
4484,young and early career doctors,erosion of status and opportunity,7cdeca8fa8ee4a8ab309fcc6de187fda,The majority of young and early career doctors...,4
4485,young and early career doctors,india,"7cdeca8fa8ee4a8ab309fcc6de187fda,7cdeca8fa8ee4...",contextual proximity,2
4486,young and early career doctors,medical colleges,"7cdeca8fa8ee4a8ab309fcc6de187fda,7cdeca8fa8ee4...",contextual proximity,2
4487,young and early career doctors,performance targets and practice constraints,"7cdeca8fa8ee4a8ab309fcc6de187fda,7cdeca8fa8ee4...",contextual proximity,2


## Calculate the NetworkX Graph

In [8]:
## Distinct node names in order of first appearance (all of node_1, then all of node_2).
nodes = dfg.melt(value_vars=["node_1", "node_2"], value_name="node")["node"].unique()
nodes.shape

(428,)

In [9]:
import networkx as nx
G = nx.Graph()

## Register every node under its string form.
G.add_nodes_from(str(name) for name in nodes)

## One edge per row of dfg: the edge text becomes the title,
## and the summed count is scaled down by 4 to give the weight.
G.add_edges_from(
    (str(source), str(target), {"title": label, "weight": total / 4})
    for source, target, label, total in zip(
        dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
    )
)

### Calculate communities for coloring the nodes

In [10]:
## girvan_newman yields successively finer partitions of G; take the first two levels.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Use the second level: sort the members of each community, then sort the communities.
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  5
[['0.576 physicians per 1,000 population in 2000', '0.7 public hospital beds per 100,000 people', '1% of the gdp', '1.28% of the gdp', '2005', '2022', '275,000', '279 medical colleges', '37% of the population', '5.0 and 6.0 respectively per 10,000 persons', '5.1% of the gdp', 'accredit health facilities', 'accurate data about quantity and geospatial location of manpower', 'active health workers density', "active health workers' density in india", 'affordable, accessible, quality care', 'ai-embedded algorithms', 'application of technology in health sector', 'article', 'ashas', "authors' experiential learning", 'auxiliary nurse midwives', 'ayush', 'ayushman bharat', 'behavior modification', 'better health for all indians', 'booming medical tourism industry and medical schools', 'bridge courses', 'broad overarching policy', 'capacity to utilize funds', 'care for economically weaker sections', 'cause of concern', 'central government', 'centralizing control of med

### Create a dataframe for community colors

In [None]:
!pip install seaborn

In [11]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one color and one group number to every node of every community.

    A palette with as many colors as there are communities is taken from seaborn
    (module-level ``palette`` name), converted to hex strings and shuffled with the
    global ``random`` state. Communities are numbered from 1 in the order given.

    Returns a dataframe with one row per node and the columns ``node``, ``color``
    and ``group``.
    """
    shuffled = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled)
    ## Colors are handed out from the end of the shuffled list towards its start.
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (members, shade) in enumerate(
            zip(communities, reversed(shuffled)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


## One row per node with the hex color and the group number of its community.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,"0.576 physicians per 1,000 population in 2000",#b9db57,1
1,"0.7 public hospital beds per 100,000 people",#b9db57,1
2,1% of the gdp,#b9db57,1
3,1.28% of the gdp,#b9db57,1
4,2005,#b9db57,1
...,...,...,...
423,national health policy 2017,#5784db,5
424,oop expenditures,#5784db,5
425,population fraction,#5784db,5
426,poverty due to healthcare expenses,#5784db,5


### Add colors to the graph

In [12]:
## Copy community group and color onto each graph node; the node size is its degree.
for entry in colors.itertuples(index=False):
    attributes = G.nodes[entry.node]
    attributes['group'] = entry.group
    attributes['color'] = entry.color
    attributes['size'] = G.degree[entry.node]

In [None]:
!pip install pyvis

In [13]:
import os
from pyvis.network import Network

## Path of the HTML file to generate (despite the name, this is a file, not a folder).
graph_output_directory = "./docs/index.html"
## Make sure the folder that will contain the HTML file exists.
os.makedirs(os.path.dirname(graph_output_directory), exist_ok=True)

## Optional dark theme: pass bgcolor="#1a1a1a" and font_color="#cccccc".
net = Network(
    height="900px",
    width="100%",
    notebook=False,
    cdn_resources="remote",
    select_menu=True,
    filter_menu=False,
)

net.from_nx(G)
## Layout. Alternatives that were tried:
##   net.repulsion(node_distance=150, spring_length=400)
##   net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
## Expose only the physics controls in the generated page.
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
