## Setup

In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Dataset folder name, shared by the input and the output locations
data_dir = "cureus"
out_dir = data_dir

## The source documents are read from here
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [3]:
import unstructured
## Text splitter settings: chunk_size 1500 and chunk_overlap 150, both measured
## with len; separators are not treated as regular expressions.
## It is built first because it does not depend on the loaded documents.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500, chunk_overlap=150, length_function=len, is_separator_regex=False
)

## Alternative loaders, kept for reference:
##   Dir PDF Loader: PyPDFDirectoryLoader(inputdirectory)
##   File Loader:    PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()
pages = splitter.split_documents(documents)

## Report the number of chunks and preview the second one
print("Number of chunks = ", len(pages))
print(pages[1].page_content)


100%|██████████| 1/1 [00:06<00:00,  6.05s/it]

Number of chunks =  23
Categories: Public Health, Epidemiology/Public Health, Health Policy Keywords: working conditions, indian public health standards, auxiliary nurse midwives, human resource, health sector reform, india, health policy





## Create a dataframe of all the chunks

In [4]:
from helpers.df_helpers import documents2Dataframe
## Turn the list of chunks into a dataframe using the helper imported from helpers.df_helpers
df = documents2Dataframe(pages)
## Print the dataframe dimensions, then display its first rows as the cell output
print(df.shape)
df.head()

(23, 3)


Unnamed: 0,text,source,chunk_id
0,Abstract India’s health indicators have improv...,data_input\cureus\cureus-0015-00000040274.txt,17729efad4ee414d8842f7cb96201e60
1,"Categories: Public Health, Epidemiology/Public...",data_input\cureus\cureus-0015-00000040274.txt,56c92a38751f4c77a173752097496dc4
2,Introduction And Background India’s health ind...,data_input\cureus\cureus-0015-00000040274.txt,2a6cb33c1d4d4405afeb890d6aabf75d
3,"An extensive literature search was performed, ...",data_input\cureus\cureus-0015-00000040274.txt,5525aea7b99d4dd18d6e67f45f0f85fd
4,Review Overview of the public and private heal...,data_input\cureus\cureus-0015-00000040274.txt,b8acecf8c5474ce684b48caa9c3eca26


## Extract Concepts

In [5]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to `True`, the dataframes are regenerated and both of them are written to CSV files so we don't have to calculate them again.

        dfne = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe is read back from `graph.csv` in the output directory.

In [6]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True replaces the separate "does it exist?" check, so the cell
    ## also works when the folder is already there.
    os.makedirs(outputdirectory, exist_ok=True)

    ## Cache both dataframes so the LLM pass does not have to be repeated
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Drop edges whose endpoints or relation text are empty or missing, then set the
## weight. Built as one new frame instead of mutating dfg1 in place, so the
## cleaning steps read top-down.
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
   {
       "node_1": "Universal health coverage",
       "node_2": "Skilled health workforce",
       "edge": "The paucity of skilled personnel in India's healthcare system is hindering the country's progress towards achieving universal health coverage."
   },
   {
       "node_1": "Health indicators",
       "node_2": "Skilled health workforce",
       "edge": "Improvement in India's health indicators is dependent on the augmentation of the skilled health workforce."
   },
   {
       "node_1": "WHO threshold",
       "node_2": "India",
       "edge": "India's estimated active health workers density falls significantly below the WHO threshold for doctors and nurses/midwives, indicating a need for urgent action."
   },
   {
       "node_1": "Public-private sector divide",
       "node_2": "Skilled health workforce",
       "edge": "The skewed inter-state, urban-rural, and public-private sector divide in the distribution of skilled health personnel is a compounding issue in India's h

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,universal health coverage,skilled health workforce,The paucity of skilled personnel in India's he...,17729efad4ee414d8842f7cb96201e60,4
1,health indicators,skilled health workforce,Improvement in India's health indicators is de...,17729efad4ee414d8842f7cb96201e60,4
2,who threshold,india,India's estimated active health workers densit...,17729efad4ee414d8842f7cb96201e60,4
3,public-private sector divide,skilled health workforce,"The skewed inter-state, urban-rural, and publi...",17729efad4ee414d8842f7cb96201e60,4
4,health budget,skilled health workforce,The recent increase in the federal health budg...,17729efad4ee414d8842f7cb96201e60,4


## Calculating contextual proximity

In [7]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that are mentioned in the same text chunk.

    Every node (taken from both ``node_1`` and ``node_2``) is paired with every
    different node that shares its ``chunk_id``. Pairs are kept in both
    directions, so (a, b) and (b, a) each get their own row.

    Args:
        df: Edge dataframe with ``chunk_id``, ``node_1`` and ``node_2`` columns.
            It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id`` (the
        comma-joined chunk ids of every co-occurrence), ``count`` (the number
        of co-occurrences) and ``edge`` (always "contextual proximity").
        Pairs that co-occur only once are left out.
    """
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    dfg_pairs = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]].reset_index(
        drop=True
    )
    ## Group and count edges.
    dfg2 = (
        dfg_pairs.groupby(["node_1", "node_2"])
        .agg(chunk_id=("chunk_id", ",".join), count=("chunk_id", "count"))
        .reset_index()
        .replace("", np.nan)
        .dropna(subset=["node_1", "node_2"])
    )
    # Drop edges with 1 count.
    # assign() builds a new frame, so the label is never written onto a filtered
    # slice (which is what raises SettingWithCopyWarning).
    return dfg2[dfg2["count"] != 1].assign(edge="contextual proximity")


## Derive the co-occurrence edges from the extracted edges, then display the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
2560,world health organization's recommended,private health sector,"b8acecf8c5474ce684b48caa9c3eca26,b8acecf8c5474...",3,contextual proximity
2570,world-class health facilities,government-funded health sector,"b8acecf8c5474ce684b48caa9c3eca26,b8acecf8c5474...",2,contextual proximity
2574,world-class health facilities,india,"08c0abd7b63b4174899710638c725fab,b8acecf8c5474...",3,contextual proximity
2579,world-class health facilities,private health sector,"b8acecf8c5474ce684b48caa9c3eca26,b8acecf8c5474...",3,contextual proximity
2580,world-class health facilities,public health sector (nhm),"08c0abd7b63b4174899710638c725fab,08c0abd7b63b4...",7,contextual proximity


### Merge both the dataframes

In [8]:
## Stack the extracted edges on top of the contextual-proximity edges, then
## collapse the rows that share the same (node_1, node_2) pair: chunk ids and
## edge texts are comma-joined and the counts are added up.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,accredit health facilities,create and enforce new rules,"6f6d49e87c4940d4b49e8337b6238f25,6f6d49e87c494...",contextual proximity,2
1,accredit health facilities,enforce existing rules,"6f6d49e87c4940d4b49e8337b6238f25,6f6d49e87c494...",contextual proximity,2
2,accredit health facilities,government,"6f6d49e87c4940d4b49e8337b6238f25,6f6d49e87c494...",contextual proximity,4
3,accredit health facilities,health insurance scheme for central government...,"6f6d49e87c4940d4b49e8337b6238f25,6f6d49e87c494...",contextual proximity,2
4,accredit health facilities,limited rules,"6f6d49e87c4940d4b49e8337b6238f25,6f6d49e87c494...",contextual proximity,2
...,...,...,...,...,...
705,world health organization's recommended,private health sector,"b8acecf8c5474ce684b48caa9c3eca26,b8acecf8c5474...",contextual proximity,3
706,world-class health facilities,government-funded health sector,"b8acecf8c5474ce684b48caa9c3eca26,b8acecf8c5474...",contextual proximity,2
707,world-class health facilities,india,"08c0abd7b63b4174899710638c725fab,08c0abd7b63b4...",India has become a leading destination for med...,7
708,world-class health facilities,private health sector,"b8acecf8c5474ce684b48caa9c3eca26,b8acecf8c5474...",contextual proximity,3


## Calculate the NetworkX Graph

In [9]:
## Distinct node names from both endpoint columns (node_1 values first, then node_2)
nodes = pd.unique(pd.concat([dfg["node_1"], dfg["node_2"]], axis=0))
nodes.shape

(195,)

In [10]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph
## G is undirected, but dfg can hold both an (a, b) and a (b, a) row. Calling
## add_edge on an edge that already exists overwrites its attributes, so a later,
## lighter row (typically the mirrored "contextual proximity" one) would wipe out
## the extracted relation text and its weight. The heavier row is kept instead;
## when both rows weigh the same and their texts differ, the texts are merged.
for node_1, node_2, edge, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    source, target = str(node_1), str(node_2)
    title, weight = edge, count / 4
    if G.has_edge(source, target):
        current = G.edges[source, target]
        if current["weight"] > weight:
            # The edge already stored carries more evidence; leave it untouched.
            continue
        if current["weight"] == weight and current["title"] != title:
            title = f"{current['title']},{title}"
    G.add_edge(source, target, title=title, weight=weight)

### Calculate communities for coloring the nodes

In [11]:
## Take the first two partitions produced by Girvan-Newman; the second one is
## the one used for coloring the nodes.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)

## Sort the members inside each community, then sort the communities themselves
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  15
[['accredit health facilities', 'accurate data about practicing doctors', 'affordable, accessible, quality care', 'allocation to healthcare', 'auxiliary nurse midwives', 'better health for all indians', 'brazil', 'central government', 'china', 'communitization', 'consumers', 'consumes 5.1% of the gdp', 'corporate houses', 'create and enforce new rules', 'doctor-to-population ratio', 'domestic general government health expenditure', 'dominant provider of healthcare', 'enforce existing rules', 'epidemiology/public health', 'flexible financing', 'for-profit private health sector', 'gdp of india', 'government', 'government of india', 'government-funded health sector', 'health challenges', 'health insurance scheme for central government employees', 'health policy', 'health sector reform', 'healthcare', 'healthcare expenditure per citizen per year', 'human resource', 'improved management through capacity building', 'india', 'indian government', 'indian public heal

### Create a dataframe for community colors

In [12]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community one color and one group number.

    Args:
        communities: Sized collection of communities, each an iterable of
            node names.

    Returns:
        Dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (1-based position of the node's community).
    """
    ## One hex color per community, taken from the module-level palette and shuffled
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    records = []
    for group_number, members in enumerate(communities, start=1):
        shade = hex_colors.pop()
        records.extend(
            {"node": member, "color": shade, "group": group_number}
            for member in members
        )
    return pd.DataFrame(records)


## Build the per-node color/group table for the detected communities and display it
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,accredit health facilities,#84db57,1
1,accurate data about practicing doctors,#84db57,1
2,"affordable, accessible, quality care",#84db57,1
3,allocation to healthcare,#84db57,1
4,auxiliary nurse midwives,#84db57,1
...,...,...,...
190,personnel retention,#57db94,13
191,training received positive evaluations,#dbc957,14
192,two-year intermittent training initiative,#dbc957,14
193,training schedules,#c957db,15


### Add colors to the graph

In [13]:
## Copy each node's community group and color onto the graph, and use the
## node's degree as its size.
for entry in colors.itertuples(index=False):
    G.nodes[entry.node].update(
        group=entry.group,
        color=entry.color,
        size=G.degree[entry.node],
    )

In [14]:
from pyvis.network import Network

## Path of the HTML file that will hold the interactive graph
graph_output_directory = "./docs/index.html"
## Make sure the target folder exists before the file is written; on a fresh
## checkout ./docs may be missing and the export would fail.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Load the NetworkX graph, including the node and edge attributes set above
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Expose only the physics controls in the generated page
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
