## Setup

In [102]:
# ! pip install python-magic-bin langchain-community python-magic
!pip install "unstructured[pdf]"




In [103]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory: the source documents are read from ./data_input/<data_dir>
data_dir = "cureus"
inputdirectory = Path(f"./data_input/{data_dir}")
## Output directory: the generated csv files are written to ./data_output/<out_dir>
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [104]:
## Alternative loaders, kept for reference only:
# loader = PyPDFDirectoryLoader(inputdirectory)          # every PDF in the input directory
# loader = PyPDFLoader("./data_input/cureus/Yoga.pdf")   # a single PDF file
## Loader in effect: DirectoryLoader picks up every file in the input directory.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the loaded documents into overlapping character chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview one chunk; the guard keeps a small corpus (fewer than 4 chunks)
## from raising IndexError.
if len(pages) > 3:
    print(pages[3].page_content)


100%|██████████| 2/2 [00:01<00:00,  1.95it/s]

Number of chunks =  39
BRANCHES OF AYURVEDA:

Ayurveda is divided into eight components; these components are:

❖ Kaya Chikitsa (Internal Medicine) ❖ Bala Chikitsa (Treatment of Children / Pediatrics) ❖ Graha Chikitsa (Demonology / Psychology) ❖ Urdhvaanga Chikitsa (Treatment of disease above the clavicle) ❖ Shalya Chikitsa (Surgery) ❖ Agada Tantra / Visha Chikitsa: Toxicology ❖ Jara / Rasayana Tantra: Geriatrics

1

❖ Vajikarana: Reproductive sciences

TREATMENT MODALITIES:

Nidan Parivarjan(Avoidance of the disease-causing and aggravating factors)

Nidan Parivarjan is to avoid the disease-causing factors in the diet and lifestyle of the patients. It encompasses the idea of refraining from the precipitating or aggravating factors of the disease.

Shamana Therapy (Palliative Treatment)

Shamana therapy suppresses vitiated humour (Doshas). The process by which vitiated humour subsides or returns to normal without creating an imbalance of other humours is known as Shamana. This treatment




## Create a dataframe of all the chunks

In [105]:
## Turn the list of chunks into a dataframe using the project helper from helpers/df_helpers
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(39, 3)


Unnamed: 0,text,source,chunk_id
0,AYURVEDA\n\nINTRODUCTION\n\nAyurveda is the sc...,data_input\cureus\AYURVEDA.pdf,7b0299dc89bb4a5c9c95e53e1c5186ea
1,"According to Ayurveda, everything in Universe ...",data_input\cureus\AYURVEDA.pdf,f4ab7571bc644ab1900704b11583b2ed
2,"that governs metabolism is called Agni, which ...",data_input\cureus\AYURVEDA.pdf,ce9c6686673c402fb76461862d6eabbc
3,BRANCHES OF AYURVEDA:\n\nAyurveda is divided i...,data_input\cureus\AYURVEDA.pdf,eb955a6fd22f4a5bbc11fa067d3493a6
4,Satvavajaya (Psychotherapy)\n\nSatvavajaya tre...,data_input\cureus\AYURVEDA.pdf,3752c074bfa5433c967f963c0ae696c3


## Extract Concepts

In [106]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to CSV files so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the edges dataframe is read from the output directory.

In [107]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Extract concepts from every chunk with the LLM and convert them into a dataframe of edges.
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    ## mkdir(exist_ok=True) replaces the check-then-create pair
    ## os.path.exists + os.makedirs and creates missing parents as well.
    outputdirectory.mkdir(parents=True, exist_ok=True)

    ## Cache both dataframes so the LLM does not have to run again.
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Treat empty strings as missing and drop edges without both nodes and a description.
## The result is rebound in one chain instead of mutating dfg1 in place, so
## re-running the cell always starts from the frame produced above.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    ## Increasing the weight of the relation to 4.
    ## We will assign the weight of 1 when later the contextual proximity will be calculated.
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
   {
       "node_1": "positive health",
       "node_2": "Dosha Dhatu Samya",
       "edge": "Positive health is defined as Dosha Dhatu Samya in Ayurveda, which represents a well-balanced metabolism."
   },
   {
       "node_1": "positive health",
       "node_2": "prasanna Atma Indriya Manah",
       "edge": "Positive health also includes prasanna Atma Indriya Manah, which refers to a happy state of the soul, senses and mind in Ayurveda."
   },
   {
       "node_1": "Dharma",
       "node_2": "positive health",
       "edge": "In Ayurveda, attaining Dharma is not possible without sound positive health."
   },
   {
       "node_1": "Artha",
       "node_2": "positive health",
       "edge": "Similarly, achieving Artha, which refers to prosperity and wealth, is also dependent on positive health in Ayurveda."
   },
   {
       "node_1": "Kama",
       "node_2": "positive health",
       "edge": "Lastly, Kama or pleasures are also related to positive health in Ayurveda as a healthy bod

Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,edge_1,count
0,positive health,dosha dhatu samya,Positive health is defined as Dosha Dhatu Samy...,7b0299dc89bb4a5c9c95e53e1c5186ea,,,4
1,positive health,prasanna atma indriya manah,Positive health also includes prasanna Atma In...,7b0299dc89bb4a5c9c95e53e1c5186ea,,,4
2,dharma,positive health,"In Ayurveda, attaining Dharma is not possible ...",7b0299dc89bb4a5c9c95e53e1c5186ea,,,4
3,artha,positive health,"Similarly, achieving Artha, which refers to pr...",7b0299dc89bb4a5c9c95e53e1c5186ea,,,4
4,kama,positive health,"Lastly, Kama or pleasures are also related to ...",7b0299dc89bb4a5c9c95e53e1c5186ea,,,4


## Calculating contextual proximity

In [116]:
def contextual_proximity(df: pd.DataFrame, min_count: int = 2) -> pd.DataFrame:
    """Link every pair of distinct nodes that are mentioned in the same text chunk.

    Parameters
    ----------
    df : pd.DataFrame
        Edge dataframe with at least the columns ``node_1``, ``node_2`` and
        ``chunk_id``. It is not modified.
    min_count : int, default 2
        Minimum number of co-occurrences a pair needs to be kept. The default
        drops the pairs that were seen only once.

    Returns
    -------
    pd.DataFrame
        One row per ordered node pair (so each pair shows up in both
        directions) with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids, one entry per co-occurrence), ``count`` and
        ``edge`` (always ``"contextual proximity"``). The index is the one left
        over after filtering and is not reset.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows
    nodes_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    pairs = pd.merge(nodes_long, nodes_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    is_self_loop = pairs["node_1"] == pairs["node_2"]
    pairs = pairs[~is_self_loop].reset_index(drop=True)
    ## Group and count edges.
    proximity = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    # Flatten the two-level column index produced by the multi-function agg.
    proximity.columns = ["node_1", "node_2", "chunk_id", "count"]
    # Empty node names are treated as missing and removed.
    proximity = proximity.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges that co-occur fewer than min_count times (count is always >= 1).
    proximity = proximity[proximity["count"] >= min_count]
    # assign returns a new frame, so no column is ever written onto a filtered slice.
    return proximity.assign(edge="contextual proximity")


## Derive the contextual proximity edges from the concept edges and peek at the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
5050,yogaposenet,bhattacharyya et al. (2021),"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",2,contextual proximity
5051,yogaposenet,international conference on computer vision an...,"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",2,contextual proximity
5052,yogaposenet,real-time detection,"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",2,contextual proximity
5053,yogaposenet,recognition,"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",2,contextual proximity
5054,yogaposenet,yoga pose detection,"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",4,contextual proximity


### Merge both the dataframes

In [117]:
## Stack the concept edges on top of the proximity edges, then collapse rows
## that describe the same (node_1, node_2) pair: chunk ids and edge texts are
## comma-joined and the counts are added up.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,3d landmark data,blaze pose model,"ecffc8050b5d45c0979134853a8107fc,ecffc8050b5d4...",Produced by Blaze pose model without requiring...,6
1,3d landmark data produced by blaze pose model,blaze pose model,"ecffc8050b5d45c0979134853a8107fc,ecffc8050b5d4...",contextual proximity,2
2,99.49%,cnns,"5df2d37aa501404bad76ac24582b0ecc,5df2d37aa5014...",contextual proximity,2
3,99.49%,monitoring yoga positions,"5df2d37aa501404bad76ac24582b0ecc,5df2d37aa5014...",contextual proximity,2
4,99.70%,cnns,"5df2d37aa501404bad76ac24582b0ecc,5df2d37aa5014...",contextual proximity,2
...,...,...,...,...,...
1833,yogaposenet,bhattacharyya et al. (2021),"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",contextual proximity,2
1834,yogaposenet,international conference on computer vision an...,"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",contextual proximity,2
1835,yogaposenet,real-time detection,"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",YogaPoseNet is a deep learning network develop...,6
1836,yogaposenet,recognition,"544f3b1d67064b9b972a17a8b7d887e8,544f3b1d67064...",YogaPoseNet is also capable of recognizing yog...,6


## Calculate the NetworkX Graph

In [118]:
## Unique node names that appear on either end of an edge
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(350,)

In [119]:
import networkx as nx
G = nx.Graph()

## Register every node under its string form
G.add_nodes_from(str(name) for name in nodes)

## Connect the nodes: the edge text becomes the title and a quarter of the
## count becomes the weight
for source, target, description, occurrences in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(
        str(source),
        str(target),
        title=description,
        weight=occurrences/4
    )

### Calculate communities for coloring the nodes

In [120]:
## girvan_newman is consumed as an iterator: every next() call yields one partition of G
## (presumably each one finer than the previous, per the networkx docs -- confirm)
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Keep the second partition; nodes inside a community and the communities themselves are sorted
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  34
[['3d landmark data', '3d landmark data produced by blaze pose model', 'blaze pose model', 'deep learning and machine learning models', 'existing pose estimation frameworks', 'media pipe blaze pose model', 'real-time yoga pose detection', 'real-time yoga/fitness applications', 'single person detections', 'xgboost classifier'], ['99.49%', '99.70%', 'asanas', 'body configuration estimation', "calculates each yoga pose's probability for the current image sequence", 'cnn', 'cnn and lstm models', 'cnn architecture', 'cnns', 'colour picture', 'confusion matrix', 'cross-platform media pipe library', 'database', 'deep learning', 'deep learning model', "discussion of approach's limitations", 'epipolar posture', 'exponential function of to determine the probability of each yoga pose', 'extracting key points', 'features', 'forecasting asanas', 'general positions', 'half-moon pose', 'highest probability', 'joints', 'json file containing joint position values', 'key poin

### Create a dataframe for community colors

In [121]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities, seed=None) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    Parameters
    ----------
    communities : sequence of iterables
        The communities, each one an iterable of node names. Must support
        ``len()``.
    seed : optional
        When given, the colors are shuffled with a private
        ``random.Random(seed)`` so the color assignment is reproducible
        between runs. ``None`` (the default) shuffles with the module-level
        ``random`` generator.

    Returns
    -------
    pd.DataFrame
        One row per node with the columns ``node``, ``color`` (hex string) and
        ``group`` (1-based position of the node's community).

    Notes
    -----
    Reads the module-level ``palette`` name to pick the seaborn palette.
    """
    ## One color per community, taken from the configured palette
    shades = sns.color_palette(palette, len(communities)).as_hex()
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(shades)
    rows = []
    for group, community in enumerate(communities, start=1):
        # Colors are consumed from the end of the shuffled list.
        color = shades.pop()
        rows.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    return pd.DataFrame(rows)


## One row per node with the color and group of its community
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,3d landmark data,#c457db,1
1,3d landmark data produced by blaze pose model,#c457db,1
2,blaze pose model,#c457db,1
3,deep learning and machine learning models,#c457db,1
4,existing pose estimation frameworks,#c457db,1
...,...,...,...
345,rgb images,#db57db,32
346,popularization of yoga practice,#7e57db,33
347,self-study yoga programme,#7e57db,33
348,yoga pose classification,#9557db,34


### Add colors to the graph

In [122]:
## Copy group and color onto each graph node and size the node by its degree
for node_name, node_color, node_group in zip(
    colors["node"], colors["color"], colors["group"]
):
    attributes = G.nodes[node_name]
    attributes['group'] = node_group
    attributes['color'] = node_color
    attributes['size'] = G.degree[node_name]

In [123]:
from pyvis.network import Network

## Path of the html file the interactive graph is written to
graph_output_directory = "./docs/index.html"
## Create the target folder first so that writing the html file does not fail
## when ./docs is missing (e.g. on a fresh checkout).
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import nodes, edges and their attributes from the networkx graph
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Expose only the physics controls in the rendered page
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
