# GraphRAG

### Installation of Libraries

In [1]:
# !pip install graphrag
# !pip install PyMuPDF
# !pip install PyPDF2
# !pip install openpyxl



In [1]:
import asyncio
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime

import nest_asyncio
import pandas as pd
import tiktoken
from PyPDF2 import PdfReader

nest_asyncio.apply()  # Allows nesting of event loops

In [2]:

# Local Search Libraries
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

# Global Search Libraries
import os
import pandas as pd
import tiktoken
from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (GlobalCommunityContext,)
from graphrag.query.structured_search.global_search.search import GlobalSearch

In [3]:
# !mkdir -p ./ragtest/input

### Setting Up the Workspace Variables

In [4]:
# !python -m graphrag.index --init --root ./ragtest

In [5]:
## For generating custom, automatically tuned prompts
# !python -m graphrag.prompt_tune --root ./ragtest --domain "medical experimentsr research articles" --method random --limit 10 --max-tokens 2048 --chunk-size 256 --no-entity-types --output ./prompt_medical_experiments_research_articles

### Deleting the previously generated data in the output folder

In [6]:
# Output folder left behind by a previous indexing run.
dir_path = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\output'

# Nothing to clean when the folder is absent; otherwise remove the whole tree.
if not os.path.exists(dir_path):
    print(f'Directory {dir_path} does not exist.')
else:
    shutil.rmtree(dir_path)
    print(f'Directory {dir_path} has been deleted successfully.')


Directory C:\Users\15011\Documents\Projects\GraphRAG\ragtest\output has been deleted successfully.


### Converting the PDFs to .txt files and running the indexing pipeline

In [7]:
# Folder layout: source PDFs, text files fed to the indexer, indexer output.
pdf_dir = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\pdfs'
input_dir = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input'
output_dir = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\output'

# Names of the generated .txt files, collected in processing order.
input_file_names = list()

# Both working folders must exist before anything is written into them.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(input_dir, exist_ok=True)

# Function to convert a PDF to a text file
def pdf_to_text(pdf_path, text_path):
    """Extract the text of every page of a PDF into a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF file to read.
        text_path: Path of the text file to create (overwritten if it exists).

    Pages that yield no text (``None`` or an empty string) contribute nothing.
    Every non-empty page is terminated with a newline so the last word of one
    page is not glued to the first word of the next.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        with open(text_path, 'w', encoding='utf-8') as text_file:
            for page in pdf_reader.pages:
                # Guard against a page with no extractable text so write()
                # never receives None.
                text = page.extract_text() or ''
                if not text:
                    continue
                text_file.write(text)
                if not text.endswith('\n'):
                    text_file.write('\n')

# Delete existing files in the input directory
def clear_input_directory():
    """Remove every regular file located directly inside ``input_dir``.

    Entries that are not files are reported and left in place. Any exception
    raised while handling an entry is printed and the sweep continues with
    the next entry.
    """
    for entry_name in os.listdir(input_dir):
        target = os.path.join(input_dir, entry_name)
        try:
            if not os.path.isfile(target):
                print(f"{target} is not a file.")
                continue
            os.remove(target)
            print(f"Deleted {target}")
        except Exception as err:
            print(f"Error deleting {target}: {err}")

# Iterate over each PDF file in the PDF directory
# Process the PDFs one at a time, in a deterministic (name-sorted) order.
for pdf_file_name in sorted(os.listdir(pdf_dir)):
    # Accept the extension in any letter case ('.pdf', '.PDF', ...).
    if not pdf_file_name.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(pdf_dir, pdf_file_name)

    # Each PDF is indexed on its own, so the input folder may only contain
    # the text file of the PDF currently being processed.
    clear_input_directory()

    # Convert the PDF file to a text file and save it in the input directory
    txt_file_name = os.path.splitext(pdf_file_name)[0] + '.txt'
    txt_path = os.path.join(input_dir, txt_file_name)
    pdf_to_text(pdf_path, txt_path)

    # Remember the processing order; a later cell pairs it with the output folders.
    input_file_names.append(txt_file_name)
    # Wait to ensure unique timestamp-based names
    time.sleep(1)

    # Run the indexer with the interpreter that runs this kernel, so the
    # subprocess sees the same installed packages as the notebook.
    command = [sys.executable, '-m', 'graphrag.index', '--root', './ragtest']
    completed = subprocess.run(command)
    if completed.returncode != 0:
        # Keep going with the remaining PDFs, but make the failure visible.
        print(f"Indexing failed for {txt_file_name} (exit code {completed.returncode}).")

    # Wait to ensure unique timestamp-based names
    time.sleep(1)


Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\38137852.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\36340628.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37685623.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37713020.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37719621.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37720667.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37736320.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37736447.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37772223.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37779779.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37791144.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37868382.txt
Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input

In [8]:
# List the entries of the output folder, ordered by os.path.getctime
# (the creation time on Windows), i.e. in the order the runs were produced.
output_files = sorted(os.listdir(output_dir), key=lambda x: os.path.getctime(os.path.join(output_dir, x)))

# Destination of the input-file -> output-folder mapping.
file_path = r'C:\Users\15011\Documents\Projects\GraphRAG\pubmeds\input_output_mapping.csv'

# The pairing below is purely positional, so it is only valid when both
# lists have the same length.
if len(input_file_names) != len(output_files):
    # No mapping can be built here; nothing is saved, so a stale or undefined
    # DataFrame is never written to disk.
    print("Mismatch between number of input and output files.")
else:
    # Map input files to output files based on the order of processing
    file_data = [
        {'input_file': input_file, 'output_file': output_file}
        for input_file, output_file in zip(input_file_names, output_files)
    ]

    # Create a DataFrame from the list
    txt_KnowledgeGraph_mapping = pd.DataFrame(file_data)

    # Display the DataFrame
    print(txt_KnowledgeGraph_mapping)

    # Save the DataFrame to CSV (only when a fresh mapping was built)
    txt_KnowledgeGraph_mapping.to_csv(file_path, index=False)

      input_file      output_file
0   36340628.txt  20240819-120238
1   37685623.txt  20240819-120509
2   37713020.txt  20240819-120831
3   37719621.txt  20240819-121138
4   37720667.txt  20240819-121327
5   37736320.txt  20240819-121553
6   37736447.txt  20240819-121759
7   37772223.txt  20240819-121952
8   37779779.txt  20240819-122139
9   37791144.txt  20240819-122322
10  37868382.txt  20240819-122519
11  37902387.txt  20240819-122636
12  37908212.txt  20240819-122852
13  37908579.txt  20240819-123051
14  37965705.txt  20240819-123219
15  37970083.txt  20240819-123425
16  37977830.txt  20240819-123704
17  38020190.txt  20240819-123945
18  38022102.txt  20240819-124206
19  38022122.txt  20240819-124400
20  38033693.txt  20240819-124615
21  38043563.txt  20240819-124815
22  38046472.txt  20240819-124935
23  38046758.txt  20240819-125118
24  38050348.txt  20240819-125407
25  38054149.txt  20240819-125544
26  38058846.txt  20240819-125722
27  38075417.txt  20240819-125931
28  38077716.t

In [9]:
async def main(search_engine,prompt):
    """Await ``search_engine.asearch(prompt)`` and hand back whatever it returns."""
    return await search_engine.asearch(prompt)

## Local Search

In [10]:
def local_search(output_dir,prompt):
    """Run a GraphRAG local search against one indexing run and return the result.

    Args:
        output_dir: Name of the run folder under the GraphRAG ``output``
            directory; its ``artifacts`` sub-folder supplies the parquet
            tables and the LanceDB store.
        prompt: The question text handed to the search engine.

    Returns:
        The object returned by ``LocalSearch.asearch``; callers in this
        notebook read its ``.response`` attribute.

    Raises:
        RuntimeError: If the ``GRAPHRAG_API_KEY`` environment variable is not set.
    """
    # The API key is a secret and must not live in the notebook source;
    # fail fast before any table is loaded when it is missing.
    api_key = os.environ.get("GRAPHRAG_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the GRAPHRAG_API_KEY environment variable to the Azure OpenAI key before running a search."
        )

    ## Load tables to dataframes
    INPUT_DIR = r"C:/Users/15011/Documents/Projects/GraphRAG/ragtest/output/" + output_dir + "/artifacts"
    LANCEDB_URI = f"{INPUT_DIR}/lancedb"

    COMMUNITY_REPORT_TABLE = "create_final_community_reports"
    ENTITY_TABLE = "create_final_nodes"
    ENTITY_EMBEDDING_TABLE = "create_final_entities"
    RELATIONSHIP_TABLE = "create_final_relationships"
    TEXT_UNIT_TABLE = "create_final_text_units"
    COMMUNITY_LEVEL = 1

    ## Read entities
    # read nodes table to get community and degree data
    entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
    entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

    entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

    # Connect to the LanceDB store under the artifacts folder and load the
    # entity description embeddings into it. The call is made for its effect
    # on the store; its return value is not needed.
    description_embedding_store = LanceDBVectorStore(
        collection_name="entity_description_embeddings",
    )
    description_embedding_store.connect(db_uri=LANCEDB_URI)
    store_entity_semantic_embeddings(
        entities=entities, vectorstore=description_embedding_store
    )

    ## Read relationships
    relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
    relationships = read_indexer_relationships(relationship_df)

    # NOTE: covariates (claims) are intentionally not loaded. To use them, read
    # the covariate table with read_indexer_covariates and pass
    # covariates={"claims": claims} to the context builder below.

    ## Read community reports
    report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
    reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

    ## Read text units
    text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
    text_units = read_indexer_text_units(text_unit_df)

    ## LLM and embedding clients (Azure OpenAI)
    embedding_model = "text-embedding-3-large"
    llm_model = "gpt-4o"
    api_base = "https://df-pocs-q1.openai.azure.com/"

    llm = ChatOpenAI(
        api_key=api_key,
        model=llm_model,
        api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
        max_retries=20,
        api_version="2024-02-15-preview",
        deployment_name="gpt-4o",
        api_base=api_base,
    )

    token_encoder = tiktoken.get_encoding("cl100k_base")

    text_embedder = OpenAIEmbedding(
        api_key=api_key,
        api_base=api_base,
        api_type=OpenaiApiType.AzureOpenAI,
        model=embedding_model,
        deployment_name=embedding_model,
        max_retries=20,
        api_version="2024-02-01",
    )

    ## Create local search context builder (without covariates, see note above)
    context_builder = LocalSearchMixedContext(
        community_reports=reports,
        text_units=text_units,
        entities=entities,
        relationships=relationships,
        entity_text_embeddings=description_embedding_store,
        embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
        text_embedder=text_embedder,
        token_encoder=token_encoder,
    )

    ## Create local search engine

    # text_unit_prop: proportion of context window dedicated to related text units
    # community_prop: proportion of context window dedicated to community reports.
    # The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
    # conversation_history_max_turns: maximum number of turns to include in the conversation history.
    # conversation_history_user_turns_only: if True, only include user queries in the conversation history.
    # top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
    # top_k_relationships: control the number of out-of-network relationships to pull into the context window.
    # include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
    # include_relationship_weight: if True, include the relationship weight in the context window.
    # include_community_rank: if True, include the community rank in the context window.
    # return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
    # could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
    # dataframes indicates whether the record is included in the context window.
    # max_tokens: maximum number of tokens to use for the context window.

    local_context_params = {
        "text_unit_prop": 0.5,
        "community_prop": 0.1,
        "conversation_history_max_turns": 0,
        "conversation_history_user_turns_only": False,
        "top_k_mapped_entities": 7,
        "top_k_relationships": 7,
        "include_entity_rank": True,
        "include_relationship_weight": True,
        "include_community_rank": True,
        "return_candidate_context": False,
        "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
        "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    }

    llm_params = {
        "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
        "temperature": 0.0,
    }

    search_engine = LocalSearch(
        llm=llm,
        context_builder=context_builder,
        token_encoder=token_encoder,
        llm_params=llm_params,
        context_builder_params=local_context_params,
        response_type="multiple-page report",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    )

    ## Run the local search for the given prompt
    result = asyncio.run(main(search_engine,prompt))

    # The context used for the answer can be inspected through
    # result.context_data ("entities", "relationships", "reports", "sources").
    return result


## Global Search

In [11]:
def global_search(output_dir,prompt):
    """Run a GraphRAG global search against one indexing run and return the result.

    Args:
        output_dir: Name of the run folder under the GraphRAG ``output``
            directory; its ``artifacts`` sub-folder supplies the parquet tables.
        prompt: The question text handed to the search engine.

    Returns:
        The object returned by ``GlobalSearch.asearch``.

    Raises:
        RuntimeError: If the ``GRAPHRAG_API_KEY`` environment variable is not set.
    """
    ### LLM setup
    # The API key is a secret and must not live in the notebook source;
    # fail fast before any table is loaded when it is missing.
    api_key = os.environ.get("GRAPHRAG_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the GRAPHRAG_API_KEY environment variable to the Azure OpenAI key before running a search."
        )

    llm_model = "gpt-4o"
    llm = ChatOpenAI(
        api_key=api_key,
        model=llm_model,
        api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
        max_retries=20,
        api_version="2024-02-15-preview",
        deployment_name="gpt-4o",
        api_base="https://df-pocs-q1.openai.azure.com/",
    )

    token_encoder = tiktoken.get_encoding("cl100k_base")

    ### Load community reports as context for global search
    # parquet files generated from indexing pipeline
    INPUT_DIR = r"C:/Users/15011/Documents/Projects/GraphRAG/ragtest/output/" + output_dir + "/artifacts"
    COMMUNITY_REPORT_TABLE = "create_final_community_reports"
    ENTITY_TABLE = "create_final_nodes"
    ENTITY_EMBEDDING_TABLE = "create_final_entities"

    # community level in the Leiden community hierarchy from which we will load the community reports
    # higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
    COMMUNITY_LEVEL = 1

    entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
    report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
    entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

    reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
    entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

    #### Build global context based on community reports
    context_builder = GlobalCommunityContext(
        community_reports=reports,
        entities=entities,  # default to None if you don't want to use community weights for ranking
        token_encoder=token_encoder,
    )

    #### Perform global search
    context_builder_params = {
        "use_community_summary": False,  # False means using full community reports. True means using community short summaries.
        "shuffle_data": True,
        "include_community_rank": True,
        "min_community_rank": 0,
        "community_rank_name": "rank",
        "include_community_weight": True,
        "community_weight_name": "occurrence weight",
        "normalize_community_weight": True,
        "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
        "context_name": "Reports",
    }

    map_llm_params = {
        "max_tokens": 1000,
        "temperature": 0.0,
        "response_format": {"type": "json_object"},
    }

    reduce_llm_params = {
        "max_tokens": 2000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
        "temperature": 0.0,
    }

    search_engine = GlobalSearch(
        llm=llm,
        context_builder=context_builder,
        token_encoder=token_encoder,
        max_data_tokens=12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
        map_llm_params=map_llm_params,
        reduce_llm_params=reduce_llm_params,
        allow_general_knowledge=False,  # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
        json_mode=False,  # set this to False if your LLM model does not support JSON mode.
        context_builder_params=context_builder_params,
        concurrent_coroutines=32,
        response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    )

    ## Run the global search for the given prompt
    result = asyncio.run(main(search_engine,prompt))

    # The reports used for the answer are available in result.context_data["reports"];
    # result.llm_calls and result.prompt_tokens report the LLM usage.
    return result

In [12]:
# Regions workbook; its "Regions" column is flattened into a plain list below.
Hetro_MAH_Region = pd.read_excel(
    r"C:\Users\15011\Documents\Projects\GraphRAG\pubmeds\Hetero MAH Regions.xlsx",
    sheet_name="Hetero MAH Regions",
)
regions_list = Hetro_MAH_Region["Regions"].tolist()

# Literature-citation workbook (note: the sheet name ends with a space).
pdf_drug = pd.read_excel(
    r"C:\Users\15011\Documents\Projects\GraphRAG\pubmeds\Literature Citations_pdf.xlsx",
    sheet_name="Invalids ",
)

# Input-file -> output-folder mapping stored as CSV.
txt_KnowledgeGraph_mapping = pd.read_csv(
    r"C:\Users\15011\Documents\Projects\GraphRAG\pubmeds\input_output_mapping.csv"
)


In [13]:
# Empty results table: one column per field collected for each PDF.
results = pd.DataFrame(
    columns=[
        "PDF File",
        "Drug",
        "Patient Type",
        "Total Number of Patients",
        "Patient Validation",
        "Primary Author Address",
        "Primary Author Country",
        "Primary Author Address Validation",
        "Adverse Event",
        "Adverse Event Validation",
        "Casualities",
        "Casualities Validation",
        "Status",
    ]
)
# pdf_name=input("Please enter the pdf name: ")
for _,row in txt_KnowledgeGraph_mapping.iterrows():
    # global_search(row['output_file'],prompt)
    
    print("\n\n|||||||||||||||||||||||||||||||||    File:"+row['input_file']+"     |||||||||||||||||||||||||||||||||")
    input_file = row['input_file']
    # Convert input file name to PDF name
    pdf_name = input_file[:-3] + "pdf"
    
    # Find the ACTIVE INGREDIENT based on the PDF name
    drug_info = pdf_drug.loc[pdf_drug["Pdf Name"] == pdf_name, "ACTIVE INGREDIENT"]
    if drug_info.empty:
        print("****************No Drug found*************")
        continue
    # Access the ACTIVE INGREDIENT value
    drug = drug_info.values[0]
    
    
    # i) To find the Patient is human or not --------------------------------
    # prompt = (
    # "i will provide you the sample responce, please **very Strictly** follow that heading formate- # Patient Validation ,   # Patient Type,  # Total Number of Patients "
    # "When no Patient is present then, please **very Strictly** follow that formate- # Patient Validation  Invalid,   # Patient Type   No patient found  # Total Number of Patients  0"
    # "So Carefully differentiate between patients and authors—do not mix up their names. "
    # "Please do not repeat the patients in total count and at the time of giving their information"
    # "also please differentiate each patient, if one patient is called at multiple places then please dont consider him as different person, and please do not consider the patientes present inside the table formate data(So kindly please check if the patient that you are getting is from table formate data), please kindly remember it and then give answer "
    # "Please consider only that patients who are related to {drug}, apart from this do not consider any other patients"
    # "Please determine if the patient is human who are related to {drug}. if the patient is human then under the heading '# Patient Validation' give answer as Valid and if not then give answer as Invalid, strictly there should be only Valid or Invalid present in answer nothing else"
    # "(only consider the single patient like (39-Year-Old Male ,39-Year-Old Female), and strictly dot not group of patients(eg:- 204 Patients, 6 patients, etc)) In the subsequent line, under the heading '# Total Number of Patients' provide the total number of patients present who are related to {drug}, if no single patient is present then give the answer 0"
    # "In the last line, under the heading '# Patient Type' list all the patients and include small and consise information about them who are related to {drug},and under this heading also include information of all the case of patients in above text, if no single patient is present then give the answer as no patient present."
    # "Ensure that all information is accurate and based on factual data present inside the text, and do not hallucinated responses, and do not give answers on your own that is not present in the text."
    # )
    prompt = """
    I will provide you with a sample response. Please **strictly** follow this format:
    # Patient Validation
    # Total Number of Patients
    # Patient Type

    If no patient is present, please **strictly** follow this format:
    # Patient Validation  Invalid
    # Total Number of Patients  0
    # Patient Type  No patient found

    Carefully differentiate between patients and authors—do not mix up their names. Do not repeat patients in the total count or when providing their information. Also, differentiate each patient. If one patient is mentioned in multiple places, do not consider them as different persons. Do not consider patients present in table format data. Remember to check if the patient information is from table format data and exclude such cases.
    Consider only patients related to {drug}. Do not consider any other patients.
    Please determine if the patient is human and related to {drug}. If the patient is human, under the heading '# Patient Validation' give the answer as Valid. If not, give the answer as Invalid. There should be only Valid or Invalid present in the answer, nothing else.
    (Only consider single patients like '39-Year-Old Male', '39-Year-Old Female', and strictly not groups of patients like '204 Patients', '6 patients', etc.). In the subsequent line, under the heading '# Total Number of Patients', provide the total number of patients present who are related to {drug}. If no single patient is present, give the answer as 0.
    In the last line, under the heading '# Patient Type', list all the patients and include small and concise information about them who are related to {drug}. Include information on all cases of patients in the above text. If no single patient is present, give the answer as 'No patient present.'
    Ensure that all information is accurate and based on factual data present in the text. Do not provide hallucinated responses or give answers that are not present in the text.
    """
    prompt = prompt.replace("{drug}", drug)

    # "Do strictly do not consider the group of patients(eg:- 204 Patients, 6 patients, etc),only consider the single patients present like only consider those patients who are present in the formate---39-Year-Old Male ,39-Year-Old Female 21-Year-Old male,etc...--"
    
    human=local_search(row['output_file'],prompt)
    Patient_Validation = human.response.split('# Patient Validation')[1].split('# Total Number of Patients')[0].strip() if '# Patient Validation' in human.response and '# Total Number of Patients' in human.response else "Pattern not found."
    Total_Number_of_Patients = human.response.split('# Total Number of Patients')[1].split('# Patient Type')[0].strip() if '# Total Number of Patients' in human.response and '# Patient Type' in human.response else "Pattern not found."
    Patient_Type = human.response.split('# Patient Type', 1)[1].strip() if '# Patient Type' in human.response else "Pattern not found."
    
    if "Invalid" in Patient_Validation:
        Patient_Validation="Invalid"
    elif "Valid" in Patient_Validation:
        Patient_Validation="Valid"
    
    # print("i) Human:- ",human.response)
    print(human.response)
    print("\n-------------------------------------------------------------------------\n",
          Patient_Validation,
          "\n-------------------------------------------------------------------------\n",
          Total_Number_of_Patients,
          "\n----------------------------------------------------------------\n",
          Patient_Type,
          "\n==============================================================\n")
    
    # ii) To find the country of primary Author --------------------------------
    prompt = (
    "I will provide you the sample response. Please **strictly** follow this heading format:"
    "## Primary Author Address"
    "## Primary Author Country"
    "## Primary Author Address validation"
    "If the address is not found, then in the response please **strictly** follow this format:"
    "## Primary Author Address  Unable to find Primary Author Address"
    "## Primary Author Country  Unable to find Primary Author Country"
    "## Primary Author Address validation  Invalid"
    "Remember most of the times the address can be in formate that i will be giving right now so please carefully consider that"
    "(example 1:- Libia Vasquez1  Tiffany Cortes2,3     1Department of Medicine, Division of Endocrinology, University of Texas Health Science Center at San Antonio, San Antonio, TX 78229  2Sam and Ann Barshop Institute for Longevity and Aging Studies, University of Texas Health Science Center at San Antonio, San Antonio, TX 78229, USA; it means  Libia Vasquez is the primary author and the address of primary author is Department of Medicine, Division of Endocrinology, University of Texas Health Science Center at San Antonio, San Antonio, TX 78229; and country is USA;   so work properly at situation also.)"
    " (example 2:- Pugazhendi Inban1 | Virali Gulla2 | Aarfa Devani3 | Chinaza Mercy Akuma4 | Chengala Ananyaa Gowthavaram5 |  1Department of General Medicine, Government Medical College, Omandurar, Chennai, India 2Internal Medicine, Sri Padmavathi Medical College, SVIMS, Tirupati, India 3Department of Internal Medicine, Malla Reddy Institute of Medical Sciences, Hyderabad, India 4Chamberlain University, College of Health Professions, Chicago, Illinois, USA 5Internal Medicine, Malla Reddy Institute of Medical Sciences, Hyderabad, Telangana, India ; so here the primary author is Pugazhendi Inban1 and his address is 1Department of General Medicine, Government Medical College, Omandurar, Chennai, India)"
    "If the extracted country name is one of the following specific countries, return the exact country name as listed below:"
    "Specific Countries:"
    "Algeria, Argentina, Armenia, Australia, Azerbaijan, Bahrain, Bangladesh, Belarus, Benin, Bhutan, Botswana, Brazil, Burkina Faso, "
    "Cambodia, Cameroon, Canada, Chile, China, Colombia, Congo Brazzaville, Costa Rica, Cuba, Denmark, Dominican Republic, DR Congo, Dubai, "
    "Ecuador, El Salvador, Ethiopia, France, Gabon, Germany, Ghana, Guatemala, Honduras, Hong Kong, India, Indonesia, Iran, Italy, Ivory Coast, "
    "Kazakhstan, Kenya, Kyrgyzstan, Madagascar, Malawi, Malaysia, Mali, Mauritius, Mexico, Moldova, Mongolia, Mozambique, Myanmar, Namibia, Nepal, "
    "Netherlands, Nicaragua, Niger, Nigeria, Norway, Oman, Pakistan, Panama, Paraguay, Peru, Philippines, Poland, Portugal, Russia, Rwanda, Saudi Arabia, "
    "Senegal, Singapore, South Africa, Spain, Sri Lanka, Sudan, Sweden, Syria, Chad, Thailand, Taiwan, Tajikistan, Tanzania, Togo, Trinidad & Tobago, "
    "Turkmenistan, UAE, Uganda, Ukraine, United Kingdom, Uruguay, USA, Uzbekistan, Venezuela, Vietnam, Yemen, Zambia, Zimbabwe, Puerto Rico"
    "If the extracted country name is not one of these, return the name exactly as it is extracted."
    "In the first line of the response, provide the entire address of the primary author under the heading '## Primary Author Address'. Please kindly scan the full text file to get this as the address may not always be present in the top passages of the text."
    "In the second line of the response, provide the country of the primary author under the heading '## Primary Author Country'."
    "At the end, under the heading '## Primary Author Address validation', if the primary author is from the list of Specific Countries, then give the answer as Valid. If not, give the answer as Invalid. There should be only Valid or Invalid present in the answer, nothing else."
    "Ensure that all information is accurate and based on factual data present inside the text. Do not provide hallucinated responses or give answers that are not present in the text."
    )

    
    # ii) Query the knowledge graph for the primary author's address, then slice
    #     the three markdown sections out of the free-text answer.
    Address=local_search(row['output_file'],prompt)
    address_text = Address.response
    address_heading = '## Primary Author Address'
    country_heading = '## Primary Author Country'
    address_validation_heading = '## Primary Author Address validation'

    # The address heading is a prefix of the validation heading, so a plain
    # substring test on the whole response also matches
    # "## Primary Author Address validation". Cut the validation section off
    # first and look for the address heading only in the text that precedes it;
    # otherwise a response without an address section would yield the
    # validation text as the "address".
    before_validation, validation_found, after_validation = address_text.partition(address_validation_heading)

    if address_heading in before_validation and country_heading in address_text:
        Primary_Author_Address = before_validation.split(address_heading)[1].split(country_heading)[0].strip()
    else:
        Primary_Author_Address = "Pattern not found."

    # Country is whatever sits between the country heading and the validation heading.
    if country_heading in address_text and validation_found:
        Primary_Author_Country = address_text.split(country_heading)[1].split(address_validation_heading)[0].strip()
    else:
        Primary_Author_Country = "Pattern not found."

    # Everything after the (first) validation heading is the verdict text.
    if validation_found:
        Primary_Author_Address_validation = after_validation.strip()
    else:
        Primary_Author_Address_validation = "Pattern not found."

    # Collapse the verdict to a bare label. "Invalid" must be tested first
    # because the string "Valid" is a substring of "Invalid".
    if "Invalid" in Primary_Author_Address_validation:
        Primary_Author_Address_validation="Invalid"
    elif "Valid" in Primary_Author_Address_validation:
        Primary_Author_Address_validation="Valid"

    # Log the raw answer followed by the three parsed fields.
    print(Address.response)
    print("\n-------------------------------------------------------------------------\n",
          Primary_Author_Address,
          "\n-------------------------------------------------------------------------\n",
          Primary_Author_Country,
          "\n----------------------------------------------------------------\n",
          Primary_Author_Address_validation,
          "\n==============================================================\n")
    
    ## iii) To find the adverse effects of the drug --------------------------------

    # One list entry per instruction line. The entries are joined with newlines:
    # adjacent string literals are concatenated by Python with no separator, which
    # previously fused the sample headings and the instructions into one run of
    # text (e.g. "...of {drug}### Adverse Effect Validationplease do not...").
    # The empty entry reproduces the blank line after the introduction.
    adverse_effect_prompt_lines = [
        "I will provide you the sample response. Please **strictly** follow this heading format:",
        "",
        "### Adverse Effect of {drug}",
        "### Adverse Effect Validation",
        "please do not very strictly and forcefully remember that need not consider any other heading other than ### Adverse Effect of {drug}",
        "At the prompt below, I will be asking about the adverse effect of {drug}. So strictly only provide adverse effects with respect to the particular {drug} only. Do not provide adverse effects of any other drug other than {drug}. If {drug} is not present or if it does not have adverse effects, then in the answer give:",
        "-drug not found-",
        "-no adverse effect found-",
        "If there is no drug with the name {drug}, then give:",
        "### Adverse Effect of {drug}  This is not a drug",
        "### Adverse Effect Validation Invalid",
        "If {drug} is not present in the text, then in the response please **strictly** follow this format:",
        "### Adverse Effect of {drug}   {drug} not found",
        "### Adverse Effect Validation Invalid",
        "If {drug} has no adverse effect, then in the response please **strictly** follow this format:",
        "### Adverse Effect of {drug}   {drug} has no adverse effect",
        "### Adverse Effect Validation Invalid",
        "Only if {drug} is present, then provide the adverse effects of the {drug} in the text provided and its reason. If {drug} is not present or if it does not have adverse effects, then in the answer give:",
        "-drug not found-",
        "-no adverse effect found-",
        "Under the heading ### Adverse Effect  of {drug}, keep the answer to the point and concisely described.",
        "Only if {drug} is present, at the end under the heading ### Adverse Effect Validation, if {drug} has adverse effects give the answer as Valid. If not having adverse effects, then give Invalid. There should be only Valid or Invalid present in the answer, nothing else.",
        "Please ensure that the answer you provide is accurate and present inside the text provided. Strictly avoid providing any hallucinated responses or answers that are not present inside the provided text.",
        "Please very strictly and forcefully remember not to provide adverse effects of any other drug other than {drug}, even if that drug is present in the text. I need adverse effects of only and only {drug}. If not present, then tell that no adverse effect and in validation give as Invalid. Please follow this strictly as you have not been following it repeatedly.",
        "A additional case to remember, If {drug} is responsible for worsening the patient condition then it comes under the adverse effect of {drug}.",
    ]
    # The literal "{drug}" placeholder is substituted with str.replace (not an
    # f-string), so any text in the drug name is inserted verbatim.
    prompt = "\n".join(adverse_effect_prompt_lines).replace("{drug}", drug)

    # Echo the drug being queried so the log shows which row is in progress.
    print("\n\n",drug,"\n")
    ad_eff_of_drug= "### Adverse Effect of "+drug
    adverse_effect=local_search(row['output_file'],prompt)
    adverse_text = adverse_effect.response
    adverse_validation_heading = '### Adverse Effect Validation'

    # Section body: the text between the drug-specific heading and the validation heading.
    # NOTE(review): the heading match is case-sensitive on the drug name, so a
    # response that capitalises the drug differently falls through to
    # "Pattern not found.".
    if ad_eff_of_drug in adverse_text and adverse_validation_heading in adverse_text:
        Adverse_Effect_of_drug = adverse_text.split(ad_eff_of_drug)[1].split(adverse_validation_heading)[0].strip()
    else:
        Adverse_Effect_of_drug = "Pattern not found."

    # Verdict: everything after the first validation heading.
    if adverse_validation_heading in adverse_text:
        Adverse_Effect_Validation = adverse_text.split(adverse_validation_heading, 1)[1].strip()
    else:
        Adverse_Effect_Validation = "Pattern not found."

    # Collapse the verdict to a bare label. "Invalid" must be tested first
    # because the string "Valid" is a substring of "Invalid".
    if "Invalid" in Adverse_Effect_Validation:
        Adverse_Effect_Validation="Invalid"
    elif "Valid" in Adverse_Effect_Validation:
        Adverse_Effect_Validation="Valid"

    # First word following the generic heading, printed as a quick visual check
    # of which drug the model answered about. Guarded: a response without the
    # heading (or with nothing after it) previously raised IndexError here and
    # aborted the whole loop.
    generic_adverse_heading = "### Adverse Effect of "
    if generic_adverse_heading in adverse_text:
        adverse_heading_words = adverse_text.split(generic_adverse_heading)[1].split()
    else:
        adverse_heading_words = []
    response_drug = adverse_heading_words[0] if adverse_heading_words else "Pattern not found."
    print("\n\n",response_drug,"\n")

    # Log the raw answer followed by the two parsed fields.
    print(adverse_effect.response)
    print("\n-------------------------------------------------------------------------\n",
          Adverse_Effect_of_drug,
          "\n-------------------------------------------------------------------------\n",
          Adverse_Effect_Validation,
          "\n==============================================================\n")
    
    
    ## iv) To find the casualties of the drug --------------------------------

    # One list entry per instruction line. The entries are joined with newlines:
    # adjacent string literals are concatenated by Python with no separator, which
    # previously fused the sample headings and the instructions into one run of
    # text. The empty entry reproduces the blank line after the introduction.
    # Two lines copied from the adverse-effect prompt still asked for "adverse
    # effects"; they now ask for Casualities, like the rest of this prompt.
    # The spelling "Casualities" is kept because the response parser matches
    # headings with exactly that spelling.
    casualities_prompt_lines = [
        "I will provide you the sample response heading formate. Please **strictly** follow this heading format:",
        "",
        "### Casualities of {drug}",
        "### Casualities Validation",
        "please do not very strictly and forcefully remember that need not consider any other heading other than ### Casualities of {drug}",
        "Casualties related to drugs only refer to those involving human patients, and these casualties should be reported by the primary author based on the treatment or experiment they conducted. If the causality is reported by the primary author but is not related to any treatment or experiment, do not consider that causality.",
        "At the prompt below, I will be asking about the Casualities of {drug}. So strictly only provide Casualities with respect to the particular {drug} only. Do not provide Casualities of any other drug other than {drug}. If {drug} is not present or if it does not have Casualities, then in the answer give:",
        "-drug not found-",
        "-no Casualities found-",
        "If there is no drug with the name {drug}, then give:",
        "### Casualities of {drug}  This is not a drug",
        "### Casualities Validation Invalid",
        "If {drug} is not present in the text, then in the response please **strictly** follow this format:",
        "### Casualities of {drug}   {drug} not found",
        "### Casualities Validation Invalid",
        "If {drug} has no Casualities, then in the response please **strictly** follow this format:",
        "### Casualities of {drug}   {drug} has no Casualities",
        "### Casualities Validation Invalid",
        "Only if {drug} is present, then provide the Casualities of the {drug} in the text provided and its reason. If {drug} is not present or if it does not have Casualities, then in the answer give:",
        "-drug not found-",
        "-no Casualities found-",
        "Under the heading ### Casualities  of {drug}, keep the answer to the point and concisely described.",
        "Only if {drug} is present, at the end under the heading ### Casualities Validation, if {drug} has Casualities give the answer as Valid. If not having Casualities, then give Invalid. There should be only Valid or Invalid present in the answer, nothing else.",
        "Please ensure that the answer you provide is accurate and present inside the text provided. Strictly avoid providing any hallucinated responses or answers that are not present inside the provided text.",
        "Please very strictly and forcefully remember not to provide Casualities of any other drug other than {drug}, even if that drug is present in the text. I need Casualities of only and only {drug}. If not present, then tell that no Casualities and in validation give as Invalid. Please follow this strictly as you have not been following it repeatedly.",
        "A additional case to remember, If {drug} is responsible for worsening the patient condition during the treatment then it comes under the Casualities of {drug}.",
    ]
    # The literal "{drug}" placeholder is substituted with str.replace (not an
    # f-string), so any text in the drug name is inserted verbatim.
    prompt = "\n".join(casualities_prompt_lines).replace("{drug}", drug)

    # Echo the drug being queried so the log shows which row is in progress.
    print("\n\n",drug,"\n")
    casuality_of_drug= "### Casualities of "+drug
    Casualities=local_search(row['output_file'],prompt)
    casualities_text = Casualities.response
    casualities_validation_heading = '### Casualities Validation'

    # Section body: the text between the drug-specific heading and the validation heading.
    # NOTE(review): the heading match is case-sensitive on the drug name, so a
    # response that capitalises the drug differently falls through to
    # "Pattern not found.".
    if casuality_of_drug in casualities_text and casualities_validation_heading in casualities_text:
        Casualities_of_drug = casualities_text.split(casuality_of_drug)[1].split(casualities_validation_heading)[0].strip()
    else:
        Casualities_of_drug = "Pattern not found."

    # Verdict: everything after the first validation heading.
    if casualities_validation_heading in casualities_text:
        Casualities_Validation = casualities_text.split(casualities_validation_heading, 1)[1].strip()
    else:
        Casualities_Validation = "Pattern not found."

    # Collapse the verdict to a bare label. "Invalid" must be tested first
    # because the string "Valid" is a substring of "Invalid".
    if "Invalid" in Casualities_Validation:
        Casualities_Validation="Invalid"
    elif "Valid" in Casualities_Validation:
        Casualities_Validation="Valid"

    # First word following the generic heading, printed as a quick visual check
    # of which drug the model answered about. Guarded: a response without the
    # heading (or with nothing after it) previously raised IndexError here and
    # aborted the whole loop.
    generic_casualities_heading = "### Casualities of "
    if generic_casualities_heading in casualities_text:
        casualities_heading_words = casualities_text.split(generic_casualities_heading)[1].split()
    else:
        casualities_heading_words = []
    response_drug = casualities_heading_words[0] if casualities_heading_words else "Pattern not found."
    print("\n\n",response_drug,"\n")

    # Log the raw answer followed by the two parsed fields.
    print(Casualities.response)
    print("\n-------------------------------------------------------------------------\n",
          Casualities_of_drug,
          "\n-------------------------------------------------------------------------\n",
          Casualities_Validation,
          "\n==============================================================\n")
    
    
##----------------------------------------------------------------------------------------------------------------------------------------------------------------------    
    # Overall verdict: the row is "Valid" only when every individual check came back "Valid".
    individual_checks = (
        Patient_Validation,
        Primary_Author_Address_validation,
        Adverse_Effect_Validation,
        Casualities_Validation,
    )
    if all(check == "Valid" for check in individual_checks):
        Status = "Valid"
    else:
        Status = "Invalid"

    # Collect this iteration's fields into a single record; the keys become the
    # column headers of the results table.
    row_record = {
        "PDF File": pdf_name,
        "Drug": drug,
        "Patient Type": Patient_Type,
        "Total Number of Patients": Total_Number_of_Patients,
        "Patient Validation": Patient_Validation,
        "Primary Author Address": Primary_Author_Address,
        "Primary Author Country": Primary_Author_Country,
        "Primary Author Address Validation": Primary_Author_Address_validation,
        "Adverse Event": Adverse_Effect_of_drug,
        "Adverse Event Validation": Adverse_Effect_Validation,
        "Casualities": Casualities_of_drug,
        "Casualities Validation": Casualities_Validation,
        "Status": Status,
    }

    # Wrap the record in a one-row frame and append it to the running results,
    # renumbering the index so it stays 0..n-1.
    current_result = pd.DataFrame([row_record])
    results = pd.concat([results, current_result], ignore_index=True)
    
# Write the accumulated results table to an Excel workbook once the loop has finished.
# index=False leaves the DataFrame's integer index out of the sheet.
# NOTE(review): the destination is a hard-coded absolute path on one user's machine;
# the cell fails elsewhere unless that directory exists. Consider a configurable output folder.
results.to_excel(r"C:\Users\15011\Documents\Projects\GraphRAG\pubmeds\results.xlsx", index=False)




|||||||||||||||||||||||||||||||||    File:36340628.txt     |||||||||||||||||||||||||||||||||
# Patient Validation
Valid

# Total Number of Patients
2

# Patient Type
- 73-year-old male patient with a history of Guillain-Barre syndrome, hyperlipidemia, persistent nonvalvular atrial fibrillation, and obesity. He was anticoagulated with dabigatran for persistent atrial fibrillation and continued its use during his COVID-19 treatment. Despite this, he developed extensive arterial and venous thromboembolisms [Data: Entities (9); Relationships (22, 9, 19, 29, 30, 27, 28, 31, 32)].
- Patient with a BMI of 33.3 kg/m² who did not receive any concomitant medications that could have explained changes in the systemic concentration of dabigatran. The patient had no risk factors for increased risk of clinically important gastrointestinal bleeding [Data: Entities (28, 29); Relationships (23, 24, 42, 43, 44, 45, 46, 47)].

-------------------------------------------------------------------------
 Va

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


# Patient Validation  Invalid
# Total Number of Patients  0
# Patient Type  No patient found

-------------------------------------------------------------------------
 Invalid 
-------------------------------------------------------------------------
 0 
----------------------------------------------------------------
 No patient found 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


## Primary Author Address
Department of Medicine, Division of Endocrinology, University of Texas Health Science Center at San Antonio, San Antonio, TX 78229

## Primary Author Country
USA

## Primary Author Address validation
Valid

-------------------------------------------------------------------------
 Department of Medicine, Division of Endocrinology, University of Texas Health Science Center at San Antonio, San Antonio, TX 78229 
-------------------------------------------------------------------------
 USA 
----------------------------------------------------------------
 Valid 



 amoxicillin trihydrate 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)




 amoxicillin 

### Adverse Effect of amoxicillin trihydrate

amoxicillin trihydrate not found

### Adverse Effect Validation

Invalid

-------------------------------------------------------------------------
 amoxicillin trihydrate not found 
-------------------------------------------------------------------------
 Invalid 



 amoxicillin trihydrate 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)




 amoxicillin 

### Casualities of amoxicillin trihydrate

amoxicillin trihydrate not found

### Casualities Validation

Invalid

-------------------------------------------------------------------------
 amoxicillin trihydrate not found 
-------------------------------------------------------------------------
 Invalid 



|||||||||||||||||||||||||||||||||    File:37713020.txt     |||||||||||||||||||||||||||||||||
# Patient Validation  Invalid
# Total Number of Patients  0
# Patient Type  No patient found

-------------------------------------------------------------------------
 Invalid 
-------------------------------------------------------------------------
 0 
----------------------------------------------------------------
 No patient found 

## Primary Author Address
Department of Urology, The University of Oklahoma Health Sciences Center, Oklahoma City, OK, USA

## Primary Author Country
USA

## Primary Author Address validation
Valid

----------------------------------------

In [14]:

# Display the text-file -> knowledge-graph mapping table (columns input_file / output_file).
# NOTE(review): output_file values look like timestamped index run folders — confirm against the indexing cell.
txt_KnowledgeGraph_mapping 

Unnamed: 0,input_file,output_file
0,36340628.txt,20240819-120238
1,37685623.txt,20240819-120509
2,37713020.txt,20240819-120831
3,37719621.txt,20240819-121138
4,37720667.txt,20240819-121327
5,37736320.txt,20240819-121553
6,37736447.txt,20240819-121759
7,37772223.txt,20240819-121952
8,37779779.txt,20240819-122139
9,37791144.txt,20240819-122322


In [15]:
# Print the raw answer of the most recent adverse-effect search.
# NOTE(review): `adverse_effect` is a variable left over from the last iteration of the
# processing loop above, so this cell only works after that loop has been run.
print(adverse_effect.response)


### Adverse Effect of Allopurinol

Allopurinol is associated with several adverse effects, particularly in the context of DRESS syndrome (Drug Reaction with Eosinophilia and Systemic Symptoms). It has been identified as a trigger for DRESS syndrome, leading to severe reactions and complications such as kidney damage and respiratory distress. In one specific case, allopurinol's direct toxicity contributed to the patient's renal damage and subsequent death [Data: Entities (28, 71); Relationships (20, 51, 79)].

### Adverse Effect Validation

Valid
