# Connect and fetch data from Neo4j DB

In [1]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.25.0-py3-none-any.whl.metadata (5.7 kB)
Downloading neo4j-5.25.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.6/296.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.25.0


In [17]:
from neo4j import GraphDatabase
from collections import defaultdict

In [14]:
# Import necessary libraries
import pandas as pd

# Load the CSV file from Google Drive (adjust the path to match your file location)
pd_data = pd.read_csv("https://raw.githubusercontent.com/Addika1630/RAG_Medical_System/refs/heads/main/drugs-indication.csv")

# Display the first few rows of the dataset to understand the structure
print(pd_data.head())

# Filter the relevant columns: 'name', 'primarySubstabce', 'doseAmount', 'doseUnit', 'indication'
# These columns are relevant for drug-disease prediction
drug_data = pd_data[['name', 'indication', 'doseAmount']]

# Display the filtered data
print("\nFiltered Data for Drug, Dose, and Disease Prediction:")
print(drug_data.head())

# RAG Analysis (Retrieve, Augment, Generate):
# Group the drugs by their indications to understand the drug-disease relationships

# Using .iterrows() to iterate over DataFrame rows
data = [(row["name"], row["indication"],  row["doseAmount"])
                           for _, row in drug_data.iterrows()]

# Display the grouped data
print("\nDrug Indication Grouped Data:")
print(data)


   primaryid  drugSequence               role           name primarySubstabce  \
0  100654764             1    Primary Suspect       AFINITOR       EVEROLIMUS   
1  100654764             2  Secondary Suspect      SORAFENIB        SORAFENIB   
2  100654764             3        Concomitant     CLONAZEPAM       CLONAZEPAM   
3  100654764             4        Concomitant  DEXAMETHASONE    DEXAMETHASONE   
4  100654764             5        Concomitant         KEPPRA    LEVETIRACETAM   

  route doseAmount doseUnit    indication  
0  Oral          5       MG  Glioblastoma  
1  Oral        400       MG  Glioblastoma  
2  Oral        0.5       MG       Seizure  
3  Oral          4       MG       Unknown  
4  Oral        100       MG       Unknown  

Filtered Data for Drug, Dose, and Disease Prediction:
            name    indication doseAmount
0       AFINITOR  Glioblastoma          5
1      SORAFENIB  Glioblastoma        400
2     CLONAZEPAM       Seizure        0.5
3  DEXAMETHASONE       Unk

In [15]:
data

[('AFINITOR', 'Glioblastoma', '5'),
 ('SORAFENIB', 'Glioblastoma', '400'),
 ('CLONAZEPAM', 'Seizure', '0.5'),
 ('DEXAMETHASONE', 'Unknown', '4'),
 ('KEPPRA', 'Unknown', '100'),
 ('LISINOPRIL', 'Unknown', '20'),
 ('METOPROLOL SUCCINATE', 'Unknown', '25'),
 ('ONDANSETRON', 'Unknown', '8'),
 ('VIMPAT', 'Seizure', '200'),
 ('SENOKOT', 'Unknown', '1'),
 ('Belimumab', 'Systemic lupus erythematosus', '10'),
 ('PREDNISONE', 'Systemic lupus erythematosus', '5'),
 ('NEUPOGEN', 'Autoimmune neutropenia', '300'),
 ('VANCOMYCIN', 'Infection', '1'),
 ('FLAGYL', 'Infection', '500'),
 ('VORICONAZOLE', 'Infection', '250'),
 ('REVLIMID', 'Plasma cell myeloma', '15'),
 ('DECADRON', 'Plasma cell myeloma', '20'),
 ('LYRICA', 'Neuralgia', '200'),
 ('FABRAZYME', "Fabry's disease", '70'),
 ('PREDNISONE', 'Premedication', '30'),
 ('HYDROXYZINE', 'Premedication', '50'),
 ('LYRICA', 'Diabetic neuropathy', '50'),
 ('LYRICA', 'Pain in extremity', '75'),
 ('METOPROLOL', 'Unknown', '100'),
 ('RANITIDINE', 'Dyspepsia'

# Transform and Group

In [18]:
context_data = defaultdict(list)
for name, indication, doseAmount in data:
    try:
        # Attempt to convert doseAmount to float
        context_data[name].append((indication, float(doseAmount)))
    except ValueError:
        # Handle the error (skip or log, depending on requirements)
        pass


grouped_data = {}
for name, details in context_data.items():
    total_doseAmount = sum(doseAmount for _, doseAmount in details)
    indications = [indication for indication, _ in details]
    grouped_data[name] = {"total_doseAmount": total_doseAmount, "indications": indications}

print(grouped_data)



{'AFINITOR': {'total_doseAmount': 48.5, 'indications': ['Glioblastoma', 'Astrocytoma, low grade', 'Seizure', 'Unknown', 'Unknown', 'Astrocytoma, low grade', 'Unknown']}, 'SORAFENIB': {'total_doseAmount': 400.0, 'indications': ['Glioblastoma']}, 'CLONAZEPAM': {'total_doseAmount': 34.125, 'indications': ['Seizure', 'Anxiety', 'Unknown', 'Unknown', 'Anxiety', 'Unknown', 'Schizoaffective disorder', 'Insomnia', 'Generalised anxiety disorder', 'Depression', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Muscle spasticity', 'Unknown', 'Unknown', 'Anxiety', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Anxiety', 'Unknown', 'Drug withdrawal syndrome', 'Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown']}, 'DEXAMETHASONE': {'total_doseAmount': 2587.25, 'indications': ['Unknown', 'Plasma cell myeloma', 'Plasma cell myeloma', 'Plasma cell myeloma', 'Plasma cell myeloma', 'Plasma cell myeloma', 'Plasma cell myeloma', 'Plasma cell myeloma', 'Unknown', 'Plasma cell myeloma', 'Unk

In [19]:
# Changing the dict data into Graph like representation
!pip install networkx




In [None]:
import networkx as nx
G = nx.Graph()

# Add nodes and edges from the dictionary
for name, details in grouped_data.items():
    G.add_node(name, total_=details["total_doseAmount"])
    for indication in details["indications"]:
        G.add_edge(name, indication)

# Check the created graph
print("Nodes of graph:", G.nodes(data=True))
print("Edges of graph:", G.edges())

Nodes of graph: [('AFINITOR', {'total_': 48.5}), ('Glioblastoma', {}), ('Astrocytoma, low grade', {}), ('Seizure', {}), ('Unknown', {}), ('SORAFENIB', {'total_': 400.0}), ('CLONAZEPAM', {'total_': 34.125}), ('Anxiety', {}), ('Schizoaffective disorder', {}), ('Insomnia', {}), ('Generalised anxiety disorder', {}), ('Depression', {}), ('Muscle spasticity', {}), ('Drug withdrawal syndrome', {}), ('DEXAMETHASONE', {'total_': 2587.25}), ('Plasma cell myeloma', {}), ('Nausea', {}), ('Premedication', {}), ('Plasma cell myeloma refractory', {}), ('Spinal cord oedema', {}), ('Acute lymphocytic leukaemia', {}), ('B precursor type acute leukaemia', {}), ('Prophylaxis', {}), ('Pancytopenia', {}), ('Oral candidiasis', {}), ('T-cell type acute leukaemia', {}), ('Chemotherapy side effect prophylaxis', {}), ('Oedema', {}), ('Prophylaxis of nausea and vomiting', {}), ('Diffuse large B-cell lymphoma', {}), ('Cytokine release syndrome', {}), ('Swelling', {}), ('Encephalopathy', {}), ('Stomatitis', {}), ('

# Embeddings

In [20]:
!pip install node2vec
!pip install langchain-community
!pip install llama-index
!pip install sentence-transformers

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0
Collecting langchain-community
  Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.4.0,>=0.3.4 (from langchain-community)
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.12 (from langchain-community)
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-community)
  Downloading langsmith-0.1.136-py3-none-any.whl.metadata (13 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.0-py3-none-any.whl.metadata (3.5 kB)

In [21]:
# alternative embedding
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from llama_index.legacy.embeddings.langchain import LangchainEmbedding

embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
# grouped_data

# Embed player names with additional context
embeddings = {}
for name, details in grouped_data.items():
    context = f"{name} drug taken a total of {details['total_doseAmount']} MG for the disease: {', '.join(details['indications'])}"
    embeddings[name] = embed_model.get_text_embedding(context)

# Connect to pinecone

In [22]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [None]:
# Initialize Pinecone
import pinecone
import numpy as np
from pinecone import ServerlessSpec # Import ServerlessSpec


pine_api = "a4228145-908e-43ee-b253-d2f5aac490c3"

pinecone_client = pinecone.Pinecone(api_key=pine_api)

index_name = "nba-players-index"

# Use the Pinecone instance to create the index:
if index_name not in pinecone_client.list_indexes().names():
    pinecone_client.create_index(index_name, dimension=5000, spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            ))

# Use the Pinecone instance to get the index:
index = pinecone_client.Index(index_name)

# Upsert embeddings to Pinecone
for name, vector in embeddings.items():
    # Check if vector is a NumPy array before calling tolist()
    # If it's not a NumPy array and it's not already a list, convert it to a list
    if not isinstance(vector, (np.ndarray, list)):
        vector = [vector] # Convert to a list if it's a single element
    elif isinstance(vector, np.ndarray):
        vector = vector.tolist() # Convert numpy array to list

    index.upsert([(name, vector)])  # Upsert with the correctly formatted vector

# Check the generated embeddings
for name, vector in embeddings.items():
    print(f"Drug: {name}, Embedding: {vector}")

Drug: AFINITOR, Embedding: [0.024311762303113937, -0.06367511302232742, -0.0010476558236405253, 0.025406965985894203, -0.02088025026023388, 0.02430085651576519, -0.009035693481564522, 0.05276792496442795, -0.03232476860284805, -0.006731313653290272, -0.009265695698559284, -0.05646095424890518, -0.016049664467573166, 0.039822109043598175, 0.03137044236063957, -0.008909833617508411, -0.015653938055038452, 0.011411038227379322, -0.008204864338040352, -0.05569601431488991, -0.0039023892022669315, 0.0122147835791111, -0.06100861728191376, 0.04327470064163208, -0.00904870219528675, 0.00697094202041626, 0.061469629406929016, -0.05847135931253433, 0.009658711962401867, -0.01001179963350296, 0.01789437048137188, -0.04600659757852554, -0.0235008392482996, 0.025965893641114235, 1.3864872698832187e-06, -0.015791431069374084, 0.024742865934967995, 0.03134501725435257, 0.046973925083875656, -0.059578798711299896, 0.003597918199375272, 0.0019974657334387302, -0.011400604620575905, -0.0050796098075807

In [27]:
# using the same function as in the demo
def get_data_from_db(query):
    # embed query
    xq = embed_model.get_text_embedding(query)
    # query database
    result = index.query(vector=xq, top_k=3, includeMetadata=True)
    matches = []
    for i in result['matches']:
      # filter by score
      if (i['score'] > 0.6):
        # print(i['id'])
        # append best results
        matches.append(i['id'])
    return str(matches)

In [None]:
mat = index.query(vector=embed_model.get_text_embedding("What are the teams Lebron James played against?"), top_k=3, includeMetadata=True)

In [None]:
mat['matches'][0]['id']

'LeBron James'

# LLM using together AI

In [28]:

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are an AI model tasked with querying a graph database stored in Pinecone. Use the context from the natural languge prompt to query the database and generate the results.
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a retrieve information related to the context.
Do not include any text except the generated output statement.
"""
# DEFAULT_SYSTEM_PROMPT = """Task: Answer querying questions based on the context provided.
# Instructions:
# Use only the provided relationship types and properties in the context.
# Do not use any other relationship types or properties that are not provided. Follow the semantics used in the context.

# Note: Do not include any explanations or apologies in your responses.
# Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
# Do not include any text except the generated output statement.

# Important: In the generated output, you must explicitly include the property values used in the query's filtering condition, alongside the main information requested from the original question.

# """

SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

def get_prompt(instruction):
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [32]:
def format_prompt(query, context):
    return '''
    ### Texts:
    {context}

    ### Question:
    {query}
    '''.format(context=context, query=query)

In [29]:
!pip install together

Collecting together
  Downloading together-1.3.3-py3-none-any.whl.metadata (11 kB)
Downloading together-1.3.3-py3-none-any.whl (68 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.1/68.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: together
Successfully installed together-1.3.3


In [30]:
from together import Together
together_api = "0e34e3e9f142636b1161180f3f79fdcb4a0aaeeb31efb34e5e1b821be772fe27"


client = Together(api_key=together_api)

# Define a function that runs the model
def answer(query):
    context = get_data_from_db(query)

    prompt = format_prompt(query, context)
    prompt_template = get_prompt(prompt)

    stream = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": prompt_template}],
    )

    print(stream.choices[0].message.content.replace('```', '').strip())

# stream=True,
    # for chunk in stream:
    #     print(chunk.choices[0].delta.content or "", end="", flush=True)
    # return generate(prompt_template)

In [33]:
answer('How much MG instructions did EVEROLIMUS treat against the Glioblastoma ?')

The total prescribed dosage of Everolimus for treating Glioblastoma is 10 mg per day.


In [34]:
answer('What are the instruction for the Seizure that Concomitant treated?')

The instructions for the seizure concomitant treatment are:

- Brivaracetam: Specific dosage adjustments for seizure concomitant treatment are not mentioned in the database.
- Lacosamide: Specific dosage adjustments for seizure concomitant treatment are not mentioned in the database.
- Vimtec (another name for Vintedge in some regions): Specific dosage adjustments for seizure concomitant treatment are not mentioned in the database.

Please consult a healthcare professional for more detailed information.


# Possible queries