In [69]:
import os
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import glob
import openai

In [36]:
with open('diseases_data.json','r',encoding='utf-8') as file:
    data=json.load(file) #Load the data from the JSON file.

if not os.path.exists('documents'):
    os.makedirs('documents') #create the documents folder if it doesnt exist

for disease, details in data.items(): #iterate over the diseases and create a file for each
    file_path = os.path.join('documents', f'{disease}.json') #create the file path

    with open(file_path, 'w', encoding='utf-8') as file: #write the data to the file
        json.dump(details, file, indent=2, ensure_ascii=False)

print('Files created successfully in the documents folder')

Files created successfully in the documents folder


In [37]:
disease,details

('Sepsis',
 {'symptoms': ['Fever',
   'Rapid breathing',
   'Confusion',
   'Rapid heart rate',
   'Low blood pressure'],
  'medicines': ['Antibiotics',
   'Intravenous fluids',
   'Vasopressors',
   'Corticosteroids'],
  'precautions': ['Timely treatment of infections',
   'Good hygiene',
   'Monitor wounds',
   'Regular checkups']})

In [38]:
model = SentenceTransformer('all-MiniLM-L6-v2') #loading the model 'all-MiniLM-L6-v2'
#SentenceTransformer is used to train and use embedding models for text, image, & sentences

print(model.max_seq_length)
# 'model.max_seq_length' means the max sequence length the model can process, any more than that will be truncated
model.max_seq_length = 512

def get_embedding_new(text):

    embeddings = model.encode([text], normalize_embeddings=True)
    return embeddings
    #'model.encode([text])' takes a list of input sentences and returns their corresponding embeddings
    #text are coverted into tokens and it is encoded into embeddings
    #The embeddings are returned as a list of vectors, where each vector represents the semantic meaning of the entire sentence

256


In [24]:
get_embedding_new('what is AI')

array([[-2.82005984e-02, -1.76754873e-02,  1.15960659e-02,
         1.60157476e-02,  4.57059033e-02, -1.61521696e-02,
         6.50133640e-02,  3.18059772e-02,  2.92788967e-02,
         5.82549199e-02, -4.29296307e-02, -3.12392041e-03,
         9.42629389e-03, -4.22834530e-02, -4.40775901e-02,
         7.15260878e-02, -2.44307648e-02, -4.12630588e-02,
        -7.00530708e-02, -7.44963288e-02, -1.32851060e-02,
         8.89817532e-03, -5.68898246e-02, -7.09727928e-02,
        -3.66556682e-02,  9.12975892e-02,  2.88419444e-02,
        -7.82777518e-02, -1.05021326e-02, -1.82267465e-02,
         1.75545625e-02, -6.11509122e-02,  8.68319124e-02,
         1.52493622e-02, -4.01249193e-02,  2.64513344e-02,
        -4.38397788e-02, -4.86194007e-02,  9.56171080e-02,
        -2.18671933e-02, -1.49999168e-02, -4.06117216e-02,
         1.71279777e-02, -6.96665645e-02,  1.04702055e-01,
         1.24466650e-01, -7.75910318e-02, -1.82315949e-02,
         3.11944056e-02,  4.61744517e-02, -1.20331831e-0

In [59]:
vectors = []

for i in glob.glob('documents/*'): #retrives all the file path inside the documents folder
                                   #loop will iterate each file (i) in the folder
    with  open(i, 'r', encoding='utf-8') as file:
        data = json.load(file) #loads a a python dictionary using json.load()
        #print(i, data)

        name = i.split('/')[-1].split('.')[0]

        tem_vector = get_embedding_new(str(name) + ' ' + str(data))[0]

        tem = {}
        tem['name'] = name
        tem['vector'] = tem_vector
        vectors.append(tem)
    

if the file name with directory is "documents\Alzheimer's Disease.json", the code "name = i.split('/')[-1].split('.')[0]",
.split('/') splits the full file path (i) using '/' as the delimiter and takes the last part '[-1] =>  "Alzheimer's Disease.json"
.split('.')[0] splits the file name using the period '.' as the delimiter and selects the first part '[0]' which is the name without .json => "Alzheimer's Disease"


In [60]:
pd.DataFrame(vectors)

Unnamed: 0,name,vector
0,documents\Alzheimer's Disease,"[-0.022632454, 0.055501748, -0.02472528, 0.002..."
1,documents\Cancer,"[-0.045800738, 0.13394901, 0.0055460995, 0.003..."
2,documents\Chronic Kidney Disease,"[-0.044403937, 0.06515253, 0.045573656, -0.035..."
3,documents\Chronic Obstructive Pulmonary Diseas...,"[-0.039502975, 0.08145586, 0.012173744, -0.005..."
4,documents\Diabetes,"[-0.059672743, 0.08384945, -0.016839892, 0.031..."
5,documents\Heart Disease,"[-0.034746688, 0.09326087, -0.020781772, 0.016..."
6,documents\Hepatitis C,"[-0.02712322, 0.07097507, -0.022254994, -0.029..."
7,documents\HIV-AIDS,"[-0.075142935, 0.11961165, 0.0017979178, -0.01..."
8,documents\Leukemia,"[-0.090527326, 0.16922571, 0.018265579, -0.042..."
9,documents\Liver Cirrhosis,"[0.009077264, 0.033712246, 0.00663782, -0.0488..."


In [61]:
query = 'I am suffering with memory loss what could be the reason ?'

get_embedding_new(query)[0]

array([ 1.06671557e-01, -6.09368039e-03, -5.17527126e-02,  6.40313402e-02,
        3.27426158e-02,  8.01704079e-02,  3.99261042e-02,  8.87169167e-02,
        8.56384411e-02,  2.57856864e-02, -3.13594192e-02, -6.35541463e-03,
        1.79537432e-03,  2.28441451e-02,  1.66948047e-02, -1.67123489e-02,
       -7.42528588e-02,  1.00299850e-01, -6.76572025e-02, -1.91177689e-02,
       -3.00903921e-03,  4.80014225e-03, -2.42267270e-02,  8.68807584e-02,
       -7.31659010e-02,  6.32333606e-02, -3.76599543e-02, -1.04064025e-01,
        2.79128347e-02, -5.70549816e-02,  1.00144647e-01, -1.74282293e-03,
       -9.16084126e-02, -4.31983545e-03,  2.66051814e-02,  1.00641124e-01,
       -1.21889874e-01, -1.82597730e-02, -1.02787828e-02, -2.18453091e-02,
       -5.05238101e-02,  8.34037364e-02, -3.98385935e-02,  2.07809918e-03,
        2.26776376e-02,  3.07231937e-02, -1.23659102e-02,  4.04462637e-03,
        5.41697554e-02, -1.09508429e-02, -1.23036867e-02,  1.24218818e-02,
        1.29921958e-02, -

In [None]:
def get_similar_documents(query):

    emb = get_embedding_new(query)

    df = pd.DataFrame(vectors)
    df['distance'] = df['vector'].apply(lambda x: np.sum((x-emb)**2)) #Applying eucliden squared distance to find the values
    files = list( i + '.json' for i in df.sort_values('distance')['name'].head(4)) #the top 4 closest value is taken

    context = []
    for file in files:
        with open(file , 'r', encoding = 'utf-8') as i: #open each file for reading
            data = json.load(i) #load the content of json file into the data variable

            new_dict = {}
            new_dict['disease_name'] = file.split('.')[0]
            new_dict['disease_info'] = data
            print(new_dict)
            context.append(new_dict)
    return str(context)
    

In [65]:
question = 'i am suffering with memory loss what could be the reason ?'
context = get_similar_documents(question)
context

{'disease_name': "documents\\Alzheimer's Disease", 'disease_info': {'symptoms': ['Memory loss', 'Difficulty with problem-solving', 'Confusion', 'Changes in mood', 'Difficulty speaking'], 'medicines': ['Donepezil', 'Rivastigmine', 'Memantine', 'Galantamine'], 'precautions': ['Mental exercises', 'Regular physical activity', 'Healthy diet', 'Social engagement', 'Control hypertension']}}
{'disease_name': 'documents\\Stroke', 'disease_info': {'symptoms': ['Sudden numbness', 'Confusion', 'Trouble speaking', 'Loss of balance', 'Severe headache'], 'medicines': ['Alteplase', 'Anticoagulants', 'Aspirin', 'Statins', 'Clot-dissolving drugs'], 'precautions': ['Control blood pressure', 'Quit smoking', 'Manage diabetes', 'Healthy diet', 'Physical activity']}}
{'disease_name': 'documents\\Heart Disease', 'disease_info': {'symptoms': ['Chest pain', 'Shortness of breath', 'Fatigue', 'Irregular heartbeat', 'Swelling in legs'], 'medicines': ['Aspirin', 'Beta-blockers', 'ACE inhibitors', 'Statins', 'Nitrog

'[{\'disease_name\': "documents\\\\Alzheimer\'s Disease", \'disease_info\': {\'symptoms\': [\'Memory loss\', \'Difficulty with problem-solving\', \'Confusion\', \'Changes in mood\', \'Difficulty speaking\'], \'medicines\': [\'Donepezil\', \'Rivastigmine\', \'Memantine\', \'Galantamine\'], \'precautions\': [\'Mental exercises\', \'Regular physical activity\', \'Healthy diet\', \'Social engagement\', \'Control hypertension\']}}, {\'disease_name\': \'documents\\\\Stroke\', \'disease_info\': {\'symptoms\': [\'Sudden numbness\', \'Confusion\', \'Trouble speaking\', \'Loss of balance\', \'Severe headache\'], \'medicines\': [\'Alteplase\', \'Anticoagulants\', \'Aspirin\', \'Statins\', \'Clot-dissolving drugs\'], \'precautions\': [\'Control blood pressure\', \'Quit smoking\', \'Manage diabetes\', \'Healthy diet\', \'Physical activity\']}}, {\'disease_name\': \'documents\\\\Heart Disease\', \'disease_info\': {\'symptoms\': [\'Chest pain\', \'Shortness of breath\', \'Fatigue\', \'Irregular heart

In [68]:
payload = f'''

You are an AI assistant to help user to help with symptoms, possible diseases_name, medicines and precautions of a diseases 

to help with all those information please user our context provided below 

context = {context}

user_question = {question}

'''

print(payload)



You are an AI assistant to help user to help with symptoms, possible diseases_name, medicines and precautions of a diseases 

to help with all those information please user our context provided below 

context = [{'disease_name': "documents\\Alzheimer's Disease", 'disease_info': {'symptoms': ['Memory loss', 'Difficulty with problem-solving', 'Confusion', 'Changes in mood', 'Difficulty speaking'], 'medicines': ['Donepezil', 'Rivastigmine', 'Memantine', 'Galantamine'], 'precautions': ['Mental exercises', 'Regular physical activity', 'Healthy diet', 'Social engagement', 'Control hypertension']}}, {'disease_name': 'documents\\Stroke', 'disease_info': {'symptoms': ['Sudden numbness', 'Confusion', 'Trouble speaking', 'Loss of balance', 'Severe headache'], 'medicines': ['Alteplase', 'Anticoagulants', 'Aspirin', 'Statins', 'Clot-dissolving drugs'], 'precautions': ['Control blood pressure', 'Quit smoking', 'Manage diabetes', 'Healthy diet', 'Physical activity']}}, {'disease_name': 'documents\

In [None]:
client = openai.OpenAI(
    base_url= "https://api.groq.com/openai/v1", #using a Groq's implementation of an open ai compatible service
    api_key = 'YOUR_API_KEY_HERE' #api key is used to authenticate the client and make requests to the api.
)

In [71]:
response = client.chat.completions.create(
    model = 'llama3-8b-8192',
    messages = [
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'text',
                    'text' : payload
                }
            ]
        }
    ],
    temperature = 1,
    max_tokens = 2048,
    top_p = 1,
    frequency_penalty = 0,
    presence_penalty = 0,
    response_format = {
        'type' : 'text'
    }
)

temperature:
A parameter that controls the randomness of the output. A temperature of 1 means the model will have some level of randomness (higher temperature = more creativity, lower temperature = more deterministic).

max_tokens:
The maximum number of tokens (words or pieces of words) the model will generate in response. You've set it to 2048 tokens, which allows for a lengthy response.

top_p:
This controls the sampling behavior for generating text. Setting top_p = 1 means that the model will consider the entire probability distribution when generating tokens. Lower values (e.g., 0.9) limit the model to sampling from the top p percentage of tokens, which can make it more focused.

frequency_penalty:
This penalizes the model for using words more frequently. A value of 0 means no penalty, so the model can repeat words freely.

presence_penalty:
This penalizes the model for repeating the same concepts. A value of 0 means no penalty, allowing the model to freely introduce concepts multiple times.

response_format:
The response_format is set to {'type': 'text'} in your code, which seems to be trying to specify the format of the response. However, typically OpenAI's API would return the text directly as part of the response, so this argument may not be needed in most cases. It could be part of Groq's API, but it's not a typical parameter for OpenAI's official API.

In [72]:
print(response.choices[0].message.content)

I'm here to help!

Based on the context you provided, I noticed that memory loss is a symptom associated with Alzheimer's Disease. In fact, it's listed as one of the main symptoms of Alzheimer's in the disease information.

Other diseases that could potentially cause memory loss are not explicitly mentioned in the context, so I'll assume Alzheimer's is a possible cause.

To provide more insight, Alzheimer's Disease is a progressive neurological disorder that typically affects people over the age of 65. While it's not the only cause of memory loss, it's a common symptom in the early stages of the disease.

Here are some potential reasons for memory loss in Alzheimer's:

1. **Brain cell damage**: Alzheimer's causes the death of brain cells, which can lead to memory loss and cognitive decline.
2. **Amyloid protein buildup**: A protein called beta-amyloid accumulates in the brain, forming plaques that disrupt normal brain function and cause memory loss.
3. **Tau protein tangles**: Another 