# Inital exploration of datasets from Kaggle 
https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset?resource=download&select=symptom_precaution.csv

In [2]:
import pandas as pd
import numpy as np

In [3]:
precautions_df = pd.read_csv("../data/symptom_precaution.csv")
symptom_description_df = pd.read_csv("../data/symptom_Description.csv")
severity_df = pd.read_csv("../data/Symptom-severity.csv")
main_df = pd.read_csv("../data/dataset.csv")

In [4]:
precautions_df.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [5]:
symptom_description_df.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [6]:
severity_df.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [7]:
main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


# strip whitespace in symptoms - the whitespaces caused me a bit of headache:D

In [8]:
symptom_columns = [col for col in main_df.columns if 'Symptom' in col]
for col in symptom_columns:
    main_df[col] = main_df[col].str.strip()

# create severity dict to tally up severities and show to end user

In [9]:
severity_dict = severity_df.set_index('Symptom')['weight'].to_dict()
severity_dict

{'itching': 1,
 'skin_rash': 3,
 'nodal_skin_eruptions': 4,
 'continuous_sneezing': 4,
 'shivering': 5,
 'chills': 3,
 'joint_pain': 3,
 'stomach_pain': 5,
 'acidity': 3,
 'ulcers_on_tongue': 4,
 'muscle_wasting': 3,
 'vomiting': 5,
 'burning_micturition': 6,
 'spotting_urination': 6,
 'fatigue': 4,
 'weight_gain': 3,
 'anxiety': 4,
 'cold_hands_and_feets': 5,
 'mood_swings': 3,
 'weight_loss': 3,
 'restlessness': 5,
 'lethargy': 2,
 'patches_in_throat': 6,
 'irregular_sugar_level': 5,
 'cough': 4,
 'high_fever': 7,
 'sunken_eyes': 3,
 'breathlessness': 4,
 'sweating': 3,
 'dehydration': 4,
 'indigestion': 5,
 'headache': 3,
 'yellowish_skin': 3,
 'dark_urine': 4,
 'nausea': 5,
 'loss_of_appetite': 4,
 'pain_behind_the_eyes': 4,
 'back_pain': 3,
 'constipation': 4,
 'abdominal_pain': 4,
 'diarrhoea': 6,
 'mild_fever': 5,
 'yellow_urine': 4,
 'yellowing_of_eyes': 4,
 'acute_liver_failure': 6,
 'fluid_overload': 4,
 'swelling_of_stomach': 7,
 'swelled_lymph_nodes': 6,
 'malaise': 6,
 'bl

In [10]:
def calculate_severity(row):
    severity_sum = 0
    for symptom in row[1:]:
        if symptom in severity_dict:
            severity_sum += severity_dict[symptom]
    return severity_sum

In [11]:
main_df['Severity_Tally'] = main_df.apply(calculate_severity, axis=1)

In [12]:
main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Severity_Tally
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,8
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,7
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,5
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,4
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,8


# Generate symptom embeddings using BioBert - here i'm applying it to the severity_df as it has a clear list of all possible symptoms

In [13]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = BertModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

In [33]:
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy().flatten().tolist()
    return embedding

In [35]:
severity_df.head()

Unnamed: 0,Symptom,weight,embedding
0,itching,1,"[0.04624283313751221, -0.3148941099643707, -0...."
1,skin_rash,3,"[0.42724546790122986, -0.11916687339544296, -0..."
2,nodal_skin_eruptions,4,"[0.2554945945739746, 0.2000475525856018, -0.07..."
3,continuous_sneezing,4,"[0.046762436628341675, -0.12162535637617111, 0..."
4,shivering,5,"[0.08400973677635193, -0.15737563371658325, -0..."


In [18]:
severity_df.dtypes

Symptom      object
weight        int64
embedding    object
dtype: object

In [19]:
print(severity_df.loc[0]['embedding'])

[[0.04624283313751221, -0.3148941099643707, -0.1941411793231964, 0.21247552335262299, -0.3296336531639099, 0.008714783936738968, -0.16996581852436066, -0.2165904939174652, 0.43590405583381653, 0.12953269481658936, -0.16827964782714844, 0.26141592860221863, -0.603909432888031, -0.13066914677619934, -0.43878957629203796, -0.009407706558704376, 0.04591688513755798, -0.0799274742603302, -0.06668875366449356, 0.2828936278820038, -0.25960999727249146, -0.3205120265483856, -0.18331770598888397, -0.11983804404735565, 0.07208199054002762, 0.012576330453157425, 0.4167575240135193, 0.12795861065387726, -0.2799818813800812, 0.488092839717865, -0.2548738121986389, 0.7026981115341187, -0.09876753389835358, -0.03432649374008179, -0.08601368963718414, 0.39107292890548706, 0.05117487162351608, -0.14397607743740082, 0.1682547926902771, 0.12814252078533173, 0.19677498936653137, -0.09717614948749542, 0.6232703328132629, -0.15469813346862793, 0.16384297609329224, -0.4251737594604492, -0.1429453194141388, -

## need to reformat and flatten list in order to store in weaviate

In [20]:
def ensure_correct_format(df, embedding_column):
    correct_embeddings = []
    for embedding in df[embedding_column]:
        # Flatten the embedding if it's nested
        if isinstance(embedding[0], list):
            embedding = [item for sublist in embedding for item in sublist]

        # Ensure all elements are floats
        embedding = [float(x) for x in embedding]

        correct_embeddings.append(embedding)

    # Check dimensionality
    dim = len(correct_embeddings[0])
    for embedding in correct_embeddings:
        if len(embedding) != dim:
            raise ValueError("Inconsistent embedding dimensionality")

    df[embedding_column] = correct_embeddings

In [21]:
ensure_correct_format(severity_df, 'embedding')

In [22]:
severity_df

Unnamed: 0,Symptom,weight,embedding
0,itching,1,"[0.04624283313751221, -0.3148941099643707, -0...."
1,skin_rash,3,"[0.42724546790122986, -0.11916687339544296, -0..."
2,nodal_skin_eruptions,4,"[0.2554945945739746, 0.2000475525856018, -0.07..."
3,continuous_sneezing,4,"[0.046762436628341675, -0.12162535637617111, 0..."
4,shivering,5,"[0.08400973677635193, -0.15737563371658325, -0..."
...,...,...,...
128,inflammatory_nails,2,"[0.2681790888309479, -0.011954257264733315, -0..."
129,blister,4,"[0.0842055082321167, -0.4026767313480377, -0.1..."
130,red_sore_around_nose,2,"[0.25709471106529236, 0.12028612196445465, -0...."
131,yellow_crust_ooze,3,"[0.2000274807214737, -0.08493164926767349, -0...."


# store embeddings in weaviate

In [23]:
import weaviate

In [26]:
client = weaviate.Client("http://localhost:8080")

In [27]:
client.schema.delete_class("Symptom")

In [28]:
schema = {
    "classes": [
        {
            "class": "Symptom",
            "properties": [
                {"name": "symptom", "dataType": ["string"]},
                {"name": "weight", "dataType": ["int"]},
                {"name": "embedding", "dataType": ["number[]"]},
            ],
        }
    ]
}
client.schema.create(schema)

In [29]:
for index, row in severity_df.iterrows():
    properties = {
        "symptom": row['Symptom'],
        "severity": row['weight'],
        "embedding": row['embedding']
    }
    client.data_object.create(properties, "Symptom")

## debug

In [43]:
schema = client.schema.get()
schema

{'classes': [{'class': 'Symptom',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'symptom',
     'tokenization': 'whitespace'},
    {'dataType': ['int'],
     'indexFilterable': True,
     'indexSearchable': False,
     'name': 'weight'},
    {'dataType': ['number[]'],
     'indexFilterable': True,
     'indexSearchable': False,
     'name': 'embedding'},
    {'dataType': ['number'],
     'description': "This property was generated by Weaviate's auto-schema feature on Thu May 30 14:42:33 2024",
     'indexFilterable': True,
     'indexSearchable': False,
     'name': 'severity'}],
   'replicationConfig': {'factor': 1},
   'shardingConfig': {'virtualPerPhysical': 128,
    'desiredCount': 1,
    'actualCount': 1,
 

In [None]:
symptoms = client.data_object.get(class_name='Symptom')
#symptoms

In [41]:
test_embedding = generate_embedding("headache")
result = client.query.get("Symptom", ["symptom", "severity"]) \
    .with_near_vector({"vector": test_embedding}) \
    .with_limit(5) \
    .do()
print(result)

{'data': {'Get': {'Symptom': []}}}


In [None]:
symptoms = client.data_object.get(class_name='Symptom')
# print("Stored Symptoms:", symptoms)

In [38]:
# Extract a known symptom and its embedding for testing
if symptoms['objects']:
    known_symptom = symptoms['objects'][0]['properties']['symptom']
    known_embedding = symptoms['objects'][0]['properties']['embedding']
    print(f"Known Symptom: {known_symptom}")
    print(f"Known Embedding: {known_embedding}")

    # Generate embedding for the known symptom to verify consistency
    generated_embedding = generate_embedding(known_symptom)
    print(f"Generated Embedding: {generated_embedding}")

    # Run the manual query test with the known embedding
    try:
        result = client.query.get("Symptom", ["symptom", "severity"]) \
            .with_near_vector({"vector": known_embedding}) \
            .with_limit(5) \
            .do()
        print("Query Result with Known Embedding:", result)
    except Exception as e:
        print(f"Error: {e}")
else:
    print("No symptoms found in the database.")

Known Symptom: muscle_pain
Known Embedding: [0.32424604892730713, 0.23469121754169464, -0.09983646869659424, -0.1731172800064087, -0.29152312874794006, 0.051206864416599274, 0.057245247066020966, -0.11712696403265, 0.13084077835083008, -0.2498432695865631, -0.010016286745667458, 0.4176817834377289, -0.12075481563806534, -0.028884679079055786, -0.6259153485298157, 0.4932721257209778, 0.23916783928871155, -0.010299724526703358, 0.010339946486055851, 0.3510693311691284, -0.33583784103393555, -0.31987518072128296, -0.02234695479273796, 0.05441699177026749, -0.35131046175956726, 0.024882400408387184, 0.388677179813385, 0.18420109152793884, -0.49853944778442383, 0.6520998477935791, -0.16212128102779388, 0.052516769617795944, -0.06285600364208221, -0.13038688898086548, -0.39115676283836365, 0.2945471405982971, -0.0848926231265068, -0.2627870738506317, 0.030456310138106346, 0.07487180083990097, 0.07883138954639435, 0.30782073736190796, 0.638996422290802, -0.4256522059440613, 0.3065973520278930

In [39]:
vectorIndexConfig = client.schema.get()["classes"][0]["vectorIndexConfig"]
print("Vector Index Config:", vectorIndexConfig)

Vector Index Config: {'skip': False, 'cleanupIntervalSeconds': 300, 'maxConnections': 64, 'efConstruction': 128, 'ef': -1, 'dynamicEfMin': 100, 'dynamicEfMax': 500, 'dynamicEfFactor': 8, 'vectorCacheMaxObjects': 1000000000000, 'flatSearchCutoff': 40000, 'distance': 'cosine', 'pq': {'enabled': False, 'bitCompression': False, 'segments': 0, 'centroids': 256, 'trainingLimit': 100000, 'encoder': {'type': 'kmeans', 'distribution': 'log-normal'}}, 'bq': {'enabled': False}}


In [78]:
embeddings_df = pd.read_csv("../data/dataset.csv")

In [79]:
embeddings_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,severity_tally,diagnosis_embedding
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,8,"[0.23544436693191528, -0.047748446464538574, -..."
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,7,"[0.2985115349292755, 0.041300106793642044, -0...."
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,5,"[0.17151065170764923, -0.02394230104982853, -0..."
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,4,"[0.2287609577178955, -0.1303471177816391, -0.0..."
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,8,"[0.2429943084716797, -0.07800447940826416, -0...."


In [80]:
def check_saved_diagnosis_embeddings(client):
    query = (
        client.query
        .get("Diagnosis", ["diagnosis", "severity", "embedding"])
        .with_limit(100)  # Adjust the limit according to your data size
        .do()
    )
    
    diagnoses = query['data']['Get']['Diagnosis']
    if not diagnoses:
        print("No diagnosis embeddings found in Weaviate.")
    else:
        for diagnosis in diagnoses:
            print(f"Diagnosis: {diagnosis['diagnosis']}, Severity: {diagnosis['severity']}, Embedding: {diagnosis['embedding']}")

In [91]:
# Retrieve and display the saved Diagnosis objects
#check_saved_diagnosis_embeddings(client)

In [82]:
# Function to generate embedding for a given symptom
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the mean of the last hidden state as the embedding
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy().tolist()
    return embedding

In [83]:
# Function to query Weaviate for similar diagnosis embeddings
def query_similar_diagnosis(symptom_embedding, client):
    near_vector = {"vector": symptom_embedding}
    query = (
        client.query
        .get("Diagnosis", ["diagnosis", "severity", "embedding"])
        .with_near_vector(near_vector)
        .with_limit(5)  # Adjust the limit based on how many results you want to retrieve
        .do()
    )
    return query

In [87]:
# Test a specific symptom
test_symptom = "sore throat"
symptom_embedding = generate_embedding(test_symptom)
response = query_similar_diagnosis(symptom_embedding, client)

import json
# Print the query results
print(json.dumps(response, indent=2))

{
  "data": {
    "Get": {
      "Diagnosis": []
    }
  }
}


In [89]:
def query_symptom_embeddings(client, limit=10):
    query = (
        client.query
        .get("Symptom", ["symptom", "severity", "embedding"])
        .with_limit(limit)  # Adjust the limit based on how many results you want to retrieve
        .do()
    )
    return query

In [90]:
# Fetch the symptom embeddings
response = query_symptom_embeddings(client, limit=10)

# Print the query results
print(json.dumps(response, indent=2))

{
  "data": {
    "Get": {
      "Symptom": [
        {
          "embedding": [
            0.1693209409713745,
            -0.018382463604211807,
            -0.13959023356437683,
            -0.2687894403934479,
            -0.49426642060279846,
            -0.08078722655773163,
            0.2705497145652771,
            -0.005864876322448254,
            0.22439822554588318,
            -0.3076511025428772,
            -0.0523722767829895,
            0.2532358467578888,
            -0.4007382392883301,
            0.03779812902212143,
            -0.6879614591598511,
            0.3909243643283844,
            0.12773558497428894,
            -0.29505786299705505,
            -0.0006379425758495927,
            0.0376492515206337,
            -0.580582320690155,
            -0.43120265007019043,
            -0.029262671247124672,
            0.22127120196819305,
            -0.09359774738550186,
            0.16362974047660828,
            0.16118916869163513,
            -0.0148

In [102]:
# Function to generate embedding for a symptom
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy().tolist()
    return embedding

# Function to compare user input with diagnosis embeddings
def query_diagnosis(embedding, client):
    response = (
        client.query
        .get("Diagnosis", ["diagnosis", "severity", "embedding"])
        .with_near_vector({"vector": embedding})
        .with_additional("certainty")
        .do()
    )
    return response

In [103]:
# Example user input
user_symptoms = ["sore throat"]

# Generate embedding for the user input
user_embeddings = [generate_embedding(symptom) for symptom in user_symptoms]
combined_embedding = [sum(x) / len(user_embeddings) for x in zip(*user_embeddings)]

# Fetch and print the diagnosis
response = query_diagnosis(combined_embedding, client)
print(json.dumps(response, indent=2))

{
  "data": {
    "Get": {
      "Diagnosis": []
    }
  }
}


In [104]:
def fetch_all_diagnoses(client):
    response = (
        client.query
        .get("Diagnosis", ["diagnosis", "severity", "embedding"])
        .do()
    )
    return response

In [105]:
# Fetch and print all diagnoses
response = fetch_all_diagnoses(client)
print(json.dumps(response, indent=2))

{
  "data": {
    "Get": {
      "Diagnosis": [
        {
          "diagnosis": "Urinary tract infection",
          "embedding": [
            0.13722233474254608,
            0.07482747733592987,
            0.019691409543156624,
            -0.14284475147724152,
            -0.43348902463912964,
            -0.09110367298126221,
            0.023640120401978493,
            0.005642184056341648,
            0.028674175962805748,
            -0.13265198469161987,
            -0.055891916155815125,
            0.010919855907559395,
            0.03159509226679802,
            -0.056190915405750275,
            -0.4583670496940613,
            0.2548285722732544,
            0.03456829860806465,
            -0.32340431213378906,
            -0.04393494874238968,
            0.1636495590209961,
            -0.3098450005054474,
            -0.3336860239505768,
            0.04573771730065346,
            0.20190757513046265,
            -0.18402166664600372,
            -0.1101668477058

In [112]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import weaviate
from sklearn.metrics.pairwise import cosine_similarity

# Load BioBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = BertModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
    return embedding

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [125]:
# Example user input
user_symptoms = ["itching", "skin rash", "skin eruptions"]
user_embeddings = [generate_embedding(symptom) for symptom in user_symptoms]
combined_embedding = np.mean(user_embeddings, axis=0)

#print("Combined User Embedding:", combined_embedding)

# Initialize Weaviate client
client = weaviate.Client("http://localhost:8080")

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [114]:
# Query to fetch all diagnosis embeddings
query = """
{
  Get {
    Diagnosis {
      diagnosis
      embedding
    }
  }
}
"""

response = client.query.raw(query)
diagnosis_data = response['data']['Get']['Diagnosis']

In [124]:
# Extract embeddings and diagnoses
diagnoses = [entry['diagnosis'] for entry in diagnosis_data]
embeddings = [entry['embedding'] for entry in diagnosis_data]

#print("Retrieved Diagnoses and Embeddings:", diagnoses, embeddings)

In [116]:
# Calculate cosine similarity
similarities = cosine_similarity([combined_embedding], embeddings)

In [118]:
diagnosis_similarity_pairs = list(zip(diagnoses, similarities[0]))

# Sort the diagnoses by similarity in descending order
sorted_diagnoses = sorted(diagnosis_similarity_pairs, key=lambda x: x[1], reverse=True)

# Print sorted diagnoses with similarities
for diagnosis, similarity in sorted_diagnoses:
    print(f"Diagnosis: {diagnosis}, Similarity: {similarity}")

Diagnosis: Fungal infection, Similarity: 0.9623856314642407
Diagnosis: Fungal infection, Similarity: 0.9471578810734789
Diagnosis: Fungal infection, Similarity: 0.9471578810734789
Diagnosis: Fungal infection, Similarity: 0.9471578810734789
Diagnosis: Drug Reaction, Similarity: 0.9329325656666339
Diagnosis: Drug Reaction, Similarity: 0.9329325656666335
Diagnosis: Chicken pox, Similarity: 0.9288246521296666
Diagnosis: Chicken pox, Similarity: 0.9288246521296666
Diagnosis: Chicken pox, Similarity: 0.9275959271909257
Diagnosis: Impetigo, Similarity: 0.9235873409656443
Diagnosis: Jaundice, Similarity: 0.9210339273118653
Diagnosis: Typhoid, Similarity: 0.9184880094675266
Diagnosis: Typhoid, Similarity: 0.9180338232573579
Diagnosis: Jaundice, Similarity: 0.9155951326355952
Diagnosis: Pneumonia, Similarity: 0.9146827396916842
Diagnosis: Dengue, Similarity: 0.9139406049809357
Diagnosis: Dengue, Similarity: 0.9139406049809357
Diagnosis: GERD, Similarity: 0.9130115746314421
Diagnosis: Acne, Simil

In [119]:
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
    return embedding

In [122]:
def query_diagnosis(embedding, client):
    query = """
    {
      Get {
        Diagnosis(
          nearVector: {vector: %s, certainty: 0.7}
        ) {
          diagnosis
          embedding
        }
      }
    }
    """ % embedding.tolist()

    response = client.query.raw(query)
    return response

In [123]:
# Example user input
user_symptoms = ["itching", "skin rash", "skin eruptions"]
user_embeddings = [generate_embedding(symptom) for symptom in user_symptoms]
combined_embedding = np.mean(user_embeddings, axis=0)

# Fetch and print the diagnosis from Weaviate
response = query_diagnosis(combined_embedding, client)
print(json.dumps(response, indent=2))

# Check the results
if response['data']['Get']['Diagnosis']:
    print("Diagnoses found:")
    for diagnosis in response['data']['Get']['Diagnosis']:
        print(f"Diagnosis: {diagnosis['diagnosis']}")
else:
    print("No diagnosis found")

{
  "data": {
    "Get": {
      "Diagnosis": []
    }
  }
}
No diagnosis found
