# Inital exploration of datasets from Kaggle 
https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset?resource=download&select=symptom_precaution.csv

In [31]:
import pandas as pd
import numpy as np

In [2]:
precautions_df = pd.read_csv("symptom_precaution.csv")
symptom_description_df = pd.read_csv("symptom_Description.csv")
severity_df = pd.read_csv("Symptom-severity.csv")
main_df = pd.read_csv("dataset.csv")

In [3]:
precautions_df.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [4]:
symptom_description_df.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [5]:
severity_df.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [6]:
main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


# strip whitespace in symptoms - the whitespaces caused me a bit of headache:D

In [7]:
symptom_columns = [col for col in main_df.columns if 'Symptom' in col]
for col in symptom_columns:
    main_df[col] = main_df[col].str.strip()

# create severity dict to tally up severities and show to end user

In [8]:
severity_dict = severity_df.set_index('Symptom')['weight'].to_dict()
severity_dict

{'itching': 1,
 'skin_rash': 3,
 'nodal_skin_eruptions': 4,
 'continuous_sneezing': 4,
 'shivering': 5,
 'chills': 3,
 'joint_pain': 3,
 'stomach_pain': 5,
 'acidity': 3,
 'ulcers_on_tongue': 4,
 'muscle_wasting': 3,
 'vomiting': 5,
 'burning_micturition': 6,
 'spotting_urination': 6,
 'fatigue': 4,
 'weight_gain': 3,
 'anxiety': 4,
 'cold_hands_and_feets': 5,
 'mood_swings': 3,
 'weight_loss': 3,
 'restlessness': 5,
 'lethargy': 2,
 'patches_in_throat': 6,
 'irregular_sugar_level': 5,
 'cough': 4,
 'high_fever': 7,
 'sunken_eyes': 3,
 'breathlessness': 4,
 'sweating': 3,
 'dehydration': 4,
 'indigestion': 5,
 'headache': 3,
 'yellowish_skin': 3,
 'dark_urine': 4,
 'nausea': 5,
 'loss_of_appetite': 4,
 'pain_behind_the_eyes': 4,
 'back_pain': 3,
 'constipation': 4,
 'abdominal_pain': 4,
 'diarrhoea': 6,
 'mild_fever': 5,
 'yellow_urine': 4,
 'yellowing_of_eyes': 4,
 'acute_liver_failure': 6,
 'fluid_overload': 4,
 'swelling_of_stomach': 7,
 'swelled_lymph_nodes': 6,
 'malaise': 6,
 'bl

In [9]:
def calculate_severity(row):
    severity_sum = 0
    for symptom in row[1:]:
        if symptom in severity_dict:
            severity_sum += severity_dict[symptom]
    return severity_sum

In [10]:
main_df['Severity_Tally'] = main_df.apply(calculate_severity, axis=1)

In [11]:
main_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Severity_Tally
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,8
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,7
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,5
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,4
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,8


# Generate symptom embeddings using BioBert - here i'm applying it to the severity_df as it has a clear list of all possible symptoms

In [12]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = BertModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

In [14]:
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy().tolist()

In [15]:
severity_df['embedding'] = severity_df['Symptom'].apply(generate_embedding)

In [16]:
severity_df.head()

Unnamed: 0,Symptom,weight,embedding
0,itching,1,"[[0.04624283313751221, -0.3148941099643707, -0..."
1,skin_rash,3,"[[0.42724546790122986, -0.11916687339544296, -..."
2,nodal_skin_eruptions,4,"[[0.2554945945739746, 0.2000475525856018, -0.0..."
3,continuous_sneezing,4,"[[0.046762436628341675, -0.12162535637617111, ..."
4,shivering,5,"[[0.08400973677635193, -0.15737563371658325, -..."


In [28]:
severity_df.dtypes

Symptom      object
weight        int64
embedding    object
dtype: object

In [33]:
print(severity_df.loc[0]['embedding'])

[[0.04624283313751221, -0.3148941099643707, -0.1941411793231964, 0.21247552335262299, -0.3296336531639099, 0.008714783936738968, -0.16996581852436066, -0.2165904939174652, 0.43590405583381653, 0.12953269481658936, -0.16827964782714844, 0.26141592860221863, -0.603909432888031, -0.13066914677619934, -0.43878957629203796, -0.009407706558704376, 0.04591688513755798, -0.0799274742603302, -0.06668875366449356, 0.2828936278820038, -0.25960999727249146, -0.3205120265483856, -0.18331770598888397, -0.11983804404735565, 0.07208199054002762, 0.012576330453157425, 0.4167575240135193, 0.12795861065387726, -0.2799818813800812, 0.488092839717865, -0.2548738121986389, 0.7026981115341187, -0.09876753389835358, -0.03432649374008179, -0.08601368963718414, 0.39107292890548706, 0.05117487162351608, -0.14397607743740082, 0.1682547926902771, 0.12814252078533173, 0.19677498936653137, -0.09717614948749542, 0.6232703328132629, -0.15469813346862793, 0.16384297609329224, -0.4251737594604492, -0.1429453194141388, -

## need to reformat and flatten list in order to store in weaviate

In [34]:
def ensure_correct_format(df, embedding_column):
    correct_embeddings = []
    for embedding in df[embedding_column]:
        # Flatten the embedding if it's nested
        if isinstance(embedding[0], list):
            embedding = [item for sublist in embedding for item in sublist]

        # Ensure all elements are floats
        embedding = [float(x) for x in embedding]

        correct_embeddings.append(embedding)

    # Check dimensionality
    dim = len(correct_embeddings[0])
    for embedding in correct_embeddings:
        if len(embedding) != dim:
            raise ValueError("Inconsistent embedding dimensionality")

    df[embedding_column] = correct_embeddings

In [35]:
ensure_correct_format(severity_df, 'embedding')

In [36]:
severity_df

Unnamed: 0,Symptom,weight,embedding
0,itching,1,"[0.04624283313751221, -0.3148941099643707, -0...."
1,skin_rash,3,"[0.42724546790122986, -0.11916687339544296, -0..."
2,nodal_skin_eruptions,4,"[0.2554945945739746, 0.2000475525856018, -0.07..."
3,continuous_sneezing,4,"[0.046762436628341675, -0.12162535637617111, 0..."
4,shivering,5,"[0.08400973677635193, -0.15737563371658325, -0..."
...,...,...,...
128,inflammatory_nails,2,"[0.2681790888309479, -0.011954257264733315, -0..."
129,blister,4,"[0.0842055082321167, -0.4026767313480377, -0.1..."
130,red_sore_around_nose,2,"[0.25709471106529236, 0.12028612196445465, -0...."
131,yellow_crust_ooze,3,"[0.2000274807214737, -0.08493164926767349, -0...."


# store embeddings in weaviate

In [17]:
import weaviate

In [20]:
client = weaviate.Client("http://localhost:8080")

In [25]:
client.schema.delete_class("Symptom")

In [26]:
schema = {
    "classes": [
        {
            "class": "Symptom",
            "properties": [
                {"name": "symptom", "dataType": ["string"]},
                {"name": "weight", "dataType": ["int"]},
                {"name": "embedding", "dataType": ["number[]"]},
            ],
        }
    ]
}
client.schema.create(schema)

In [37]:
for index, row in severity_df.iterrows():
    properties = {
        "symptom": row['Symptom'],
        "severity": row['weight'],
        "embedding": row['embedding']
    }
    client.data_object.create(properties, "Symptom")