In [1]:
import pandas as pd

df = pd.read_csv("../data/Disease precaution.csv")

print(df.shape)
display(df.head())


(41, 5)


Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [2]:
# View column names
print(df.columns)

# Check for missing values
print(df.isnull().sum())


Index(['Disease', 'Precaution_1', 'Precaution_2', 'Precaution_3',
       'Precaution_4'],
      dtype='object')
Disease         0
Precaution_1    0
Precaution_2    0
Precaution_3    1
Precaution_4    1
dtype: int64


In [3]:
symptom_df = pd.read_csv("../data/disease_symptom.csv")

print(symptom_df.shape)
display(symptom_df.head())


(4920, 18)


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [4]:
# Fill NaN values with empty strings
symptom_df_filled = symptom_df.fillna("")

# Separate target
y = symptom_df_filled["Disease"]

# Combine all symptom columns into a single list per row
symptom_cols = [col for col in symptom_df_filled.columns if col.startswith("Symptom")]
X_symptoms = symptom_df_filled[symptom_cols]

# One-hot encode symptoms
X = pd.get_dummies(X_symptoms.stack()).groupby(level=0).sum()

print("Feature matrix shape:", X.shape)
print("Target shape:", y.shape)


Feature matrix shape: (4920, 132)
Target shape: (4920,)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize model
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

# Train model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", round(accuracy, 4))


Test Accuracy: 1.0


In [6]:
import numpy as np

# Get prediction probabilities
y_proba = model.predict_proba(X_test)

# Get class labels
classes = model.classes_

def get_top_k_predictions(proba_row, classes, k=3):
    top_k_idx = np.argsort(proba_row)[-k:][::-1]
    return [(classes[i], proba_row[i]) for i in top_k_idx]

# Example: Top-3 predictions for the first test sample
sample_idx = 0
top_3 = get_top_k_predictions(y_proba[sample_idx], classes, k=3)

print("Top-3 predicted diseases:")
for disease, score in top_3:
    print(f"{disease} — confidence: {score:.2f}")


Top-3 predicted diseases:
Hypertension  — confidence: 1.00
hepatitis A — confidence: 0.00
Common Cold — confidence: 0.00


In [7]:
# Get symptom list for the selected sample
sample_symptoms = X_test.iloc[sample_idx]
active_symptoms = sample_symptoms[sample_symptoms == 1].index.tolist()

print("User-reported symptoms:")
print(active_symptoms)


User-reported symptoms:
[' chest_pain', ' dizziness', ' lack_of_concentration', ' loss_of_balance']


In [8]:
def generate_explanation(symptoms, predictions):
    explanation = f"""
The following health conditions are suggested based on the reported symptoms.
This is a preliminary assessment and not a medical diagnosis.

Reported symptoms:
{', '.join(symptoms)}

Possible conditions and explanations:
"""
    for disease, confidence in predictions:
        explanation += f"""
- {disease} (confidence: {confidence:.2f}):
  This condition is suggested because it is commonly associated with symptoms such as
  {', '.join(symptoms[:2])}. However, symptoms may overlap with other conditions.

"""
    explanation += """
Please consult a qualified healthcare professional for an accurate diagnosis and treatment.
"""
    return explanation

# Generate explanation text
explanation_text = generate_explanation(active_symptoms, top_3)
print(explanation_text)



The following health conditions are suggested based on the reported symptoms.
This is a preliminary assessment and not a medical diagnosis.

Reported symptoms:
 chest_pain,  dizziness,  lack_of_concentration,  loss_of_balance

Possible conditions and explanations:

- Hypertension  (confidence: 1.00):
  This condition is suggested because it is commonly associated with symptoms such as
   chest_pain,  dizziness. However, symptoms may overlap with other conditions.


- hepatitis A (confidence: 0.00):
  This condition is suggested because it is commonly associated with symptoms such as
   chest_pain,  dizziness. However, symptoms may overlap with other conditions.


- Common Cold (confidence: 0.00):
  This condition is suggested because it is commonly associated with symptoms such as
   chest_pain,  dizziness. However, symptoms may overlap with other conditions.


Please consult a qualified healthcare professional for an accurate diagnosis and treatment.



In [9]:
# Load disease precautions
precaution_df = pd.read_csv("../data/Disease precaution.csv")

precaution_df.head()


Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [10]:
def get_precautions(disease_name, precaution_df):
    row = precaution_df[precaution_df["Disease"] == disease_name]
    if row.empty:
        return []
    precautions = row.iloc[0, 1:].dropna().tolist()
    return precautions

# Example: show precautions for top predictions
for disease, _ in top_3:
    precautions = get_precautions(disease, precaution_df)
    print(f"\nPrecautions for {disease}:")
    for p in precautions:
        print("-", p)



Precautions for Hypertension :
- meditation
- salt baths
- reduce stress
- get proper sleep

Precautions for hepatitis A:
- Consult nearest hospital
- wash hands through
- avoid fatty spicy food
- medication

Precautions for Common Cold:
- drink vitamin c rich drinks
- take vapour
- avoid cold food
- keep fever in check
