In [2]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp311-cp311-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 2.2 MB/s eta 0:00:01
   --------------------------------- ------ 1.3/1.5 MB 2.3 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.3 MB/s  0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from rapidfuzz import process, fuzz

In [4]:
sym_df = pd.read_csv(r"C:\Users\hp\Desktop\HEALTHCARE_PROJECT\data\raw\DiseaseAndSymptoms.csv")
prec_df = pd.read_csv(r"C:\Users\hp\Desktop\HEALTHCARE_PROJECT\data\raw\Disease precaution.csv")

print("Symptoms dataset shape:", sym_df.shape)
print("Precaution dataset shape:", prec_df.shape)

Symptoms dataset shape: (4920, 18)
Precaution dataset shape: (41, 5)


In [6]:
sym_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Disease         4920 non-null   object
 1   Symptom_1       4920 non-null   object
 2   Symptom_2       4920 non-null   object
 3   Symptom_3       4920 non-null   object
 4   Symptom_4       4572 non-null   object
 5   Symptom_5       3714 non-null   object
 6   Symptom_6       2934 non-null   object
 7   Symptom_7       2268 non-null   object
 8   Symptom_8       1944 non-null   object
 9   Symptom_9       1692 non-null   object
 10  Symptom_10      1512 non-null   object
 11  Symptom_11      1194 non-null   object
 12  Symptom_12      744 non-null    object
 13  Symptom_13      504 non-null    object
 14  Symptom_14      306 non-null    object
 15  Symptom_15      240 non-null    object
 16  Symptom_16      192 non-null    object
 17  Symptom_17      72 non-null     object
 18  symptoms

In [7]:
prec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease       41 non-null     object
 1   Precaution_1  41 non-null     object
 2   Precaution_2  41 non-null     object
 3   Precaution_3  40 non-null     object
 4   Precaution_4  40 non-null     object
dtypes: object(5)
memory usage: 1.7+ KB


In [10]:
# Get only symptom columns
symptom_cols = [col for col in sym_df.columns if col.startswith("Symptom")]

# Convert to lowercase and drop NaNs
for col in symptom_cols:
    sym_df[col] = sym_df[col].astype(str).str.lower().str.strip()
    # Replace 'nan' strings with actual NaN
    sym_df[col] = sym_df[col].replace('nan', np.nan)

# Combine all symptoms into a single list per disease
sym_df["symptoms"] = sym_df[symptom_cols].values.tolist()
sym_df["symptoms"] = sym_df["symptoms"].apply(lambda x: [s for s in x if pd.notna(s) and s != 'nan'])

# Create a master list of all unique symptoms
all_symptoms = sorted(set([s for lst in sym_df["symptoms"] for s in lst]))
print("Total unique symptoms:", len(all_symptoms))

# Binary encoding function
def encode_symptoms(symptom_list):
    return [1 if s in symptom_list else 0 for s in all_symptoms]

sym_df["symptom_vector"] = sym_df["symptoms"].apply(encode_symptoms)

print(sym_df[["Disease", "symptoms"]].head())

Total unique symptoms: 131
            Disease                                           symptoms
0  Fungal infection  [itching, skin_rash, nodal_skin_eruptions, dis...
1  Fungal infection  [skin_rash, nodal_skin_eruptions, dischromic _...
2  Fungal infection  [itching, nodal_skin_eruptions, dischromic _pa...
3  Fungal infection          [itching, skin_rash, dischromic _patches]
4  Fungal infection         [itching, skin_rash, nodal_skin_eruptions]


In [12]:
X = np.array(sym_df['symptom_vector'].tolist())
y = sym_df['Disease']

# Encode target
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)

In [13]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


Model Accuracy: 1.0
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1.00        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00      1.00        24
                            Common Cold       1.00      1.00      1.00        24
      

In [14]:
prec_dict = {}
for _, row in prec_df.iterrows():
    disease = row['Disease']
    precautions = [row[col] for col in prec_df.columns if col.startswith("Precaution") and pd.notna(row[col])]
    prec_dict[disease.lower()] = precautions

In [15]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text.strip()

def extract_symptoms(user_input):
    cleaned = clean_text(user_input)
    tokens = cleaned.split()
    extracted = set()
    for word in tokens:
        match, score, _ = process.extractOne(word, all_symptoms, scorer=fuzz.partial_ratio)
        if score > 80:
            extracted.add(match)
    return list(extracted)

In [16]:
def predict_disease(user_input):
    extracted = extract_symptoms(user_input)
    if not extracted:
        return "No clear symptoms detected.", [], []
    
    vec = np.array(encode_symptoms(extracted)).reshape(1, -1)
    pred_idx = model.predict(vec)[0]
    pred_disease = le.inverse_transform([pred_idx])[0]
    
    precautions = prec_dict.get(pred_disease.lower(), ["No specific precautions found"])
    
    return pred_disease, extracted, precautions

In [17]:
examples = [
    "I have fever and cough.",
    "My throat is sore and I feel very tired.",
    "Severe chest pain and sweating.",
    "Sneezing, runny nose and congestion."
]

for ex in examples:
    disease, symptoms, precs = predict_disease(ex)
    print("\nInput:", ex)
    print("Extracted symptoms:", symptoms)
    print("Predicted disease:", disease)
    print("Precautions:", precs)


Input: I have fever and cough.
Extracted symptoms: ['cough', 'abdominal_pain', 'blurred_and_distorted_vision', 'high_fever']
Predicted disease: AIDS
Precautions: ['avoid open cuts', 'wear ppe if possible', 'consult doctor', 'follow up']

Input: My throat is sore and I feel very tired.
Extracted symptoms: ['abdominal_pain', 'bladder_discomfort', 'red_sore_around_nose', 'continuous_feel_of_urine', 'high_fever', 'patches_in_throat', 'blurred_and_distorted_vision']
Predicted disease: Urinary tract infection
Precautions: ['drink plenty of water', 'increase vitamin c intake', 'drink cranberry juice', 'take probiotics']

Input: Severe chest pain and sweating.
Extracted symptoms: ['abdominal_pain', 'blurred_and_distorted_vision', 'sweating', 'chest_pain']
Predicted disease: Heart attack
Precautions: ['call ambulance', 'chew or swallow asprin', 'keep calm']

Input: Sneezing, runny nose and congestion.
Extracted symptoms: ['continuous_sneezing', 'red_sore_around_nose', 'congestion', 'runny_nose

In [19]:
import pickle
import joblib
import os

# Define output directory
output_dir = r"C:\Users\hp\Desktop\HEALTHCARE_PROJECT\models"
os.makedirs(output_dir, exist_ok=True)

# Save the trained model using joblib (recommended for sklearn models)
model_path = os.path.join(output_dir, "disease_classifier_model.pkl")
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

# Also save the label encoder and symptoms list for later use
le_path = os.path.join(output_dir, "label_encoder.pkl")
joblib.dump(le, le_path)
print(f"Label encoder saved to: {le_path}")

symptoms_path = os.path.join(output_dir, "all_symptoms.pkl")
joblib.dump(all_symptoms, symptoms_path)
print(f"Symptoms list saved to: {symptoms_path}")

print("\nAll model files saved successfully!")

Model saved to: C:\Users\hp\Desktop\HEALTHCARE_PROJECT\models\disease_classifier_model.pkl
Label encoder saved to: C:\Users\hp\Desktop\HEALTHCARE_PROJECT\models\label_encoder.pkl
Symptoms list saved to: C:\Users\hp\Desktop\HEALTHCARE_PROJECT\models\all_symptoms.pkl

All model files saved successfully!


In [22]:
'''import joblib

model = joblib.load("disease_classifier_model.pkl")
le = joblib.load("label_encoder.pkl")
all_symptoms = joblib.load("all_symptoms.pkl")

# Then use for predictions
predictions = model.predict(new_data)
decoded_predictions = le.inverse_transform(predictions)
For further use of model'''

'import joblib\n\nmodel = joblib.load("disease_classifier_model.pkl")\nle = joblib.load("label_encoder.pkl")\nall_symptoms = joblib.load("all_symptoms.pkl")\n\n# Then use for predictions\npredictions = model.predict(new_data)\ndecoded_predictions = le.inverse_transform(predictions)\nFor further use of model'