In [37]:
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [38]:
# Load the symptom-to-disease table and the per-symptom severity weights.
# The data directory can be overridden with the DISEASE_DATA_DIR environment variable so
# the notebook is not tied to a single machine; the default is the original location.
DATA_DIR = os.environ.get(
    "DISEASE_DATA_DIR",
    r"E:\Mehak Docs\path lab app\chatbot\disease prediction",
)
df_symptoms = pd.read_csv(os.path.join(DATA_DIR, "dataset.csv"))
df_severity = pd.read_csv(os.path.join(DATA_DIR, "Symptom-severity.csv"))

In [39]:
# Preprocess datasets
# Empty symptom slots become the sentinel string 'none' (reassigned, not mutated in place,
# so re-running the cell is harmless).
df_symptoms = df_symptoms.fillna('none')
# Normalise severity names exactly like the symptom names taken from df_symptoms:
# strip -> lower -> spaces to underscores. Without the last step, names that contain an
# inner space never match a feature name and silently receive the default weight.
df_severity["Symptom"] = (
    df_severity["Symptom"]
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)

In [40]:
# Lookup table: symptom name -> severity weight.
# If a name occurs more than once, the last row wins.
severity_dict = {
    name: weight
    for name, weight in zip(df_severity["Symptom"], df_severity["weight"])
}


In [41]:
# Collect every symptom that appears in any column after the first one.
raw_symptoms = pd.unique(df_symptoms.iloc[:, 1:].values.ravel())
# Normalise first, then de-duplicate: two raw spellings that differ only in whitespace or
# case collapse to the same name and must not produce two feature columns.
# dict.fromkeys keeps the first-seen order, so the column order stays deterministic.
normalized_symptoms = (
    symptom.strip().lower().replace(" ", "_")
    for symptom in raw_symptoms
    if symptom != 'none'  # 'none' is the placeholder for an empty symptom slot
)
all_symptoms = list(dict.fromkeys(normalized_symptoms))
all_symptoms

['itching',
 'skin_rash',
 'nodal_skin_eruptions',
 'dischromic__patches',
 'continuous_sneezing',
 'shivering',
 'chills',
 'watering_from_eyes',
 'stomach_pain',
 'acidity',
 'ulcers_on_tongue',
 'vomiting',
 'cough',
 'chest_pain',
 'yellowish_skin',
 'nausea',
 'loss_of_appetite',
 'abdominal_pain',
 'yellowing_of_eyes',
 'burning_micturition',
 'spotting__urination',
 'passage_of_gases',
 'internal_itching',
 'indigestion',
 'muscle_wasting',
 'patches_in_throat',
 'high_fever',
 'extra_marital_contacts',
 'fatigue',
 'weight_loss',
 'restlessness',
 'lethargy',
 'irregular_sugar_level',
 'blurred_and_distorted_vision',
 'obesity',
 'excessive_hunger',
 'increased_appetite',
 'polyuria',
 'sunken_eyes',
 'dehydration',
 'diarrhoea',
 'breathlessness',
 'family_history',
 'mucoid_sputum',
 'headache',
 'dizziness',
 'loss_of_balance',
 'lack_of_concentration',
 'stiff_neck',
 'depression',
 'irritability',
 'visual_disturbances',
 'back_pain',
 'weakness_in_limbs',
 'neck_pain',
 '

In [42]:
# Symptoms that occur in the symptom table but have no entry in severity_dict
missing_symptoms = list(filter(lambda name: name not in severity_dict, all_symptoms))

# Give each of them the minimal severity weight of 1
severity_dict.update(dict.fromkeys(missing_symptoms, 1))

In [43]:
def encode_symptoms_with_severity(row, all_symptoms, severity_dict):
    """Turn one record into a severity-weighted feature vector.

    Args:
        row: pandas Series for a single record; every entry after the first is
            treated as a symptom string.
        all_symptoms: ordered symptom names, one per output position.
        severity_dict: mapping from symptom name to its severity weight.

    Returns:
        A list as long as ``all_symptoms`` holding the symptom's weight where the
        record contains that symptom and 0 elsewhere.

    Raises:
        KeyError: if a symptom present in the record has no entry in
            ``severity_dict``.
    """
    # Same normalisation as the feature names: trim, lower-case, spaces -> underscores.
    cleaned = row[1:].str.strip().str.lower().str.replace(" ", "_")
    present = set(cleaned)

    encoded = []
    for symptom in all_symptoms:
        if symptom in present:
            encoded.append(severity_dict[symptom])
        else:
            encoded.append(0)
    return encoded

# Build the feature matrix: one row per record, one column per entry of all_symptoms.
# result_type='expand' spreads each returned list across separate columns.
X = df_symptoms.apply(
    encode_symptoms_with_severity,
    axis=1,
    result_type='expand',
    args=(all_symptoms, severity_dict),
)


In [11]:
# Quick sanity check: dimensions of the symptom table as (rows, columns).
df_symptoms.shape

(4920, 18)

In [44]:
# Map each disease name to an integer class id; `le` is kept so predictions can be
# translated back to names later.
disease_labels = df_symptoms["Disease"]
le = LabelEncoder()
y = le.fit_transform(disease_labels)

In [45]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Random forest with 100 trees, trained on the severity-weighted features.
model_weighted = RandomForestClassifier(random_state=42, n_estimators=100)
model_weighted.fit(X_train, y_train)


In [None]:
# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train RandomForestClassifier
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)


In [46]:
# Predict on test set.
# Uses `model_weighted`, the only classifier this notebook actually trains; the bare name
# `model` is never defined on a fresh kernel and would raise NameError.
y_pred = model_weighted.predict(X_test)
# zero_division=0 reports 0 instead of warning for classes with no predicted samples.
report = classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0)

In [47]:
# Mean accuracy on the held-out split, scored with the trained `model_weighted`
# (the name `model` is not defined anywhere in this notebook).
# NOTE(review): a perfect score may mean identical rows appear in both the train and
# test splits -- confirm before trusting this number.
accuracy = model_weighted.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
report[:1000]  # Display only the first part of the report for brevity


Model Accuracy: 100.00%


'                                         precision    recall  f1-score   support\n\n(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18\n                                   AIDS       1.00      1.00      1.00        30\n                                   Acne       1.00      1.00      1.00        24\n                    Alcoholic hepatitis       1.00      1.00      1.00        25\n                                Allergy       1.00      1.00      1.00        24\n                              Arthritis       1.00      1.00      1.00        23\n                       Bronchial Asthma       1.00      1.00      1.00        33\n                   Cervical spondylosis       1.00      1.00      1.00        23\n                            Chicken pox       1.00      1.00      1.00        21\n                    Chronic cholestasis       1.00      1.00      1.00        15\n                            Common Cold       1.00      1.00      1.00        23\n             

In [48]:
# Test with example symptoms
example_symptoms = ["itching", "nodal_skin_eruptions", "continuous_sneezing", "shivering", "chills", "joint_pain", "stomach_pain", "acidity"]
# Normalise once (trim, lower-case, spaces -> underscores), matching how the feature
# names were built, and keep the result in a set for O(1) membership tests.
example_set = {s.strip().lower().replace(" ", "_") for s in example_symptoms}
# The model was trained on severity-weighted features, so the inference vector must carry
# the same weights rather than a plain 0/1 indicator.
input_vector = [severity_dict[symptom] if symptom in example_set else 0 for symptom in all_symptoms]
# `model_weighted` is the trained classifier; `model` is never defined in this notebook.
predicted_class_index = model_weighted.predict([input_vector])[0]
predicted_disease = le.inverse_transform([predicted_class_index])[0]

predicted_disease

'Allergy'

In [49]:
# Persist the artefacts needed at inference time. `pickle` and `json` come from the
# notebook's import cell, so they are not re-imported here.
# NOTE(review): the model is trained on severity-weighted features, but severity_dict is
# not written out here -- confirm that consumers obtain the weights elsewhere.

# Save trained model
with open("model_weighted.pkl", "wb") as f:
    pickle.dump(model_weighted, f)

# Save label encoder
with open("le_disease.pkl", "wb") as f:
    pickle.dump(le, f)

# Save symptom columns (used in input vector)
with open("columns.json", "w") as f:
    json.dump(all_symptoms, f)

"✅ Model, label encoder, and symptom column list saved successfully."


'✅ Model, label encoder, and symptom column list saved successfully.'