In [1]:
import pandas as pd
df = pd.read_csv("symbipredict_2022.csv")   # or full path
print(df.shape)
print(df.columns.tolist()[:40])
# Inspect candidate targets:
for c in ['disease','diagnosis','target','label','outcome','acidity','prognosis','status']:
    if c in df.columns:
        print("Found candidate target:", c, df[c].value_counts(dropna=False).head())
# Fallback: show last column uniques
last = df.columns[-1]
print("Last column:", last, df[last].value_counts(dropna=False).head())

(4961, 133)
['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing', 'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity', 'ulcers_on_tongue', 'muscle_wasting', 'vomiting', 'burning_micturition', 'spotting_ urination', 'fatigue', 'weight_gain', 'anxiety', 'cold_hands_and_feets', 'mood_swings', 'weight_loss', 'restlessness', 'lethargy', 'patches_in_throat', 'irregular_sugar_level', 'cough', 'high_fever', 'sunken_eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish_skin', 'dark_urine', 'nausea', 'loss_of_appetite', 'pain_behind_the_eyes', 'back_pain', 'constipation', 'abdominal_pain']
Found candidate target: acidity acidity
0    4737
1     224
Name: count, dtype: int64
Found candidate target: prognosis prognosis
Fungal Infection       121
Allergy                121
GERD                   121
Chronic Cholestasis    121
Drug Reaction          121
Name: count, dtype: int64
Last column: prognosis prognosis
Fungal Infection       1

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Load CSV
df = pd.read_csv("symbipredict_2022.csv")

# ✅ Use 'prognosis' as target (disease name)
TARGET = "prognosis"
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing: symptoms are categorical (0/1)
cat_cols = X_train.columns.tolist()
preproc = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# Model
model = Pipeline([
    ("preproc", preproc),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

# Train
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model
joblib.dump(model, "disease_predictor.pkl")
print("✅ Model saved as disease_predictor.pkl")

                               precision    recall  f1-score   support

                         AIDS       1.00      1.00      1.00        24
                         Acne       1.00      1.00      1.00        24
          Alcoholic Hepatitis       1.00      1.00      1.00        24
                      Allergy       1.00      1.00      1.00        24
                    Arthritis       1.00      1.00      1.00        24
             Bronchial Asthma       1.00      1.00      1.00        24
         Cervical Spondylosis       1.00      1.00      1.00        24
                   Chickenpox       1.00      1.00      1.00        25
          Chronic Cholestasis       1.00      1.00      1.00        24
                  Common Cold       1.00      1.00      1.00        24
                       Dengue       1.00      1.00      1.00        25
                    Diabetes        1.00      1.00      1.00        24
Dimorphic Hemmorhoids (piles)       1.00      1.00      1.00        24
     

In [None]:
import pandas as pd
import joblib


model = joblib.load("disease_predictor.pkl")

# Load dataset (just to get the full list of symptom columns)
df = pd.read_csv("symbipredict_2022.csv")
TARGET = "prognosis"
all_symptoms = [c for c in df.columns if c != TARGET]

# Example patient with only a few symptoms present
patient_symptoms = {
    "Fever": 1,
    "headache": 1,
    "cough": 0
    
}

# Build a complete row with ALL symptoms
patient_full = {col: patient_symptoms.get(col, 0) for col in all_symptoms}

# Convert to DataFrame
X_new = pd.DataFrame([patient_full])

# Predict
prediction = model.predict(X_new)
print("Predicted disease:", prediction[0])

Predicted disease: Paralysis (brain hemorrhage)
