In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Train a disease classifier on multi-hot encoded symptoms and persist both
# the fitted model and the ordered symptom vocabulary it was trained on.

# Load dataset
df = pd.read_csv("dataset.csv")

# Step 1: Identify all symptom columns
symptom_cols = [col for col in df.columns if "Symptom_" in col]
if not symptom_cols:
    # Fail early with a clear message rather than fitting on an empty
    # feature matrix.
    raise ValueError('dataset.csv has no columns containing "Symptom_"')

# Step 2: Replace NaN with empty string and merge symptom columns into a list
df[symptom_cols] = df[symptom_cols].fillna("")
df["symptoms"] = df[symptom_cols].values.tolist()

# Step 3: Drop blank entries and strip surrounding whitespace from the rest.
# Stripping the kept value (not only the emptiness test) matters: otherwise
# " itching" and "itching" would be encoded as two different symptoms.
df["symptoms"] = df["symptoms"].apply(
    lambda row: [s.strip() for s in row if s.strip()]
)

# Step 4: Encode symptoms using MultiLabelBinarizer
# Each row becomes a 0/1 vector with one column per entry of mlb.classes_.
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df["symptoms"])

# Step 5: Set target variable
y = df["Disease"]

# Step 6: Train model
# A fixed seed makes the saved artefact reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Step 7: Save model and symptom list
# The symptom list is saved in mlb.classes_ order, i.e. the column order of X,
# so consumers can rebuild feature vectors that line up with the model.
os.makedirs("model", exist_ok=True)  # joblib.dump does not create directories
joblib.dump(model, "model/disease_model.pkl")
joblib.dump(mlb.classes_.tolist(), "model/symptom_list.pkl")

print("✅ Model trained and saved successfully.")


✅ Model trained and saved successfully.
