In [10]:
import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [11]:
DATA_PATH = "data/imdb_train_with_minor_disease.csv"

# Load the labelled dataset and normalise the free-text column in one chain,
# so re-running this cell always starts from the file rather than from an
# already-modified frame.
#
# Rows missing either the text or the label are dropped *before* the string
# cast: `astype(str)` would otherwise turn NaN into the literal text "nan",
# which would then be treated as ordinary symptom text downstream.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=["symptoms", "disease"])
    .assign(symptoms=lambda frame: frame["symptoms"].astype(str).str.lower())
)

X_text = df["symptoms"]  # model input: lower-cased symptom text
y = df["disease"]        # target label


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical across re-runs.
split_parts = train_test_split(X_text, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [13]:
# Text features: TF-IDF over unigrams and bigrams with English stop words
# removed, keeping at most 6000 terms.
tfidf_step = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=6000,
)

# Classifier: 300-tree random forest with no depth limit, seeded so that
# training is repeatable; n_jobs=-1 lets it use every available core.
forest_step = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1,
)

# Bundle both stages so raw text can be passed straight to fit/predict.
pipeline = Pipeline(steps=[
    ("tfidf", tfidf_step),
    ("model", forest_step),
])


In [14]:
# Fit the vectorizer and the classifier together on the training split only,
# so the held-out rows play no part in the learned vocabulary or the trees.
pipeline.fit(X_train, y_train)

# The status message previously contained mis-decoded bytes (UTF-8 read as
# cp1252); the intended check-mark glyph is restored here.
print("✅ Pipeline trained successfully")

✅ Pipeline trained successfully


In [15]:
# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)  # fraction of exact label matches

# Report as a percentage with two decimals. The leading glyph was previously
# mis-decoded (UTF-8 read as cp1252); the intended emoji is restored here.
print(f"🎯 Model Accuracy: {accuracy * 100:.2f}%")


🎯 Model Accuracy: 93.95%


In [18]:
# Persist the whole fitted pipeline (vectorizer + classifier) as a single
# artifact, so inference code can feed it raw text directly.
OUTPUT_DIR = "ml_models"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if the folder already exists

joblib.dump(pipeline, os.path.join(OUTPUT_DIR, "model.pkl"))

# The status message previously contained mis-decoded bytes (UTF-8 read as
# cp1252); the intended check-mark glyph is restored here.
print("✅ model.pkl saved successfully (pipeline)")

✅ model.pkl saved successfully (pipeline)


In [19]:
# Sanity check: count the terms the fitted TF-IDF stage actually learned.
fitted_tfidf = pipeline.named_steps["tfidf"]
vocab_size = len(fitted_tfidf.vocabulary_)
print("LOCAL vocab size:", vocab_size)

LOCAL vocab size: 487


In [21]:
# Round-trip check: reload the artifact from the same location it was written
# to (derived from OUTPUT_DIR rather than a second hard-coded path, so the two
# cannot drift apart) and run a single raw-text prediction through it.
#
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that were produced by a trusted source.
model = joblib.load(os.path.join(OUTPUT_DIR, "model.pkl"))
print(model.predict(["fever headache cough"]))

['Paralysis (brain hemorrhage)']
