In [None]:
!conda create -n disease_ai python=3.10

In [None]:
!pip install pandas scikit-learn joblib nltk

In [None]:
import pandas as pd
import joblib
import nltk
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [None]:
# Fetch the NLTK "punkt" resource (needs network access on first run).
nltk.download("punkt")

In [None]:
# Toy training set: each record pairs a free-text case history with its diagnosis.
# Keeping text and label side by side makes it hard to misalign the two columns.
labelled_cases = [
    ("High fever and severe cough with fatigue", "Flu"),
    ("Severe headache and light sensitivity", "Migraine"),
    ("Fever with chills and sweating", "Malaria"),
    ("Vomiting and stomach pain", "Food Poisoning"),
    ("Chest pain and breathing difficulty", "Heart Disease"),
    ("Runny nose and mild fever", "Common Cold"),
    ("Body pain and high temperature", "Flu"),
    ("Nausea and loose motion", "Food Poisoning"),
]

# Column-oriented view of the same records: "history" is the model input,
# "disease" is the target label.
data = {
    "history": [history for history, _ in labelled_cases],
    "disease": [disease for _, disease in labelled_cases],
}

df = pd.DataFrame(data)
df.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): with only a handful of rows in `df`, the held-out set is tiny and may
# contain labels the model never saw in training, so treat the printed accuracy as a
# smoke test rather than a real performance estimate.
X_train, X_test, y_train, y_test = train_test_split(
    df["history"],
    df["disease"],
    test_size=0.2,
    random_state=42,
)

# Text -> TF-IDF features (English stop words removed) -> 200-tree random forest.
pipeline_steps = [
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42)),
]
model = Pipeline(pipeline_steps)
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out histories.
predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print("Model Accuracy:", test_accuracy)

In [None]:
# Persist the fitted pipeline (vectoriser + classifier) to the working directory
# so it can be reloaded later without retraining.
model_path = "disease_model.pkl"
joblib.dump(model, model_path)
print("Model saved successfully!")

In [None]:
# Ask for a free-text case history and list the three most probable diseases
# according to the fitted `model` pipeline.
user_input = input("Enter Patient Case History: ").strip()

if not user_input:
    # A blank history contains no terms for the vectoriser to match, so any
    # ranking printed for it would be noise; say so instead of "predicting".
    print("No case history entered; nothing to predict.")
else:
    # predict_proba returns one row per input; we passed a single history.
    probabilities = model.predict_proba([user_input])[0]
    classes = model.classes_

    # argsort is ascending: take the last three indices, then reverse them so
    # the most probable class comes first.
    top_indices = np.argsort(probabilities)[-3:][::-1]

    print("\nTop Possible Diseases:")
    for i in top_indices:
        print(f"{classes[i]} - {round(probabilities[i]*100, 2)}%")

In [None]:
# Example input: Patient has high fever and body pain