In [10]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Using cached click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2026.1.15-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.3-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.2-py3-none-any.whl (1.5 MB)
Downloading regex-2026.1.15-cp312-cp312-win_amd64.whl (277 kB)
Using cached click-8.3.1-py3-none-any.whl (108 kB)
Using cached tqdm-4.67.3-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk

   ---------------------------------------- 0/4 [tqdm]
   ---------------------------------------- 0/4 [tqdm]
   ---------------------------------------- 0/4 [tqdm]
   ---------- ----------------------------- 1/4 [regex]
   ---------- ----------------------------- 1/4 [regex]
   -------------------- ------------------- 2/4 [click]
   -------------------- ---

In [38]:
!pip install scikit-learn pandas numpy joblib



In [42]:
import pandas as pd
import numpy as np
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# -------------------------------
# 1️⃣ Create Larger Dataset
# -------------------------------

data = {
    "history": [
        # Flu
        "Fever and body pain with cough",
        "High temperature and cold",
        "Chills and sore throat",
        "Cough with mild fever",

        # Food Poisoning
        "Vomiting and stomach pain",
        "Loose motion and nausea",
        "Diarrhea and abdominal cramps",
        "Food poisoning after eating outside",

        # Heart Disease
        "Chest pain and sweating",
        "Shortness of breath with chest tightness",
        "Severe chest pressure and dizziness",
        "Pain in left arm with chest discomfort",

        # Migraine
        "Severe headache and nausea",
        "Head pain with light sensitivity",
        "Throbbing headache on one side",
        "Migraine with vomiting"
    ],
    "disease": [
        "Flu","Flu","Flu","Flu",
        "Food Poisoning","Food Poisoning","Food Poisoning","Food Poisoning",
        "Heart Disease","Heart Disease","Heart Disease","Heart Disease",
        "Migraine","Migraine","Migraine","Migraine"
    ]
}

df = pd.DataFrame(data)

# -------------------------------
# 2️⃣ Train-Test Split
# -------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    df["history"],
    df["disease"],
    test_size=0.25,
    random_state=42
)

# -------------------------------
# 3️⃣ Build AI Model Pipeline
# -------------------------------

model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])

# -------------------------------
# 4️⃣ Train Model
# -------------------------------

model.fit(X_train, y_train)

# -------------------------------
# 5️⃣ Accuracy Check
# -------------------------------

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy: {:.2f}%".format(accuracy * 100))

# Cross Validation
scores = cross_val_score(model, df["history"], df["disease"], cv=4)
print("Cross Validation Accuracy: {:.2f}%".format(scores.mean() * 100))

# -------------------------------
# 6️⃣ Save Model
# -------------------------------

joblib.dump(model, "disease_ai_model.pkl")
print("Model Saved Successfully!")

# -------------------------------
# 7️⃣ Disease Prediction System
# -------------------------------

print("\n=== Disease Prediction System ===")
user_input = input("Enter Patient Case History: ")

probabilities = model.predict_proba([user_input])[0]
classes = model.classes_

top_indices = np.argsort(probabilities)[-3:][::-1]

print("\nTop Possible Diseases:")
for i in top_indices:
    print(f"{classes[i]} - {round(probabilities[i]*100, 2)}%")

Test Accuracy: 50.00%
Cross Validation Accuracy: 50.00%
Model Saved Successfully!

=== Disease Prediction System ===


Enter Patient Case History:  cancer



Top Possible Diseases:
Food Poisoning - 37.0%
Migraine - 28.5%
Heart Disease - 22.0%


In [54]:
# Predict on test data
predictions = model.predict(X_test)

# Check accuracy
print("Model Accuracy:", accuracy_score(y_test, predictions))

Model Accuracy: 0.5


In [59]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.5


In [28]:
accuracy = accuracy_score(y_test, predictions)
print("Model Accuracy: {:.2f}%".format(accuracy * 100))

Model Accuracy: 0.00%


In [48]:
print("Actual Values:   ", list(y_test))
print("Predicted Values:", list(predictions))

Actual Values:    ['Flu', 'Flu', 'Food Poisoning', 'Migraine']
Predicted Values: ['Flu', 'Common Cold']


In [57]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[1 1 0]
 [0 0 1]
 [0 0 1]]
