In [6]:
%pip install pandas nltk scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl (11.1 MB)
Using cached scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl (22.4 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Advice text shown to the user, keyed by the condition label the classifier predicts.
knowledge_base = dict(
    flu="Take plenty of rest and drink fluids. Use paracetamol if needed.",
    cold="Stay warm, drink hot fluids, and rest.",
    gastritis="Eat light meals and avoid spicy food. Consider antacids.",
    migraine="Rest in a quiet, dark room and use pain relievers if necessary.",
)

# Sample dataset for symptom classification.
# Each pair is (symptom description, condition label); keeping them side by side
# makes it harder for the two columns to drift out of alignment when rows are edited.
_labelled_symptoms = [
    ("fever and cough", "flu"),
    ("cold and headache", "cold"),
    ("stomach pain", "gastritis"),
    ("cough", "flu"),
    ("headache", "migraine"),
]
data = {
    "symptom": [symptom for symptom, _condition in _labelled_symptoms],
    "condition": [condition for _symptom, condition in _labelled_symptoms],
}

In [8]:
import pandas as pd

# Build the training table: each key of `data` becomes a column.
df = pd.DataFrame.from_dict(data, orient="columns")


In [9]:
# Download the NLTK tokenizer models and stop-word list needed for text preprocessing
import nltk
# Tokenizer resources first (both are requested, in the same order as before).
for tokenizer_package in ('punkt_tab', 'punkt'):
    nltk.download(tokenizer_package)

# Stop-word corpus; left as the cell's final expression so its return value is displayed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/balaji/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/balaji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/balaji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Hold the English stop-word list in a set for fast membership checks.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a piece of free text for vectorisation.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are not purely alphanumeric (per ``str.isalnum``) or that appear in
    ``stop_words`` are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stop words.
        if not token.isalnum() or token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Clean every symptom description and keep the result alongside the raw text.
cleaned_symptoms = df['symptom'].apply(preprocess_text)
df['processed_symptom'] = cleaned_symptoms

# Convert the cleaned text into bag-of-words count vectors.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_symptoms)
y = df['condition']

# NOTE(review): the sample data has only five rows, so test_size=0.2 holds out a
# single row. Any condition that appears only in that row is absent from training,
# and the held-out split is not used in the cells shown here. Confirm this is intended.
Xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(Xtrain, ytrain)


In [14]:
# Testing: classify one free-text symptom and look up the matching advice.

# Named `user_symptom` rather than `input` so the built-in input() function
# is not shadowed for the rest of the kernel session.
user_symptom = "headache"

# Process the text the same way the training data was processed
processed_text = preprocess_text(user_symptom)
vector_embed = vectorizer.transform([processed_text])

# Predict
predicted_res = model.predict(vector_embed)  # array with one label per input row

identified_dise = predicted_res[0]

print(f"prediced : {identified_dise}")

# .get() keeps the cell from raising KeyError if the model returns a label
# that has no entry in knowledge_base.
solution = knowledge_base.get(identified_dise, "No advice available for this condition.")
print(f"Solution : {solution}")


prediced : migraine
Solution : Rest in a quiet, dark room and use pain relievers if necessary.
