<a href="https://colab.research.google.com/github/AqueeqAzam/disease-diagnosis-project-using-expert-system-ml-model-and-nlp/blob/main/ai_projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Disease diagnosis project using an expert system, machine learning, and NLP

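## Generate a random dataset

Each symptom flag and the disease label are sampled independently and uniformly at random, so the labels have no real relationship to the symptoms; the dataset only serves to exercise the pipeline end to end.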

In [32]:
import pandas as pd
import numpy as np

# Seed a dedicated generator so that re-running the notebook from a fresh
# kernel writes exactly the same CSV every time.
SEED = 42
rng = np.random.default_rng(SEED)

# Define the number of rows
n_rows = 10000

# Define the columns
columns = ["Fever", "Cough", "Headache", "Fatigue", "Disease"]

# Generate the data: every symptom flag and the disease label are drawn
# independently and uniformly, so the labels are not correlated with the
# symptoms in any way.
data = {
    "Fever": rng.choice([True, False], n_rows),
    "Cough": rng.choice([True, False], n_rows),
    "Headache": rng.choice([True, False], n_rows),
    "Fatigue": rng.choice([True, False], n_rows),
    "Disease": rng.choice(["COVID-19", "Flu", "Common Cold", "Healthy"], n_rows)
}

# Create the DataFrame (the explicit column list fixes the column order)
df = pd.DataFrame(data, columns=columns)

# Save to CSV without the index so the file holds only the five data columns
df.to_csv("medical_data.csv", index=False)

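## Diagnosis pipeline

The rule-based expert system is tried first; if none of its rules match, a random-forest classifier trained on the generated dataset supplies the diagnosis.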

In [33]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the `en_core_web_sm` spaCy pipeline; ml_model() calls `nlp` to tokenize
# the user's free-text symptoms.
nlp = spacy.load("en_core_web_sm")

# Load the symptom/disease table from medical_data.csv into the module-level
# `df`; analyze_symptoms() passes this frame to ml_model() as training data.
df = pd.read_csv("medical_data.csv")

# define expert system rules
def expert_system(symptoms):
  """Return the first rule-based diagnosis whose required symptoms are all reported.

  Args:
    symptoms: Free-text symptom description (e.g. "Fever, Cough") or an
      iterable of symptom names. Matching is case-insensitive; for a string
      it is a substring test, for any other iterable a membership test.

  Returns:
    The name of the first knowledge-base rule (in declaration order) whose
    symptoms are all present, or the sentinel string "Not disease found"
    when no rule matches.
  """
  kb = {
    "COVID-19": ["Fever", "Cough", "Fatigue"],
    "Flu": ["Fever", "Headache", "Fatigue"],
    "Common Cold": ["Cough", "Headache"],
    "Healthy": []
  }

  # Normalise case once so "fever" and "Fever" are treated the same.
  if isinstance(symptoms, str):
    reported = symptoms.casefold()
  else:
    reported = [str(item).casefold() for item in symptoms]

  for diagnosis, symptoms_list in kb.items():
    # all() over an empty list is vacuously True, so a rule with no required
    # symptoms ("Healthy") would match every input and hide the fallback
    # sentinel; such rules are skipped.
    if not symptoms_list:
      continue
    if all(symptom.casefold() in reported for symptom in symptoms_list):
      return diagnosis

  # Only give up after every rule has been checked (returning from inside the
  # loop would mean only the first rule is ever evaluated).
  return "Not disease found"

# define machine learning model
def ml_model(symptoms, data):
  """Train a random forest on the symptom table and classify free-text symptoms.

  Each training row is converted into a short text document listing the names
  of the symptom columns that are set for that row. The TF-IDF vocabulary is
  therefore built from the symptom names, and the user's text is vectorized
  into the same feature space as the training data.

  Args:
    symptoms: Free-text symptom description entered by the user.
    data: DataFrame containing a "Disease" label column; every other column
      is treated as a symptom flag.

  Returns:
    The predicted label for the given symptoms (one of the values found in
    data["Disease"]).
  """
  # Every column except the label is a symptom indicator.
  feature_frame = data.drop("Disease", axis=1)
  symptom_names = list(feature_frame.columns)

  # NOTE(review): assumes the symptom columns hold real booleans (as produced
  # by reading the generated CSV); string or missing values would all coerce
  # to True here -- confirm if the data source changes.
  flags = feature_frame.to_numpy(dtype=bool)

  # One document per row, e.g. "Fever Cough" -- the same kind of text the
  # user types, so training and inference share a vocabulary.
  documents = [
    " ".join(name for name, present in zip(symptom_names, row) if present)
    for row in flags
  ]

  # Fit the vectorizer on the symptom-name documents
  vect = TfidfVectorizer()
  x = vect.fit_transform(documents)

  # Select the 'Disease' column for labels
  y = data['Disease']

  # Train on the same 80% split as before; the held-out part is not used in
  # this function, so it is discarded explicitly.
  x_train, _, y_train, _ = train_test_split(x, y, test_size=0.2, random_state=42)

  # Fixed random_state so repeated calls on the same data give the same model
  model = RandomForestClassifier(random_state=42)
  model.fit(x_train, y_train)

  # Keep only alphabetic tokens from the user's text, then vectorize with the
  # fitted vocabulary. Words outside the vocabulary are ignored, so text with
  # no known symptom name becomes an all-zero vector.
  processed_symptoms = ' '.join([token.text for token in nlp(symptoms) if token.is_alpha])
  symptoms_vectorized = vect.transform([processed_symptoms])
  prediction = model.predict(symptoms_vectorized)

  return prediction[0]

# Define function to analyze symptoms
def analyze_symptoms(symptoms):
  """Diagnose from free-text symptoms, preferring rules over the ML model.

  The expert system is consulted first. Only when it returns its
  "Not disease found" sentinel is the machine-learning model asked, using the
  module-level `df` as training data.

  Args:
    symptoms: Free-text symptom description.

  Returns:
    The expert-system diagnosis if a rule matched, otherwise the model's
    prediction.
  """
  rule_result = expert_system(symptoms)

  # Guard clause: a rule matched, so there is no need to train the model.
  if rule_result != "Not disease found":
    return rule_result

  return ml_model(symptoms, df)

# test the system
# The dataset is already loaded into the module-level `df`, which
# analyze_symptoms() reads, so the CSV is not read from disk a second time.
symptoms = input("Enter your symptoms: ")
diagnosis = analyze_symptoms(symptoms)
print("Diagnosis:", diagnosis)







Enter your symptoms: Fever, Headache
Diagnosis: Common Cold
