In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
import sys


In [None]:
print("Loading data from CSV file...")
try:
    # Load the CSV file
    df = pd.read_csv("disease-datasets-cleaned.csv")
    print(f"Successfully loaded data with {len(df)} rows.")
except Exception as e:
    print(f"Error loading CSV file: {e}")

Loading data from CSV file...
Successfully loaded data with 501 rows.


In [None]:
df.shape

(501, 4)

In [4]:
df.head(20)

Unnamed: 0,Code,Name,Symptoms,Treatments
0,Code,Name,Symptoms,Treatments
1,1,Panic disorder,"""Palpitations","Sweating, Trembling, Shortness of breath, Fea..."
2,2,Vocal cord polyp,"""Hoarseness","Vocal Changes, Vocal Fatigue"",""Voice Rest, Sp..."
3,3,Turner syndrome,"""Short stature","Gonadal dysgenesis, Webbed neck, Lymphedema"",..."
4,4,Cryptorchidism,"""Absence or undescended testicle(s)","empty scrotum, smaller or underdeveloped test..."
5,5,Ethylene glycol poisoning-1,"""Nausea","vomiting, abdominal pain, General malaise, we..."
6,6,Ethylene glycol poisoning-2,"""Metabolic acidosis","apid breathing, rapid heart rate, confusion, ..."
7,7,Ethylene glycol poisoning-3,"""Decreased urine output","swelling in the legs and ankles, and signs of..."
8,8,Atrophic vaginitis,"""Vaginal dryness","Vaginal burning, frequent urination, urinary ..."
9,9,Fracture,"""Pain","Swelling, Bruising, Deformity, Difficulty mov..."


In [5]:
# Display some basic information about the dataset
print("\nDataset columns:", df.columns.tolist())
print(f"Number of unique diseases: {df['Name'].nunique()}")


Dataset columns: ['Code', 'Name', 'Symptoms', 'Treatments']
Number of unique diseases: 464


In [6]:
print([repr(col) for col in df.columns])



["'Code'", "'Name'", "'Symptoms'", "'Treatments'"]


In [7]:
df.columns = df.columns.str.strip()


In [8]:
print(df.head())  # Check if 'Symptoms' appears correctly
print(df.shape)   # Ensure it's not empty


   Code              Name                             Symptoms  \
0  Code              Name                             Symptoms   
1     1    Panic disorder                        "Palpitations   
2     2  Vocal cord polyp                          "Hoarseness   
3     3   Turner syndrome                       "Short stature   
4     4    Cryptorchidism  "Absence or undescended testicle(s)   

                                          Treatments  
0                                         Treatments  
1   Sweating, Trembling, Shortness of breath, Fea...  
2   Vocal Changes, Vocal Fatigue","Voice Rest, Sp...  
3   Gonadal dysgenesis, Webbed neck, Lymphedema",...  
4   empty scrotum, smaller or underdeveloped test...  
(501, 4)


In [9]:
# Function to clean and extract individual symptoms
def extract_symptoms(symptom_text):
    if not isinstance(symptom_text, str):
        return []
    # Split by comma, clean and normalize each symptom
    symptoms = re.split(r',|\(|\)', symptom_text)
    symptoms = [s.strip().lower() for s in symptoms if s.strip()]
    return symptoms


In [10]:
# Function to clean and extract treatments
def extract_treatments(treatment_text):
    if not isinstance(treatment_text, str):
        return []
    treatments = [t.strip() for t in treatment_text.split(',') if t.strip()]
    return treatments


In [11]:
print("\nProcessing symptoms and treatments...")
# Process symptoms and treatments
df['symptom_list'] = df['Symptoms'].apply(extract_symptoms)
df['treatment_list'] = df['Treatments'].apply(extract_treatments)



Processing symptoms and treatments...


In [12]:
# Create a list of all unique symptoms and treatments
all_symptoms = []
for symptoms in df['symptom_list']:
    all_symptoms.extend(symptoms)
unique_symptoms = sorted(list(set(all_symptoms)))


In [13]:
all_treatments = []
for treatments in df['treatment_list']:
    all_treatments.extend(treatments)
unique_treatments = sorted(list(set(all_treatments)))

In [14]:
print(f"Total unique symptoms: {len(unique_symptoms)}")
print(f"Total unique treatments: {len(unique_treatments)}")


Total unique symptoms: 345
Total unique treatments: 2579


In [15]:
# Binarize symptoms (multi-hot encoding)
print("\nEncoding symptoms...")
mlb = MultiLabelBinarizer()
symptom_features = mlb.fit_transform(df['symptom_list'])
symptom_feature_names = mlb.classes_



Encoding symptoms...


In [16]:
# Create DataFrame for symptom features
symptom_df = pd.DataFrame(symptom_features, columns=symptom_feature_names)

In [17]:
# Create TF-IDF vectorizer for symptoms to handle new symptoms
print("Creating TF-IDF representation of symptoms...")
tfidf_vectorizer = TfidfVectorizer(max_features=100)
symptoms_text = [' '.join(symptoms) for symptoms in df['symptom_list']]
tfidf_symptoms = tfidf_vectorizer.fit_transform(symptoms_text)

Creating TF-IDF representation of symptoms...


In [18]:
# Create clusters of diseases based on symptoms using KMeans
print("\nClustering diseases based on symptom patterns...")
# Determine number of clusters based on dataset size (adjust as needed)
n_clusters = min(max(5, len(df) // 20), 50)  # Between 5 and 50 clusters
print(f"Creating {n_clusters} disease clusters...")

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(tfidf_symptoms)


Clustering diseases based on symptom patterns...
Creating 25 disease clusters...


In [19]:
# Print cluster information
cluster_counts = df['cluster'].value_counts().sort_index()
print("Diseases per cluster:", cluster_counts.to_dict())

Diseases per cluster: {0: 7, 1: 38, 2: 18, 3: 19, 4: 19, 5: 12, 6: 15, 7: 163, 8: 9, 9: 13, 10: 11, 11: 10, 12: 14, 13: 11, 14: 26, 15: 7, 16: 7, 17: 38, 18: 17, 19: 8, 20: 7, 21: 5, 22: 13, 23: 6, 24: 8}


In [20]:
# Split the data for training
print("\nSplitting data for training (70%) and testing (30%)...")
X = symptom_df
y = df['Name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Splitting data for training (70%) and testing (30%)...


In [21]:
# Train Random Forest Classifier
print("\nTraining Random Forest Classifier...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)


Training Random Forest Classifier...


In [22]:
# Evaluate model
print("\nEvaluating model performance...")
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


Evaluating model performance...
Model Accuracy: 0.06
                                                precision    recall  f1-score   support

                                   "Bronchitis       0.00      0.00      0.00         1
                       "Carpal Tunnel Syndrome       0.00      0.00      0.00         0
                                   "Chickenpox       1.00      1.00      1.00         1
                                      "Cholera       0.00      0.00      0.00         1
                                  "Common Cold       0.00      0.00      0.00         1
                                 "Dengue Fever       0.00      0.00      0.00         0
                                   "Diphtheria       0.00      0.00      0.00         1
                                "Ear Infection       1.00      1.00      1.00         1
                                       "Eczema       0.00      0.00      0.00         0
                               "Food Poisoning       0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# Feature importance
print("\nTop 10 most important symptoms for prediction:")
feature_importance = pd.DataFrame({
    'Symptom': symptom_feature_names,
    'Importance': rf_classifier.feature_importances_
}).sort_values('Importance', ascending=False)
print(feature_importance.head(10))


Top 10 most important symptoms for prediction:
                         Symptom  Importance
265                      fatigue    0.008387
320  severe pain in back or side    0.008033
305             persistent cough    0.007641
282                      itching    0.007594
277                   high fever    0.007568
252    difficulty falling asleep    0.007201
272                    heartburn    0.007108
294                       nausea    0.007087
180                     "redness    0.006841
300                 painful rash    0.006822


In [24]:
# Save the model components
print("\nSaving model to disk...")
model_data = {
    'rf_model': rf_classifier,
    'mlb': mlb,
    'vectorizer': tfidf_vectorizer,
    'kmeans': kmeans,
    'symptom_feature_names': symptom_feature_names,
    'disease_data': df
}


Saving model to disk...


In [None]:
with open('prediction_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Model saved successfully as 'disease_prediction_model.pkl'")


Model saved successfully as 'disease_prediction_model.pkl'


In [26]:
# Function for prediction
def predict_disease(symptoms_input, rf_model, mlb, df, vectorizer, kmeans):
    """
    Predict disease and treatments based on input symptoms.
    Can handle both known and unknown symptoms.
    """
    # Preprocess input symptoms
    input_symptoms = [s.strip().lower() for s in symptoms_input.split(',')]
    
    # Create feature vector for known symptoms
    symptom_vector = np.zeros(len(mlb.classes_))
    known_symptoms = []
    unknown_symptoms = []
    
    for symptom in input_symptoms:
        if symptom in mlb.classes_:
            idx = np.where(mlb.classes_ == symptom)[0]
            if len(idx) > 0:
                symptom_vector[idx[0]] = 1
                known_symptoms.append(symptom)
        else:
            unknown_symptoms.append(symptom)
    
    # Track confidence level
    confidence = 1.0
    prediction_method = "direct"
    
    # If we have too many unknown symptoms, use clustering approach
    if len(unknown_symptoms) > len(known_symptoms) // 2 and len(known_symptoms) > 0:
        # Create a TF-IDF vector for the input symptoms
        input_text = ' '.join(input_symptoms)
        input_tfidf = vectorizer.transform([input_text])
        cluster = kmeans.predict(input_tfidf)[0]
        
        # Get diseases in the same cluster
        cluster_diseases = df[df['cluster'] == cluster]['Name'].tolist()
        
        if cluster_diseases:
            print(f"Based on symptom patterns, possible diseases in cluster {cluster}: {', '.join(cluster_diseases[:5])}")
            
            # Get common treatments for the cluster
            cluster_treatments = []
            for treatments in df[df['cluster'] == cluster]['treatment_list']:
                cluster_treatments.extend(treatments)
            
            # Get most common treatments
            treatment_counts = pd.Series(cluster_treatments).value_counts()
            common_treatments = treatment_counts.index[:5].tolist()
            
            # First try to find a disease within the cluster that matches known symptoms
            if known_symptoms:
                for disease in cluster_diseases:
                    disease_symptoms = set(df[df['Name'] == disease]['symptom_list'].iloc[0])
                    if any(symptom in disease_symptoms for symptom in known_symptoms):
                        confidence = 0.7  # Moderate confidence
                        prediction_method = "cluster + symptom match"
                        treatments = df[df['Name'] == disease]['treatment_list'].iloc[0]
                        return disease, treatments, confidence, prediction_method, unknown_symptoms
            
            # Fall back to most frequent disease in cluster
            confidence = 0.5  # Lower confidence
            prediction_method = "cluster-based"
            return cluster_diseases[0], common_treatments, confidence, prediction_method, unknown_symptoms
    
    # If we have enough known symptoms, use the random forest model
    if np.sum(symptom_vector) > 0:
        # Get prediction probabilities
        probabilities = rf_model.predict_proba([symptom_vector])[0]
        max_prob_idx = np.argmax(probabilities)
        max_prob = probabilities[max_prob_idx]
        
        # Get predicted disease
        disease = rf_model.classes_[max_prob_idx]
        
        # Get treatments
        treatments = df[df['Name'] == disease]['treatment_list'].iloc[0]
        
        # Adjust confidence based on prediction probability and unknown symptoms
        confidence = max_prob
        if unknown_symptoms:
            confidence *= (1 - 0.1 * len(unknown_symptoms))
        
        prediction_method = "random forest"
        return disease, treatments, confidence, prediction_method, unknown_symptoms
    
    # If all symptoms are unknown, use a symptom similarity approach
    if unknown_symptoms:
        print("\nAttempting to match unknown symptoms to known symptoms:")
        # Find the closest symptom for each unknown symptom
        for unknown in unknown_symptoms:
            similarities = []
            for known in mlb.classes_:
                # Simple string similarity (could be improved)
                common_chars = set(unknown) & set(known)
                similarity = len(common_chars) / (len(set(unknown)) + len(set(known)) - len(common_chars))
                similarities.append((known, similarity))
            
            closest_symptom = max(similarities, key=lambda x: x[1])
            print(f"  - Unknown symptom '{unknown}' might be similar to '{closest_symptom[0]}' (similarity: {closest_symptom[1]:.2f})")
            
            # Add this symptom to the vector if similarity is high enough
            if closest_symptom[1] > 0.3:  # Threshold can be adjusted
                idx = np.where(mlb.classes_ == closest_symptom[0])[0]
                if len(idx) > 0:
                    symptom_vector[idx[0]] = closest_symptom[1] * 0.8  # Lower confidence based on similarity
        
        if np.sum(symptom_vector) > 0:
            # Get prediction
            probabilities = rf_model.predict_proba([symptom_vector])[0]
            max_prob_idx = np.argmax(probabilities)
            disease = rf_model.classes_[max_prob_idx]
            treatments = df[df['Name'] == disease]['treatment_list'].iloc[0]
            
            confidence = probabilities[max_prob_idx] * 0.6  # Lower confidence due to approximation
            prediction_method = "similarity matching"
            return disease, treatments, confidence, prediction_method, unknown_symptoms
    
    return "Unknown disease", ["Consult a healthcare professional immediately"], 0.1, "fallback", unknown_symptoms


In [None]:
# Run interactive prediction system
print("\n" + "="*50)
print("Disease Prediction System")
print("="*50)
print("Enter symptoms separated by commas (or 'quit' to exit):")

while True:
    user_input = input("\nEnter symptoms: ")
    if user_input.lower() == 'quit':
        break
    
    disease, treatments, confidence, method, unknown = predict_disease(
        user_input, rf_classifier, mlb, df, tfidf_vectorizer, kmeans
    )
    
    print("\nPrediction Results:")
    print(f"Predicted Disease: {disease}")
    print("\n------------------------------------------------------------")
    print(f"Confidence: {confidence:.2f} (using {method} method)")
    print("\n------------------------------------------------------------")
    print(f"Recommended Treatments: {', '.join(treatments)}")
    
    
    if unknown:
        print(f"\nNote: {len(unknown)} symptoms were not recognized in our database.")
        print("This may affect prediction accuracy.")
        
    if confidence < 0.6:
        print("\nWARNING: Low confidence prediction.")
        print("Please consult a healthcare professional for accurate diagnosis.")
        print("\n=====================================================================")
    else:
        print("\nReminder: This is an automated prediction system.")
        print("Always consult healthcare professionals for proper diagnosis and treatment.")
        print("\n=====================================================================")

print("\nProgram completed.")


Disease Prediction System
Enter symptoms separated by commas (or 'quit' to exit):



Enter symptoms:  fever, headache, problem in breathing





Prediction Results:
Predicted Disease:  "HIV/AIDS

------------------------------------------------------------
Confidence: 0.51 (using random forest method)

------------------------------------------------------------
Recommended Treatments: Fatigue, Swollen lymph nodes, Weight loss, Skin rashes", "Antiretroviral therapy (ART), Regular monitoring, Healthy lifestyle"

Note: 1 symptoms were not recognized in our database.
This may affect prediction accuracy.

Please consult a healthcare professional for accurate diagnosis.



In [37]:
# Run interactive prediction system
print("\n" + "="*50)
print("Disease Prediction System")
print("="*50)
print("Enter symptoms separated by commas (or 'quit' to exit):")

while True:
    user_input = input("\nEnter symptoms: ")
    if user_input.lower() == 'quit':
        break
    
    disease, treatments, confidence, method, unknown = predict_disease(
        user_input, rf_classifier, mlb, df, tfidf_vectorizer, kmeans
    )
    
    print("\nPrediction Results:")
    print(f"Predicted Disease: {disease}")
    print("\n------------------------------------------------------------")
    print(f"Confidence: {confidence:.2f} (using {method} method)")
    print("\n------------------------------------------------------------")
    print(f"Recommended Treatments: {', '.join(treatments)}")
    
    
    if unknown:
        print("\n********")
        print(f"\nNote: {len(unknown)} symptoms were not recognized in our database.")
        print("This may affect prediction accuracy.")
        
    if confidence < 0.6:
        print("\n********")
        print("\nWARNING: Low confidence prediction.")
        print("Please consult a healthcare professional for accurate diagnosis.")
        print("\n=====================================================================")
    else:
         print("\n********")
        print("\nReminder: This is an automated prediction system.")
        print("Always consult healthcare professionals for proper diagnosis and treatment.")
        print("\n=====================================================================")

print("\nProgram completed.")
 print("\n====\====\=====\====\=====\====\=====\=====\=====\=====\=====\=====\=====\====\=====")


Disease Prediction System
Enter symptoms separated by commas (or 'quit' to exit):



Enter symptoms:  High fever, Severe headache, Pain behind eyes, Joint and muscle pain, Rash


Based on symptom patterns, possible diseases in cluster 3: Angle-Closure Glaucoma, Complex Regional Pain Syndrome (CRPS), Ischemia of the Bowel, Peritonitis, Acute Pancreatitis

Prediction Results:
Predicted Disease: Angle-Closure Glaucoma

------------------------------------------------------------
Confidence: 0.50 (using cluster-based method)

------------------------------------------------------------
Recommended Treatments: swelling, pain management, nausea, fever, vomiting

Note: 3 symptoms were not recognized in our database.
This may affect prediction accuracy.

Please consult a healthcare professional for accurate diagnosis.




Enter symptoms:  Cough with mucus, Fever, Shortness of breath, Chest pain





Prediction Results:
Predicted Disease:  "Pneumonia

------------------------------------------------------------
Confidence: 0.34 (using random forest method)

------------------------------------------------------------
Recommended Treatments: Fever, Shortness of breath, Chest pain", "Antibiotics (if bacterial), Rest and hydration, Pain relievers (e.g., ibuprofen), Oxygen therapy (if severe)"

Note: 1 symptoms were not recognized in our database.
This may affect prediction accuracy.

Please consult a healthcare professional for accurate diagnosis.




Enter symptoms:  Itchy rash, Fever, Fatigue, Loss of appetite





Prediction Results:
Predicted Disease:  "Multiple Sclerosis

------------------------------------------------------------
Confidence: 0.29 (using random forest method)

------------------------------------------------------------
Recommended Treatments: Numbness, Tingling, Muscle weakness, Vision problems", "Disease-modifying therapies, Physical therapy, Symptom management medications"

Note: 1 symptoms were not recognized in our database.
This may affect prediction accuracy.

Please consult a healthcare professional for accurate diagnosis.




Enter symptoms:  Sneezing, Runny or stuffy nose, Itchy eyes, Watery eyes


Based on symptom patterns, possible diseases in cluster 7: Name, Panic disorder, Vocal cord polyp, Turner syndrome, Ethylene glycol poisoning-2

Prediction Results:
Predicted Disease:  "Hay Fever (Allergic Rhinitis)

------------------------------------------------------------
Confidence: 0.70 (using cluster + symptom match method)

------------------------------------------------------------
Recommended Treatments: Runny or stuffy nose, Itchy eyes, Watery eyes", "Antihistamines (e.g., cetirizine), Nasal corticosteroids, Avoid allergens, Use of air purifiers"

Note: 3 symptoms were not recognized in our database.
This may affect prediction accuracy.

Reminder: This is an automated prediction system.
Always consult healthcare professionals for proper diagnosis and treatment.




Enter symptoms:   Persistent cough, Mucus production, Fatigue, Shortness of breath, Chest discomfort


Based on symptom patterns, possible diseases in cluster 18: Pyloric stenosis, Pica, Postpartum Depression, Pulmonary Eosinophilia, Priapism

Prediction Results:
Predicted Disease:  "Bronchitis

------------------------------------------------------------
Confidence: 0.70 (using cluster + symptom match method)

------------------------------------------------------------
Recommended Treatments: Mucus production, Fatigue, Shortness of breath, Chest discomfort", "Rest and hydration, Cough suppressants, Inhalers for wheezing, Antibiotics (if bacterial)"

Note: 2 symptoms were not recognized in our database.
This may affect prediction accuracy.

Reminder: This is an automated prediction system.
Always consult healthcare professionals for proper diagnosis and treatment.




Enter symptoms:  quit



Program completed.


In [None]:
# Run interactive prediction system
print("\n" + "="*50)
print("Disease Prediction System")
print("="*50)
print("Enter symptoms separated by commas (or 'quit' to exit):")

while True:
    user_input = input("\nEnter symptoms: ")
    if user_input.lower() == 'quit':
        break
    
    disease, treatments, confidence, method, unknown = predict_disease(
        user_input, rf_classifier, mlb, df, tfidf_vectorizer, kmeans
    )
    
    print("\nPrediction Results:")
    print(f"Predicted Disease: {disease}")
    print("\n------------------------------------------------------------")
    print(f"Confidence: {confidence:.2f} (using {method} method)")
    print("\n------------------------------------------------------------")
    print(f"Recommended Treatments: {', '.join(treatments)}")
    print("\n------------------------------------------------------------")
    
    
    if unknown:
        print("\n********")
        print(f"\nNote: {len(unknown)} symptoms were not recognized in our database.")
        print("This may affect prediction accuracy.")
        
    if confidence < 0.6:
        print("\n********")
        print("\nWARNING: Low confidence prediction.")
        print("Please consult a healthcare professional for accurate diagnosis.")
        print("\n=====================================================================")
    else:
        print("\n********")
        print("\nReminder: This is an automated prediction system.")
        print("Always consult healthcare professionals for proper diagnosis and treatment.")
        print("\n=====================================================================")

print("\nProgram completed.")
print("\n====\====\=====\====\=====\====\=====\=====\=====\=====\=====\=====\=====\====\=====")


Disease Prediction System
Enter symptoms separated by commas (or 'quit' to exit):



Enter symptoms:  Pain, Swelling, Redness, Stiffness


Based on symptom patterns, possible diseases in cluster 5: Cellulitis, Thrombophlebitis, Infection of Open Wound, Dermatitis due to Sun Exposure, Scleritis

Prediction Results:
Predicted Disease: Cellulitis

------------------------------------------------------------
Confidence: 0.50 (using cluster-based method)

------------------------------------------------------------
Recommended Treatments: swelling, Swelling, spicy foods)", "Topical medications (e.g., itching

********

Note: 3 symptoms were not recognized in our database.
This may affect prediction accuracy.

********

Please consult a healthcare professional for accurate diagnosis.

