In [1]:
# Standard Libraries
import os
import re
import subprocess
import traceback
import pickle
import spacy
from collections import Counter

# Data Handling and Processing
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns  

# Scikit-Learn: Preprocessing and Model Selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Scikit-Learn: Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
# Scikit-Learn: Evaluation Metrics
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

# Scikit-Learn: Class Weights
from sklearn.utils.class_weight import compute_class_weight

# Scikit-Learn: Feature Engineering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity

# Transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import pipeline   

# Imbalanced Data Handling
from imblearn.combine import SMOTEENN

# TensorFlow & Keras
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Dense, Dropout, Concatenate, BatchNormalization
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (
    EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Joblib (for saving/loading models)
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from itertools import combinations

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

import gradio as gr
import speech_recognition as sr
from gtts import gTTS

import datetime


2025-03-06 19:20:02.606736: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 1. Load and examine the data
disease_symptoms_data = pd.read_csv('Pivot_Resource/disease_symptoms.csv')


In [3]:
disease_symptoms_data.head()

Unnamed: 0,Disease,Symptoms
0,Malaria,"fever, headache, weakness, cough, vomiting"
1,Flu,"fever, headache, runny nose, muscle aches, fat..."
2,Measles,"fever, red rash, conjunctivitis, cough, rhinitis"
3,Pneumonia,"sputum production, fever, chest pain, cough, c..."
4,Tinnitus,"hissing, buzzing, clicking, roaring, ringing i..."


In [4]:
#list Symptoms
disease_symptoms_data['Symptoms'].unique()

array(['fever, headache, weakness, cough, vomiting',
       'fever, headache, runny nose, muscle aches, fatigue',
       'fever, red rash, conjunctivitis, cough, rhinitis',
       'sputum production, fever, chest pain, cough, chills, fatigue, shortness of breath',
       'hissing, buzzing, clicking, roaring, ringing in the ears',
       'vomiting, headache, fatigue, agitation',
       'frequent urination, inability to sleep through the night without urinating, urinary leakage, difficulty urinating',
       'respiratory disorder, vomiting, rapid pulse, diarrhea, low blood pressure, malaise, runny nose, abdominal pain, eczema',
       'hair loss, gradual thinning of hair',
       'disorientation in time, memory loss',
       'swollen tonsils, fever, difficulty swallowing, sore throat',
       'headache, hot flashes, absence of menstruation',
       'dry skin, hair loss, weight gain, sexual dysfunction, decreased muscle mass and strength, decreased testicular volume',
       'breathing di

In [5]:
def expand_dataset_simple(df):
    """
    Simpler approach to expand the dataset while maintaining symptom relationships
    """
    expanded_data = []
    
    for _, row in df.iterrows():
        disease = row['Disease']
        symptoms = row['Symptoms']
        
        # Clean symptoms
        symptoms = re.sub(r'[\[\]\'"]', '', symptoms)
        symptom_list = [s.strip().lower() for s in symptoms.split(',')]
        symptom_list = [s for s in symptom_list if s]
        
        # Original entry
        original_symptoms = ', '.join(symptom_list)
        expanded_data.append({
            'Disease': disease,
            'Symptoms': original_symptoms
        })
        # Create 5 variations for each disease
        for _ in range(5):
            if len(symptom_list) > 3:
                # Keep 80% of symptoms, randomly selected
                n_symptoms = max(3, int(0.8 * len(symptom_list)))
                selected_symptoms = np.random.choice(symptom_list, size=n_symptoms, replace=False)
            else:
                # For few symptoms, use all but in different order
                selected_symptoms = np.random.permutation(symptom_list)
            
            expanded_data.append({
                'Disease': disease,
                'Symptoms': ', '.join(selected_symptoms)
            })
    
    return pd.DataFrame(expanded_data)      

In [6]:
# Create expanded dataset
expanded_df = expand_dataset_simple(disease_symptoms_data)
print(f"Original dataset size: {len(disease_symptoms_data)}")
print(f"Expanded dataset size: {len(expanded_df)}")

Original dataset size: 108
Expanded dataset size: 648


In [7]:
# Print distribution of samples
disease_counts = expanded_df['Disease'].value_counts()
print("\nSamples per disease:")
print(f"Minimum samples: {disease_counts.min()}")
print(f"Maximum samples: {disease_counts.max()}")
print(f"Average samples: {disease_counts.mean():.1f}")


Samples per disease:
Minimum samples: 6
Maximum samples: 6
Average samples: 6.0


In [8]:
# Prepare features
def preprocess_symptoms(text):
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[\[\]\'"]', '', text)
    symptoms = [s.strip().lower() for s in text.split(',')]
    return ' '.join([s for s in symptoms if s])


In [9]:
expanded_df.head(60)

Unnamed: 0,Disease,Symptoms
0,Malaria,"fever, headache, weakness, cough, vomiting"
1,Malaria,"weakness, fever, vomiting, headache"
2,Malaria,"weakness, fever, headache, cough"
3,Malaria,"fever, weakness, cough, vomiting"
4,Malaria,"headache, vomiting, fever, weakness"
5,Malaria,"weakness, vomiting, cough, fever"
6,Flu,"fever, headache, runny nose, muscle aches, fat..."
7,Flu,"fever, headache, muscle aches, fatigue"
8,Flu,"fever, runny nose, fatigue, muscle aches"
9,Flu,"fatigue, runny nose, fever, headache"


In [10]:
# Create features and target
X = expanded_df['Symptoms'].apply(preprocess_symptoms)
y = expanded_df['Disease']

In [11]:
# Create TF-IDF features
vectorizer = TfidfVectorizer(
    max_features=500,
    min_df=1,
    max_df=0.95,
    ngram_range=(1, 2)
)

X_vectorized = vectorizer.fit_transform(X)
print(f"\nFeature matrix shape: {X_vectorized.shape}")



Feature matrix shape: (648, 500)


In [12]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y,
    test_size=0.2,
    random_state=42
)

In [13]:
# Train Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

In [14]:
# Fit the model
rf_model.fit(X_train, y_train)

In [15]:
# Make predictions
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

In [16]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.3f}")


Model Accuracy: 0.877


In [17]:
# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                           precision    recall  f1-score   support

               Acetonemia       1.00      1.00      1.00         2
         Acute Bronchitis       0.00      0.00      0.00         0
                 Alopecia       1.00      1.00      1.00         1
               Amenorrhea       1.00      1.00      1.00         2
                  Amnesia       1.00      1.00      1.00         3
               Andropause       1.00      1.00      1.00         3
          Angina Pectoris       1.00      0.33      0.50         3
                  Angioma       1.00      1.00      1.00         1
                 Anorexia       1.00      1.00      1.00         1
                   Anuria       1.00      0.67      0.80         3
                  Anxiety       1.00      1.00      1.00         1
             Appendicitis       1.00      0.33      0.50         3
                   Asthma       1.00      1.00      1.00         2
                   Autism       1.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Function for making predictions
def predict_disease(symptoms, confidence_threshold=0.30):
    """
    Make disease predictions and suggest additional symptoms to look for
    """
    # Preprocess symptoms
    processed_symptoms = preprocess_symptoms(symptoms)
    
    # Split input symptoms into a list
    input_symptoms = set(processed_symptoms.split())
    
    # Get all known symptoms from the vectorizer
    known_symptoms = set(vectorizer.get_feature_names_out())
    
    # Check if there are any known symptoms in the input
    matching_symptoms = input_symptoms.intersection(known_symptoms)
    
    # Create a dictionary of all diseases and their symptoms from your original dataset
    disease_symptom_map = {}
    for _, row in disease_symptoms_data.iterrows():
        disease = row['Disease']
        all_symptoms = set(preprocess_symptoms(row['Symptoms']).split())
        disease_symptom_map[disease] = all_symptoms
    
    # Vectorize
    X_new = vectorizer.transform([processed_symptoms])

    # Get probabilities
    proba = rf_model.predict_proba(X_new)[0]

        # Get all predictions with their probabilities
    predictions = []
    for idx, probability in enumerate(proba):
        disease = rf_model.classes_[idx]
        predictions.append((disease, probability))
    
    # Sort by probability
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Format output
    output = "Based on the symptoms you described:\n\n"
    output += f"Your symptoms: {', '.join(input_symptoms)}\n\n"
    
    # Add predictions with additional symptoms
    output += "Possible conditions and related symptoms to watch for:\n\n"
    
    for disease, prob in predictions[:5]:  # Show top 5 predictions
        # Get all symptoms for this disease
        disease_symptoms = disease_symptom_map.get(disease, set())
        
        # Find symptoms not mentioned by the user
        missing_symptoms = disease_symptoms - input_symptoms
        
        # Calculate symptom match percentage
        if disease_symptoms:
            match_percentage = len(input_symptoms.intersection(disease_symptoms)) / len(disease_symptoms) * 100
        else:    
            match_percentage = 0

        output += f"▶ {disease} (Confidence: {prob:.1%}, Symptom Match: {match_percentage:.0f}%)\n"
        
        if missing_symptoms:
            output += "   Additional symptoms to look for:\n"
            output += "   • " + "\n   • ".join(missing_symptoms) + "\n"
        else:
            output += "   ✓ All typical symptoms present\n"
        output += "\n"
    
    output += "\nImportant Notes:\n"
    output += "• The presence of additional symptoms may increase or decrease the likelihood of each condition\n"
    output += "• Some symptoms may be more significant than others for specific conditions\n"
    output += "• This is not a medical diagnosis - please consult a healthcare professional\n"
    
    return output

In [19]:
# Save the model
model_components = {
    'model': rf_model,
    'vectorizer': vectorizer
}
joblib.dump(model_components, 'disease_prediction_model.joblib')


['disease_prediction_model.joblib']

** Model Testing **

In [20]:
# Verify model and vectorizer are loaded
print("Model loaded:", rf_model is not None)
print("Vectorizer loaded:", vectorizer is not None)

Model loaded: True
Vectorizer loaded: True


In [21]:
# 1. Basic Model Testing
def test_model():
    """Test the model with various symptom combinations"""
    print("Testing Disease Prediction Model")
    print("=" * 50)
    
    # Test cases
    test_cases = [
        "fever, cough, headache",
        "joint pain, swelling, redness",
        "nausea, vomiting, dizziness",
        "chest pain, shortness of breath",
        "headache, blurred vision, dizziness"
    ]
    
    for symptoms in test_cases:
        print(f"\nTest Case: {symptoms}")
        print("-" * 50)
        
        # Get predictions
        result = predict_disease(symptoms)
        print(result)
        print("\n")


In [22]:
# 2. Test Edge Cases
def test_edge_cases():
    """Test the model's handling of edge cases"""
    print("Testing Edge Cases")
    print("=" * 50)
    
    edge_cases = [
        "",  # Empty input
        "nonexistent symptom",  # Invalid symptom
        "fever",  # Single symptom
        "FEVER, COUGH, HEADACHE",  # All caps
        "fever,cough,headache",  # No spaces
        "   fever,    cough,   headache   "  # Extra spaces
    ]
    
    for case in edge_cases:
        print(f"\nEdge Case: '{case}'")
        print("-" * 50)
        try:
            result = predict_disease(case)
            print(result)
        except Exception as e:
            print(f"Error: {str(e)}")
        print("\n")


In [23]:
# 3. Test Real-World Scenarios
def test_real_scenarios():
    """Test with real-world symptom combinations"""
    print("Testing Real-World Scenarios")
    print("=" * 50)
    
    scenarios = [
        # Common cold symptoms
        "runny nose, sore throat, cough, mild fever",
        # Flu-like symptoms
        "high fever, body aches, fatigue, headache",
        # Allergic reaction
        "rash, itching, swelling, difficulty breathing",
        # Digestive issues
        "nausea, vomiting, abdominal pain, diarrhea"
    ]
    
    for scenario in scenarios:
        print(f"\nScenario: {scenario}")
        print("-" * 50)
        result = predict_disease(scenario)
        print(result)
        print("\n")


In [24]:
# Run all tests
print("Running Model Tests...")
print("\n1. Basic Model Testing")
test_model()

print("\n2. Edge Case Testing")
test_edge_cases()

print("\n3. Real-World Scenario Testing")
test_real_scenarios()

Running Model Tests...

1. Basic Model Testing
Testing Disease Prediction Model

Test Case: fever, cough, headache
--------------------------------------------------
Based on the symptoms you described:

Your symptoms: fever, cough, headache

Possible conditions and related symptoms to watch for:

▶ Toothache (Confidence: 35.0%, Symptom Match: 50%)
   Additional symptoms to look for:
   • pain
   • swelling

▶ Malaria (Confidence: 17.0%, Symptom Match: 60%)
   Additional symptoms to look for:
   • vomiting
   • weakness

▶ Vertigo (Confidence: 6.5%, Symptom Match: 25%)
   Additional symptoms to look for:
   • nausea
   • vomiting
   • dizziness

▶ Acetonemia (Confidence: 5.0%, Symptom Match: 25%)
   Additional symptoms to look for:
   • vomiting
   • agitation
   • fatigue

▶ Anthrax (Confidence: 5.0%, Symptom Match: 25%)
   Additional symptoms to look for:
   • chills
   • malaise
   • lethargy


Important Notes:
• The presence of additional symptoms may increase or decrease the likel

**Speech to Text**

In [25]:
# Add the speech-to-text function 
def speech_to_text(audio):
    """Convert speech to text using speech recognition"""
    try:
        r = sr.Recognizer()
        with sr.AudioFile(audio) as source:
            audio_text = r.listen(source)
            text = r.recognize_google(audio_text)
            return text
    except Exception as e:
        return str(e)

def preprocess_symptoms(text):
    """Clean and preprocess the input symptoms"""
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[\[\]\'"]', '', text)
    symptoms = [s.strip().lower() for s in text.split(',')]
    return ' '.join([s for s in symptoms if s])


**Text to Speech**

In [30]:
# Add the text-to-speechfunction here
def text_to_speech(text):
    tts = gTTS(text)
    tts.save("output.mp3")
    return "output.mp3"

iface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Text-to-Speech App",
    description="Enter text and generate speech using gTTS."
)



**ChatBot Function**

In [31]:
def chatbot(message, history):
    """Chatbot function for Gradio"""
    if not message:
        return "Please enter some symptoms."
    
    try:
        return predict_disease(message)
    except Exception as e:
        return f"Error processing symptoms: {str(e)}\nPlease enter symptoms separated by commas."

**Gradio Interface**

In [32]:
# Modified Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Medical Symptom Checker
    
    **How to use:**
    1. Enter your symptoms in the text box below, separated by commas
    2. View possible conditions and additional symptoms to watch for
    3. Use this information to have a more informed discussion with your healthcare provider
    
    **Example:** fever, cough, headache
    """)
    
    with gr.Row():
        with gr.Column():
            # Text input
            text_input = gr.Textbox(
                label="Enter symptoms",
                placeholder="Example: fever, cough, headache",
                lines=3
            )
            # Voice input
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Or describe symptoms by voice"
            )
            
            # Buttons
            with gr.Row():
                text_button = gr.Button("Check Symptoms")
                voice_button = gr.Button("Convert Voice to Text")
        
        with gr.Column():
            # Output with more lines for detailed information
            output = gr.Textbox(
                label="Analysis Results",
                lines=15
            )
    
    # Set up button click events
    text_button.click(
        fn=chatbot,
        inputs=[text_input],
        outputs=[output]
    )
    
    voice_button.click(
        fn=speech_to_text,
        inputs=[audio_input],
        outputs=[text_input]
    )
    
    # Examples
    gr.Examples(
        examples=[
            ["fever, cough, headache"],
            ["joint pain, swelling, stiffness"],
            ["nausea, vomiting, dizziness"]
        ],
        inputs=text_input
    )
    # Add detailed disclaimer
    gr.Markdown("""
    ### Important Notes:
    - This tool helps identify potential conditions based on symptoms
    - For each condition, additional symptoms are listed to help you better understand what to look for
    - The confidence score indicates how well your symptoms match known patterns
    - The symptom match percentage shows how many of the typical symptoms you're experiencing
    - This is not a diagnostic tool - always consult healthcare professionals for proper medical evaluation
    """)




In [33]:
# Launch the interface
if __name__ == "__main__":
    demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7869
* Running on public URL: https://9c326cf7619726e348.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


