# In [7]:
import pandas as pd 
import numpy as np 
from tkinter import * 
from tkinter import ttk 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score 
import matplotlib.pyplot as plt 

# Import all necessary models and data mappings from the setup file
import import_ipynb
from data_setup import symptoms, index_to_disease, dt_clf, rf_clf, nb_clf, X_valid, y_valid, df

# --- Prediction Function ---
def predict_disease(model, input_symptoms):
    """Predict a disease name from a collection of reported symptoms.

    Builds a 0/1 feature row with one slot per entry of the module-level
    ``symptoms`` collection, marks the slots of the reported symptoms, and
    asks ``model`` for a prediction on that single row.

    Args:
        model: Object exposing ``predict``. It is called with a list that
            holds one feature row, and the first element of its result is
            taken as the predicted label.
        input_symptoms: Iterable of symptom names. Names that are not
            contained in ``symptoms`` are skipped silently.

    Returns:
        The ``index_to_disease`` entry for the predicted label, or
        ``"Not Found"`` when the label has no entry in that mapping.
    """
    # Start from "no symptom present" for every known symptom.
    features = [0] * len(symptoms)

    # Resolve each recognised symptom to its position via get_loc
    # (``symptoms`` is presumably a pandas Index of feature columns --
    # TODO confirm against data_setup).
    positions = [
        symptoms.get_loc(name)
        for name in input_symptoms
        if name in symptoms
    ]
    for position in positions:
        features[position] = 1

    # predict() is handed a 2-D structure: a batch containing this one row.
    predicted_label = model.predict([features])[0]
    return index_to_disease.get(predicted_label, "Not Found")

# --- GUI and Utility Functions ---

def show_graphs():
    """Show model accuracies and the most frequent diseases side by side.

    Left panel: bar chart of the accuracy of ``dt_clf``, ``rf_clf`` and
    ``nb_clf`` on ``X_valid`` / ``y_valid``.
    Right panel: pie chart of the six most frequent ``prognosis`` values
    found in ``Training.csv``.

    Side effects:
        Reads ``Training.csv`` (path is relative, so it is resolved against
        the current working directory) and displays the figure with
        ``plt.show()``.
    """
    # Accuracy of each model on the validation set, in the same order as
    # the bar labels below.
    models = ["Decision Tree", "Random Forest", "Naive Bayes"]
    scores = [
        accuracy_score(y_valid, clf.predict(X_valid))
        for clf in (dt_clf, rf_clf, nb_clf)
    ]

    # One figure, two separate axes. Each chart is drawn on its own axes so
    # the pie chart no longer lands on top of the bar chart.
    fig, (accuracy_ax, disease_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Subplot 1: Model Accuracy Comparison
    accuracy_ax.bar(models, scores, color=['red', 'green', 'purple'])
    accuracy_ax.set_title("Model Accuracy Comparison (Validation Set)")
    accuracy_ax.set_ylabel("Accuracy")
    accuracy_ax.set_ylim(0, 1)  # accuracy is a fraction in [0, 1]

    # Subplot 2: Top 6 Diseases in Training Set (as a Pie Chart)
    # The CSV is reloaded so that the disease names in the "prognosis"
    # column can be used as pie labels.
    df_original = pd.read_csv("Training.csv")
    # value_counts() sorts by frequency (descending), so head(6) is the
    # six most common diseases.
    top_diseases = df_original["prognosis"].value_counts().head(6)
    disease_ax.pie(
        top_diseases,
        labels=top_diseases.index,
        autopct='%1.1f%%',
        startangle=140,
    )
    disease_ax.set_title("Top 6 Diseases in Training Set")

    # Adjust layout and show the plot
    fig.tight_layout()
    plt.show()