In [14]:
# Importing libraries
import numpy as np
import pandas as pd
from scipy.stats import mode
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [15]:
# Reading the train.csv by removing the last column since it's an empty column
DATA_PATH = "/home/shailesh/dataset/Training.csv"
data = pd.read_csv(DATA_PATH).dropna(axis=1)

# Checking whether the dataset is balanced or not
disease_counts = data["prognosis"].value_counts()
temp_df = pd.DataFrame({
    "Disease": disease_counts.index,
    "Counts": disease_counts.values
})

plt.figure(figsize=(18, 8))
sns.barplot(x="Disease", y="Counts", data=temp_df)
plt.xticks(rotation=90)
plt.show()

  plt.show()


In [16]:
# Encoding the target value into numerical value using LabelEncoder
encoder = LabelEncoder()
data["prognosis"] = encoder.fit_transform(data["prognosis"])


X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

Train: (3936, 132), (3936,)
Test: (984, 132), (984,)


In [17]:
# Defining scoring metric for k-fold cross-validation
def cv_scoring(estimator, X, y):
    return accuracy_score(y, estimator.predict(X))


In [18]:
# Initializing Models
models = {
    "SVC": SVC(),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=18)
}


In [19]:
# Performing cross-validation and evaluating models
for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv=10, n_jobs=-1, scoring=cv_scoring)
    print("=" * 30)
    print(model_name)
    print(f"Scores: {scores}")
    print(f"Mean Score: {np.mean(scores)}")

SVC
Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean Score: 1.0
Gaussian NB
Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean Score: 1.0
Random Forest
Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean Score: 1.0


In [20]:
# Function for hyperparameter tuning using GridSearchCV
def tune_model(model, params, X, y):
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_}")
    return grid_search.best_estimator_

In [21]:
# Tuning hyperparameters for Random Forest Classifier
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, None]
}
best_rf_model = tune_model(RandomForestClassifier(random_state=18), rf_params, X_train, y_train)


Best Parameters: {'max_depth': 10, 'n_estimators': 200}
Best Score: 1.0


In [22]:
# Training and testing SVM Classifier
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

print(f"Accuracy on train data by SVM Classifier: {accuracy_score(y_train, svm_model.predict(X_train)) * 100}")
print(f"Accuracy on test data by SVM Classifier: {accuracy_score(y_test, svm_preds) * 100}")
print(classification_report(y_test, svm_preds))

Accuracy on train data by SVM Classifier: 100.0
Accuracy on test data by SVM Classifier: 100.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        32
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        29
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        25
          11       1.00      1.00      1.00        17
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        27
          14       1.00      1.00      1.00        20
          15       1.00      1.00      1

In [23]:
# Training and testing Naive Bayes Classifier
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

print(f"Accuracy on train data by Naive Bayes Classifier: {accuracy_score(y_train, nb_model.predict(X_train)) * 100}")
print(f"Accuracy on test data by Naive Bayes Classifier: {accuracy_score(y_test, nb_preds) * 100}")
print(classification_report(y_test, nb_preds))

Accuracy on train data by Naive Bayes Classifier: 100.0
Accuracy on test data by Naive Bayes Classifier: 100.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        32
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        29
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        25
          11       1.00      1.00      1.00        17
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        27
          14       1.00      1.00      1.00        20
          15       1.00 

In [24]:
# Training and testing Random Forest Classifier with tuned hyperparameters
best_rf_model.fit(X_train, y_train)
rf_preds = best_rf_model.predict(X_test)

print(f"Accuracy on train data by Random Forest Classifier: {accuracy_score(y_train, best_rf_model.predict(X_train)) * 100}")
print(f"Accuracy on test data by Random Forest Classifier: {accuracy_score(y_test, rf_preds) * 100}")
print(classification_report(y_test, rf_preds))


Accuracy on train data by Random Forest Classifier: 100.0
Accuracy on test data by Random Forest Classifier: 100.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        32
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        29
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        25
          11       1.00      1.00      1.00        17
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        27
          14       1.00      1.00      1.00        20
          15       1

In [25]:
# Training and testing Random Forest Classifier with tuned hyperparameters
best_rf_model.fit(X_train, y_train)
rf_preds = best_rf_model.predict(X_test)

print(f"Accuracy on train data by Random Forest Classifier: {accuracy_score(y_train, best_rf_model.predict(X_train)) * 100}")
print(f"Accuracy on test data by Random Forest Classifier: {accuracy_score(y_test, rf_preds) * 100}")
print(classification_report(y_test, rf_preds))


Accuracy on train data by Random Forest Classifier: 100.0
Accuracy on test data by Random Forest Classifier: 100.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        32
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        29
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        25
          11       1.00      1.00      1.00        17
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        27
          14       1.00      1.00      1.00        20
          15       1

In [26]:
# Function to predict disease based on selected symptoms
def predictDisease(selected_symptoms):
    # Creating input data for the models
    input_data = pd.DataFrame(columns=X.columns)
    input_data.loc[0] = 0
    for symptom in selected_symptoms:
        symptom = symptom.strip().lower()
        if symptom in input_data.columns:
            input_data[symptom] = 1

    # Generating individual outputs
    svm_prediction = data_dict["predictions_classes"][final_svm_model.predict(input_data)[0]]
    nb_prediction = data_dict["predictions_classes"][final_nb_model.predict(input_data)[0]]
    rf_prediction = data_dict["predictions_classes"][final_rf_model.predict(input_data)[0]]

    # Making final prediction by taking mode of all predictions
    final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
    predictions = f"SVM Model Prediction: {svm_prediction}\n" \
                  f"Naive Bayes Model Prediction: {nb_prediction}\n" \
                  f"Random Forest Model Prediction: {rf_prediction}\n" \
                  f"Final Prediction: {final_prediction}"
    return predictions

In [27]:
# Create a list of symptom options for the dropdown
symptom_options = [symptom.capitalize() for symptom in X.columns.values]


In [28]:
# Create a Gradio interface
import gradio as gr

iface = gr.Interface(
    fn=predictDisease,
    inputs=gr.inputs.CheckboxGroup(choices=symptom_options, label="Symptoms"),
    outputs=gr.outputs.Textbox(label="Predictions"),
    title="Disease Predictor"
)

# Launch the interface
iface.launch()

  super().__init__(


Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
  final_prediction = mode([svm_prediction, nb_prediction, rf_prediction])[0][0]
