In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
data = pd.read_csv('training.csv')
data.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [3]:
X = data.drop('prognosis', axis=1)
y = data['prognosis']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Label Encoding
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

In [6]:
# Create individual models
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgboost_model = XGBClassifier(n_estimators=100, objective='multi:softprob', random_state=42)

In [7]:
# Train individual models on the scaled training data
random_forest_model.fit(X_train_scaled, y_train_encoded)
xgboost_model.fit(X_train_scaled, y_train_encoded)

In [8]:
# Make predictions using individual models
rf_predictions = random_forest_model.predict(X_test_scaled)
xgboost_predictions = xgboost_model.predict(X_test_scaled)

In [9]:
# Create an ensemble by majority voting
ensemble_predictions = np.round((rf_predictions + xgboost_predictions) / 2)

# Evaluate the ensemble accuracy
ensemble_accuracy = accuracy_score(y_test_encoded, ensemble_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 1.0


In [10]:
print("Random Forest Accuracy:", accuracy_score(y_test_encoded, rf_predictions))
print("XGBoost Accuracy:", accuracy_score(y_test_encoded, xgboost_predictions))
print("Ensemble Accuracy:", ensemble_accuracy)

Random Forest Accuracy: 1.0
XGBoost Accuracy: 1.0
Ensemble Accuracy: 1.0


In [11]:
# def predict_disease(symptoms, model, label_encoder):
#     # Preprocess the symptoms using the same preprocessing steps as before
#     symptoms = preprocess_symptoms(symptoms)
    
#     # Convert the preprocessed symptoms into a DataFrame
#     symptoms_df = pd.DataFrame(symptoms, columns=selected_features)
    
#     # Scale the symptom data using the same scaler
#     symptoms_scaled = scaler.transform(symptoms_df)
    
#     # Make predictions using the ensemble model
#     predictions = model.predict(symptoms_scaled)
    
#     # Convert encoded predictions back to disease names
#     predicted_diseases = label_encoder.inverse_transform(predictions)
    
#     return predicted_diseases


In [12]:
# # Example usage
# new_symptoms = ["vomiting", "fever", "fatigue"]
# predicted_diseases = predict_disease(new_symptoms, ensemble, le)
# print("Predicted Diseases:", predicted_diseases)

In [15]:
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Calculate the linkage matrix
linkage_matrix = linkage(X_train_scaled, method='ward')  # You can try different linkage methods

# Plot the dendrogram to visualize hierarchical clustering
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix, labels=data['prognosis'].values, orientation='top')
plt.title('Hierarchical Clustering Dendrogram')
plt.ylabel('Distance')
plt.show()


NameError: name 'y_train_scaled' is not defined

In [16]:
def predict_disease(symptoms):
    # Convert the input symptom names into a binary format using the column names
    input_data = pd.DataFrame([[1 if col in symptoms else 0 for col in X.columns]], columns=X.columns)
    
    # Standardize the input data using the same scaler as before
    input_data_scaled = scaler.transform(input_data)
    
    # Make predictions using individual models
    rf_prediction = random_forest_model.predict(input_data_scaled)
    xgboost_prediction = xgboost_model.predict(input_data_scaled)
    
    # Create an ensemble prediction by majority voting
    ensemble_prediction = np.round((rf_prediction + xgboost_prediction) / 2).astype(int)
    
    # Decode the predicted label using the label encoder
    # predicted_disease = [le.inverse_transform([rf_prediction]),le.inverse_transform([xgboost_prediction]),le.inverse_transform([ensemble_prediction])]
    predicted_disease = {
        'randomForset' : le.inverse_transform([rf_prediction]),
        'xgboost' : le.inverse_transform([xgboost_prediction]),
        'ensemble' : le.inverse_transform([ensemble_prediction])
    }
    
    return predicted_disease

# Example usage
input_symptoms = ["high_fever", "nausea", "pain_behind_the_eyes", 'headache']
predicted_disease = predict_disease(input_symptoms)
print("Predicted Disease:", predicted_disease)

Predicted Disease: {'randomForset': array(['Dengue'], dtype=object), 'xgboost': array(['Dengue'], dtype=object), 'ensemble': array(['Dengue'], dtype=object)}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
# Import the necessary libraries and functions

def main():
    print("========================================================\nWelcome to the Health Chatbot!")
    user_name = input("May I know your name? ")
    print(f"Hello, {user_name}!\n")

    # Initialize an empty list to store symptoms
    user_symptoms = []

    while True:
        symptom = input("Are you experiencing any symptoms? (yes/no): ")
        if symptom.lower() == "no":
            break
        elif symptom.lower() == "yes":
            # Ask about a specific symptom
            symptom_name = input("Please enter the symptom you're experiencing: ")
            print('-->',symptom_name)
            user_symptoms.append(symptom_name)
        else:
            print("Invalid input. Please respond with 'yes' or 'no'.")

    # Predict the disease based on collected symptoms
    predicted_disease = predict_disease(user_symptoms)

    print(f"Based on the symptoms you provided, the predicted disease is: {predicted_disease}")
    print("Thank you for using the Health Chatbot!\n========================================================")

if __name__ == "__main__":
    main()


Welcome to the Health Chatbot!


May I know your name?  kalsd


Hello, kalsd!



Are you experiencing any symptoms? (yes/no):  yes
Please enter the symptom you're experiencing:  skin_rash


--> skin_rash


Are you experiencing any symptoms? (yes/no):  yes
Please enter the symptom you're experiencing:  headache


--> headache


Are you experiencing any symptoms? (yes/no):  no


Based on the symptoms you provided, the predicted disease is: {'randomForset': array(['Paralysis (brain hemorrhage)'], dtype=object), 'xgboost': array(['Acne'], dtype=object), 'ensemble': array(['Gastroenteritis'], dtype=object)}
Thank you for using the Health Chatbot!


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
