In [4]:
import pandas as pd

# Load the training dataset
df = pd.read_csv('./archive/Training.csv')

print(df.head(5))


   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   skin_peeling  silver_like_dusting  

In [5]:
# Assuming 'df' is your DataFrame with the loaded dataset
X = df.iloc[:, :-1]  # Features (symptoms)
y = df['prognosis']   # Target variable


In [6]:
# Assuming 'X' contains your features
X_encoded = pd.get_dummies(X)


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)


In [9]:
model.fit(X_train, y_train)


In [10]:
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 1.0

Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00      1.00      1.00

In [11]:
# Assuming 'model' is your trained Random Forest model
feature_importance = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': model.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance)


                           Feature  Importance
141  prognosis_Chronic cholestasis    0.019345
160             prognosis_Jaundice    0.018511
150         prognosis_Heart attack    0.018399
161              prognosis_Malaria    0.017206
147     prognosis_Fungal infection    0.016868
..                             ...         ...
17            cold_hands_and_feets    0.000999
76        drying_and_tingling_lips    0.000788
70             puffy_face_and_eyes    0.000759
69           swollen_blood_vessels    0.000494
45                  fluid_overload    0.000000

[173 rows x 2 columns]


In [12]:
# Assuming 'testing.csv' is your testing dataset
test_data = pd.read_csv('./archive/Testing.csv')


In [13]:
# Assuming 'model' is your trained Random Forest model
test_data_encoded = pd.get_dummies(test_data)  # Encode if needed
test_predictions = model.predict(test_data_encoded)

print("Predictions for testing data:")
print(test_predictions)


Predictions for testing data:
['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo'
 'Fungal infection']


In [14]:
# Assuming 'testing.csv' is your testing dataset
testing_data = pd.read_csv('./archive/Testing.csv')

# Extract true labels from the testing dataset
true_labels = testing_data['prognosis']

# Calculate accuracy and print the classification report
accuracy_test = accuracy_score(true_labels, test_predictions)
print("Accuracy on testing data:", accuracy_test)

print("\nClassification Report:")
print(classification_report(true_labels, test_predictions))


Accuracy on testing data: 1.0

Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00    

In [None]:
%pip install ipywidgets

In [41]:
import pandas as pd
from ipywidgets import interact_manual, Dropdown

# Load the training dataset
df = pd.read_csv('./archive/Training.csv')

# Exclude 'prognosis' from the symptom list
symptoms = df.columns[:-1].tolist()

# Create a function to make predictions based on selected symptoms
def predict_disease(**selected_symptoms):
    # Ensure the input data structure is consistent with the training data
    input_data = pd.DataFrame({symptom: [0] for symptom in symptoms})
    for symptom, value in selected_symptoms.items():
        if symptom != 'prognosis':
            input_data[symptom] = int(value)

    # Assuming 'model' is your trained RandomForestClassifier model
    # Preprocess the input and make predictions
    input_encoded = pd.get_dummies(input_data)

    # Ensure that the input data columns match the feature names used during training
    input_encoded = input_encoded.reindex(columns=model.feature_names_in_, fill_value=0)

    prediction = model.predict(input_encoded)[0]

    return f"The predicted disease based on symptoms is: {prediction}"

# Create an interactive dropdown for each symptom excluding 'prognosis'
interact_manual(predict_disease, **{symptom: Dropdown(options=[0, 1], description=symptom) for symptom in symptoms if symptom != 'prognosis'})


interactive(children=(Dropdown(description='itching', options=(0, 1), value=0), Dropdown(description='skin_ras…

<function __main__.predict_disease(**selected_symptoms)>