In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read salary data
file_path = Path("Training.csv")
df = pd.read_csv(file_path)

# Display sample data
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,


In [3]:
# Copy the DataFrame
df2 = df.copy()

print("Original DataFrame:")
print(df)

print("\nNew DataFrame created from the copy:")
print(df2)

Original DataFrame:
      itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  \
0           1          1                     1                    0   
1           0          1                     1                    0   
2           1          0                     1                    0   
3           1          1                     0                    0   
4           1          1                     1                    0   
...       ...        ...                   ...                  ...   
4915        0          0                     0                    0   
4916        0          1                     0                    0   
4917        0          0                     0                    0   
4918        0          1                     0                    0   
4919        0          1                     0                    0   

      shivering  chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  \
0             0       0           0            

In [4]:
# Drop columns with null values
df = df.dropna(axis=1, how='any')

df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [5]:
# Get the list of column names
column_names = df.columns.tolist()

print("Column names in the dataset:")
for column_name in column_names:
    print(column_name)

Column names in the dataset:
itching
skin_rash
nodal_skin_eruptions
continuous_sneezing
shivering
chills
joint_pain
stomach_pain
acidity
ulcers_on_tongue
muscle_wasting
vomiting
burning_micturition
spotting_ urination
fatigue
weight_gain
anxiety
cold_hands_and_feets
mood_swings
weight_loss
restlessness
lethargy
patches_in_throat
irregular_sugar_level
cough
high_fever
sunken_eyes
breathlessness
sweating
dehydration
indigestion
headache
yellowish_skin
dark_urine
nausea
loss_of_appetite
pain_behind_the_eyes
back_pain
constipation
abdominal_pain
diarrhoea
mild_fever
yellow_urine
yellowing_of_eyes
acute_liver_failure
fluid_overload
swelling_of_stomach
swelled_lymph_nodes
malaise
blurred_and_distorted_vision
phlegm
throat_irritation
redness_of_eyes
sinus_pressure
runny_nose
congestion
chest_pain
weakness_in_limbs
fast_heart_rate
pain_during_bowel_movements
pain_in_anal_region
bloody_stool
irritation_in_anus
neck_pain
dizziness
cramps
bruising
obesity
swollen_legs
swollen_blood_vessels
puffy_

In [6]:
# It looks like one or more of the symptoms are mispelled. Let's fix that!
df.rename(columns={'diarrhoea': 'diarrhea'}, inplace=True)
df.rename(columns={'spotting_ urination': 'spotty_urination'}, inplace=True)
df.rename(columns={'dischromic _patches': 'dischromic_patches'}, inplace=True)
df.rename(columns={'fluid_overload.1': 'fluid_overload'}, inplace=True)

column_names = df.columns.tolist()

# Test for effect
print("Column names in the dataset:")
for column_name in column_names:
    print(column_name)

Column names in the dataset:
itching
skin_rash
nodal_skin_eruptions
continuous_sneezing
shivering
chills
joint_pain
stomach_pain
acidity
ulcers_on_tongue
muscle_wasting
vomiting
burning_micturition
spotty_urination
fatigue
weight_gain
anxiety
cold_hands_and_feets
mood_swings
weight_loss
restlessness
lethargy
patches_in_throat
irregular_sugar_level
cough
high_fever
sunken_eyes
breathlessness
sweating
dehydration
indigestion
headache
yellowish_skin
dark_urine
nausea
loss_of_appetite
pain_behind_the_eyes
back_pain
constipation
abdominal_pain
diarrhea
mild_fever
yellow_urine
yellowing_of_eyes
acute_liver_failure
fluid_overload
swelling_of_stomach
swelled_lymph_nodes
malaise
blurred_and_distorted_vision
phlegm
throat_irritation
redness_of_eyes
sinus_pressure
runny_nose
congestion
chest_pain
weakness_in_limbs
fast_heart_rate
pain_during_bowel_movements
pain_in_anal_region
bloody_stool
irritation_in_anus
neck_pain
dizziness
cramps
bruising
obesity
swollen_legs
swollen_blood_vessels
puffy_face

In [7]:
# Let's all the diseases in the dataset
print(df['prognosis'].tolist())

['Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'P

In [8]:
# Lets remove the duplicates and arrange the list aplhabetically
unique_prognosis_list2 = sorted(list(set(df['prognosis'])))
print(unique_prognosis_list2)

['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma', 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes ', 'Dimorphic hemmorhoids(piles)', 'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis', 'Paralysis (brain hemorrhage)', 'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis', 'Typhoid', 'Urinary tract infection', 'Varicose veins', 'hepatitis A']


In [28]:
# It looks like one or more of the diseases are mispelled. Let's fix that!
df.loc[df['prognosis'] == '(vertigo) Paroymsal  Positional Vertigo', 'prognosis'] = 'Paroymsal Positional Vertigo'
df.loc[df['prognosis'] == 'hepatitis A', 'prognosis'] = 'Hepatitis A'
df.loc[df['prognosis'] == 'Bronchial Asthma', 'prognosis'] = 'Asthma'

# Check the DataFrame to verify the change
print(df)

      itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  \
0           1          1                     1                    0   
1           0          1                     1                    0   
2           1          0                     1                    0   
3           1          1                     0                    0   
4           1          1                     1                    0   
...       ...        ...                   ...                  ...   
4915        0          0                     0                    0   
4916        0          1                     0                    0   
4917        0          0                     0                    0   
4918        0          1                     0                    0   
4919        0          1                     0                    0   

      shivering  chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  \
0             0       0           0             0        0         

In [29]:
# Let's all the diseases in the dataset and make sure that it worked
unique_prognosis_list2 = sorted(list(set(df['prognosis'])))
print(unique_prognosis_list2)

['AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Asthma', 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes ', 'Dimorphic hemmorhoids(piles)', 'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis', 'Paralysis (brain hemorrhage)', 'Paroymsal Positional Vertigo', 'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis', 'Typhoid', 'Urinary tract infection', 'Varicose veins']


In [11]:
# How many diseases are in the dataset?
# Back up the original dataset and copy a fresh one
df2 = df.copy()

# Remove duplicates in the "prognosis" column
unique_prognosis = df2['prognosis'].drop_duplicates()

# Count the number of unique diseases
num_unique_diseases = unique_prognosis.nunique()

print(f'The number of unique diseases in the "prognosis" column is: {num_unique_diseases}')

The number of unique diseases in the "prognosis" column is: 41


In [12]:
# Define features set
X = df.copy()
X.drop("prognosis", axis=1, inplace=True)
X.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Define target vector
y = df["prognosis"].values.reshape(-1, 1)
y[:5]

array([['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection']], dtype=object)

In [14]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [15]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

# Fitting the model
model = model.fit(X_train, y_train)

In [16]:
# Making predictions using the testing data
predictions = model.predict(X_test)

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm)

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
print("Accuracy Score:", acc_score)

Confusion Matrix:
[[24  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 43  0]
 [ 0  0  0 ...  0  0 33]]
Accuracy Score: 1.0


In [18]:
# Displaying results
print("Confusion Matrix")
display(cm)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


array([[24,  0,  0, ...,  0,  0,  0],
       [ 0, 30,  0, ...,  0,  0,  0],
       [ 0,  0, 24, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 26,  0,  0],
       [ 0,  0,  0, ...,  0, 43,  0],
       [ 0,  0,  0, ...,  0,  0, 33]], dtype=int64)

Accuracy Score : 1.0
Classification Report
                              precision    recall  f1-score   support

                        AIDS       1.00      1.00      1.00        24
                        Acne       1.00      1.00      1.00        30
         Alcoholic hepatitis       1.00      1.00      1.00        24
                     Allergy       1.00      1.00      1.00        34
                   Arthritis       1.00      1.00      1.00        25
            Bronchial Asthma       1.00      1.00      1.00        33
        Cervical spondylosis       1.00      1.00      1.00        29
                 Chicken pox       1.00      1.00      1.00        25
         Chronic cholestasis       1.00      1.00      1.00        32
                 Common Cold       1.00      1.00      1.00        39
                      Dengue       1.00      1.00      1.00        27
                   Diabetes        1.00      1.00      1.00        34
Dimorphic hemmorhoids(piles)       1.00      1

In [19]:
import h5py
import pickle
import base64

# Serialize the decision tree model to a string representation
model_str = pickle.dumps(model)
model_base64 = base64.b64encode(model_str)

# Save the base64-encoded model to an HDF5 file
with h5py.File('decision_tree_model.h5', 'w') as hf:
    hf.create_dataset('model', data=model_base64)

In [20]:
# The model is saved, now let's test it!

In [21]:
# Load the saved decision tree model from the HDF5 file
with h5py.File('decision_tree_model.h5', 'r') as hf:
    model_base64 = hf['model'][()]
    model_str = base64.b64decode(model_base64)
    loaded_model = pickle.loads(model_str)

In [22]:
# Load the testing data from the CSV file
testing_data = pd.read_csv('Testing.csv')

testing_data.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction


In [23]:
# Perform any necessary preprocessing steps on the testing data


In [24]:
# Get the list of column names
testing_column_names = testing_data.columns.tolist()

print("Column names in the dataset:")
for column_name in testing_column_names:
    print(column_name)

Column names in the dataset:
itching
skin_rash
nodal_skin_eruptions
continuous_sneezing
shivering
chills
joint_pain
stomach_pain
acidity
ulcers_on_tongue
muscle_wasting
vomiting
burning_micturition
spotting_ urination
fatigue
weight_gain
anxiety
cold_hands_and_feets
mood_swings
weight_loss
restlessness
lethargy
patches_in_throat
irregular_sugar_level
cough
high_fever
sunken_eyes
breathlessness
sweating
dehydration
indigestion
headache
yellowish_skin
dark_urine
nausea
loss_of_appetite
pain_behind_the_eyes
back_pain
constipation
abdominal_pain
diarrhoea
mild_fever
yellow_urine
yellowing_of_eyes
acute_liver_failure
fluid_overload
swelling_of_stomach
swelled_lymph_nodes
malaise
blurred_and_distorted_vision
phlegm
throat_irritation
redness_of_eyes
sinus_pressure
runny_nose
congestion
chest_pain
weakness_in_limbs
fast_heart_rate
pain_during_bowel_movements
pain_in_anal_region
bloody_stool
irritation_in_anus
neck_pain
dizziness
cramps
bruising
obesity
swollen_legs
swollen_blood_vessels
puffy_

In [25]:
# It looks like one or more of the symptoms are mispelled in the testing data. Let's fix that!
testing_data.rename(columns={'diarrhoea': 'diarrhea'}, inplace=True)
testing_data.rename(columns={'spotting_ urination': 'spotty_urination'}, inplace=True)
testing_data.rename(columns={'dischromic _patches': 'dischromic_patches'}, inplace=True)
testing_data.rename(columns={'fluid_overload.1': 'fluid_overload'}, inplace=True)

testing_column_names = testing_data.columns.tolist()

# Test for effect
print("Column names in the dataset:")
for column_name in testing_column_names:
    print(column_name)

Column names in the dataset:
itching
skin_rash
nodal_skin_eruptions
continuous_sneezing
shivering
chills
joint_pain
stomach_pain
acidity
ulcers_on_tongue
muscle_wasting
vomiting
burning_micturition
spotty_urination
fatigue
weight_gain
anxiety
cold_hands_and_feets
mood_swings
weight_loss
restlessness
lethargy
patches_in_throat
irregular_sugar_level
cough
high_fever
sunken_eyes
breathlessness
sweating
dehydration
indigestion
headache
yellowish_skin
dark_urine
nausea
loss_of_appetite
pain_behind_the_eyes
back_pain
constipation
abdominal_pain
diarrhea
mild_fever
yellow_urine
yellowing_of_eyes
acute_liver_failure
fluid_overload
swelling_of_stomach
swelled_lymph_nodes
malaise
blurred_and_distorted_vision
phlegm
throat_irritation
redness_of_eyes
sinus_pressure
runny_nose
congestion
chest_pain
weakness_in_limbs
fast_heart_rate
pain_during_bowel_movements
pain_in_anal_region
bloody_stool
irritation_in_anus
neck_pain
dizziness
cramps
bruising
obesity
swollen_legs
swollen_blood_vessels
puffy_face

In [26]:
# Extract the features from the testing data
new_X_test = testing_data.drop('prognosis', axis=1)

# Define target vector
new_y = testing_data["prognosis"].values.reshape(-1, 1)
new_y[:5]

# Fitting the model
model = model.fit(new_X_test,new_y )

# Make predictions using the loaded model
predictions = loaded_model.predict(new_X_test)


In [27]:
# Evaluate the model performance
# You can compare the predicted values with the actual target values in the testing dataset to evaluate the model performance
# Calculating the confusion matrix
cm2 = confusion_matrix(new_y, predictions)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm)

# Calculate the accuracy score
acc_score = accuracy_score(new_y, predictions)
print("Accuracy Score:", acc_score)


Confusion Matrix:
[[24  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 43  0]
 [ 0  0  0 ...  0  0 33]]
Accuracy Score: 0.9285714285714286
