In [None]:
# Credit to Professor Booth for help with this notebook!

In [51]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [52]:
# Read in the data
file_path = Path("full_data.csv")
df = pd.read_csv(file_path)

# Display sample data
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4962 entries, 0 to 4961
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 5.0+ MB


In [54]:
# Copy the DataFrame
df2 = df.copy()

print("Original DataFrame:")
print(df.shape)

print("\nNew DataFrame created from the copy:")
print(df2.shape)

Original DataFrame:
(4962, 133)

New DataFrame created from the copy:
(4962, 133)


In [55]:
# Drop columns with null values
df = df.dropna(axis=1, how='any')
print(df.shape)
df.head()

(4962, 133)


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [56]:
# Get the list of column names
column_names = df.columns.tolist()

print("Column names in the dataset:")
for column_name in column_names:
    print(column_name)

Column names in the dataset:
itching
skin_rash
nodal_skin_eruptions
continuous_sneezing
shivering
chills
joint_pain
stomach_pain
acidity
ulcers_on_tongue
muscle_wasting
vomiting
burning_micturition
spotting_ urination
fatigue
weight_gain
anxiety
cold_hands_and_feets
mood_swings
weight_loss
restlessness
lethargy
patches_in_throat
irregular_sugar_level
cough
high_fever
sunken_eyes
breathlessness
sweating
dehydration
indigestion
headache
yellowish_skin
dark_urine
nausea
loss_of_appetite
pain_behind_the_eyes
back_pain
constipation
abdominal_pain
diarrhoea
mild_fever
yellow_urine
yellowing_of_eyes
acute_liver_failure
fluid_overload
swelling_of_stomach
swelled_lymph_nodes
malaise
blurred_and_distorted_vision
phlegm
throat_irritation
redness_of_eyes
sinus_pressure
runny_nose
congestion
chest_pain
weakness_in_limbs
fast_heart_rate
pain_during_bowel_movements
pain_in_anal_region
bloody_stool
irritation_in_anus
neck_pain
dizziness
cramps
bruising
obesity
swollen_legs
swollen_blood_vessels
puffy_

In [57]:
# It looks like one or more of the symptoms are mispelled. Let's fix that!
df.rename(columns={'diarrhoea': 'diarrhea'}, inplace=True)
df.rename(columns={'spotting_ urination': 'spotty_urination'}, inplace=True)
df.rename(columns={'dischromic _patches': 'dischromic_patches'}, inplace=True)
# df.rename(columns={'fluid_overload.1': 'fluid_overload'}, inplace=True)
df.rename(columns={'foul_smell_of urine': 'foul_smell_of_urine'}, inplace=True)
df.rename(columns={'toxic_look_(typhos)': 'toxic_look'}, inplace=True)

df.drop(columns = ["fluid_overload.1"], inplace = True)

column_names = df.columns.tolist()

# Test for effect
print("Column names in the dataset:")
for column_name in column_names:
    print(column_name)

Column names in the dataset:
itching
skin_rash
nodal_skin_eruptions
continuous_sneezing
shivering
chills
joint_pain
stomach_pain
acidity
ulcers_on_tongue
muscle_wasting
vomiting
burning_micturition
spotty_urination
fatigue
weight_gain
anxiety
cold_hands_and_feets
mood_swings
weight_loss
restlessness
lethargy
patches_in_throat
irregular_sugar_level
cough
high_fever
sunken_eyes
breathlessness
sweating
dehydration
indigestion
headache
yellowish_skin
dark_urine
nausea
loss_of_appetite
pain_behind_the_eyes
back_pain
constipation
abdominal_pain
diarrhea
mild_fever
yellow_urine
yellowing_of_eyes
acute_liver_failure
fluid_overload
swelling_of_stomach
swelled_lymph_nodes
malaise
blurred_and_distorted_vision
phlegm
throat_irritation
redness_of_eyes
sinus_pressure
runny_nose
congestion
chest_pain
weakness_in_limbs
fast_heart_rate
pain_during_bowel_movements
pain_in_anal_region
bloody_stool
irritation_in_anus
neck_pain
dizziness
cramps
bruising
obesity
swollen_legs
swollen_blood_vessels
puffy_face

In [58]:
# List all the diseases in the dataset
print(df['prognosis'].tolist())

['Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Fungal infection', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'Allergy', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'GERD', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Chronic cholestasis', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Drug Reaction', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'Peptic ulcer diseae', 'P

In [59]:
# It looks like one or more of the diseases are misspelled. Let's fix that!
df.loc[df['prognosis'] == '(vertigo) Paroymsal  Positional Vertigo', 'prognosis'] = 'Paroymsal Positional Vertigo'
df.loc[df['prognosis'] == 'hepatitis A', 'prognosis'] = 'Hepatitis A'
df.loc[df['prognosis'] == 'Bronchial Asthma', 'prognosis'] = 'Asthma'

# Check the DataFrame to verify the change
print(df.prognosis.value_counts())

unique_prognosis_list2 = sorted(list(set(df['prognosis'])))
print(unique_prognosis_list2)

prognosis
Fungal infection                122
Hepatitis C                     121
Hepatitis E                     121
Alcoholic hepatitis             121
Tuberculosis                    121
Common Cold                     121
Pneumonia                       121
Dimorphic hemmorhoids(piles)    121
Heart attack                    121
Varicose veins                  121
Hypothyroidism                  121
Hyperthyroidism                 121
Hypoglycemia                    121
Osteoarthristis                 121
Arthritis                       121
Paroymsal Positional Vertigo    121
Acne                            121
Urinary tract infection         121
Psoriasis                       121
Hepatitis D                     121
Hepatitis B                     121
Allergy                         121
Hepatitis A                     121
GERD                            121
Chronic cholestasis             121
Drug Reaction                   121
Peptic ulcer diseae             121
AIDS              

In [60]:
# How many diseases are in the dataset?
# Back up the original dataset and copy a fresh one
df2 = df.copy()

# Remove duplicates in the "prognosis" column
unique_prognosis = df2['prognosis'].drop_duplicates()

# Count the number of unique diseases
num_unique_diseases = unique_prognosis.nunique()

print(f'The number of unique diseases in the "prognosis" column is: {num_unique_diseases}')

The number of unique diseases in the "prognosis" column is: 41


In [61]:
# Define features set
X = df.copy()
X.drop("prognosis", axis=1, inplace=True)
X.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Define target vector
y = df["prognosis"]
y[:5]

0    Fungal infection
1    Fungal infection
2    Fungal infection
3    Fungal infection
4    Fungal infection
Name: prognosis, dtype: object

In [63]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
print(X_train.shape)
print(X_test.shape)

(3721, 131)
(1241, 131)


In [64]:
def doClassification(model, X_train, X_test, y_train, y_test):
    # predict
    train_preds = model.predict(X_train)
    train_probs = model.predict_proba(X_train)

    test_preds = model.predict(X_test)
    test_probs = model.predict_proba(X_test)

    # evaluate train
    train_cr = classification_report(y_train, train_preds)
    train_cm = confusion_matrix(y_train, train_preds)
    roc_auc = roc_auc_score(y_train, train_probs, multi_class = "ovr")

    train_report = f"""
    Train Confusion Matrix: 
    {train_cm}

    Train Report: 
    {train_cr}
    
    AUC:
    {roc_auc}
    """
    print("TRAINING METRICS")
    print(train_report)
    print()



    # evaluate test
    test_cr = classification_report(y_test, test_preds)
    test_cm = confusion_matrix(y_test, test_preds)
    roc_auc = roc_auc_score(y_test, test_probs, multi_class = "ovr")


    test_report = f"""
    Test Confusion Matrix: 
    {test_cm}

    Test Report: 
    {test_cr}
    
    AUC:
    {roc_auc}
    """
    print("TESTING METRICS")
    print(test_report)
    print()



In [65]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, roc_auc_score
# initialize
dt = DecisionTreeClassifier(random_state=42)

# fit
dt.fit(X_train, y_train)

doClassification(dt, X_train, X_test, y_train, y_test)

TRAINING METRICS

    Train Confusion Matrix: 
    [[91  0  0 ...  0  0  0]
 [ 0 86  0 ...  0  0  0]
 [ 0  0 90 ...  0  0  0]
 ...
 [ 0  0  0 ... 93  0  0]
 [ 0  0  0 ...  0 87  0]
 [ 0  0  0 ...  0  0 94]]

    Train Report: 
                                  precision    recall  f1-score   support

                        AIDS       1.00      1.00      1.00        91
                        Acne       1.00      1.00      1.00        86
         Alcoholic hepatitis       1.00      1.00      1.00        90
                     Allergy       1.00      1.00      1.00        82
                   Arthritis       1.00      1.00      1.00        86
                      Asthma       1.00      1.00      1.00        97
        Cervical spondylosis       1.00      1.00      1.00        94
                 Chicken pox       1.00      1.00      1.00       102
         Chronic cholestasis       1.00      1.00      1.00        78
                 Common Cold       1.00      1.00      1.00        9

In [66]:
 # initialize
lr = LogisticRegression()

# fit
lr.fit(X_train, y_train)

doClassification(lr, X_train, X_test, y_train, y_test)

TRAINING METRICS

    Train Confusion Matrix: 
    [[91  0  0 ...  0  0  0]
 [ 0 86  0 ...  0  0  0]
 [ 0  0 90 ...  0  0  0]
 ...
 [ 0  0  0 ... 93  0  0]
 [ 0  0  0 ...  0 87  0]
 [ 0  0  0 ...  0  0 94]]

    Train Report: 
                                  precision    recall  f1-score   support

                        AIDS       1.00      1.00      1.00        91
                        Acne       1.00      1.00      1.00        86
         Alcoholic hepatitis       1.00      1.00      1.00        90
                     Allergy       1.00      1.00      1.00        82
                   Arthritis       1.00      1.00      1.00        86
                      Asthma       1.00      1.00      1.00        97
        Cervical spondylosis       1.00      1.00      1.00        94
                 Chicken pox       1.00      1.00      1.00       102
         Chronic cholestasis       1.00      1.00      1.00        78
                 Common Cold       1.00      1.00      1.00        9

In [67]:
# initialize
rf = RandomForestClassifier(random_state=42)

# fit
rf.fit(X_train, y_train)

doClassification(rf, X_train, X_test, y_train, y_test)

TRAINING METRICS

    Train Confusion Matrix: 
    [[91  0  0 ...  0  0  0]
 [ 0 86  0 ...  0  0  0]
 [ 0  0 90 ...  0  0  0]
 ...
 [ 0  0  0 ... 93  0  0]
 [ 0  0  0 ...  0 87  0]
 [ 0  0  0 ...  0  0 94]]

    Train Report: 
                                  precision    recall  f1-score   support

                        AIDS       1.00      1.00      1.00        91
                        Acne       1.00      1.00      1.00        86
         Alcoholic hepatitis       1.00      1.00      1.00        90
                     Allergy       1.00      1.00      1.00        82
                   Arthritis       1.00      1.00      1.00        86
                      Asthma       1.00      1.00      1.00        97
        Cervical spondylosis       1.00      1.00      1.00        94
                 Chicken pox       1.00      1.00      1.00       102
         Chronic cholestasis       1.00      1.00      1.00        78
                 Common Cold       1.00      1.00      1.00        9

In [68]:
  # initialize
knn = KNeighborsClassifier(n_neighbors=7)

# fit
knn.fit(X_train, y_train)

doClassification(knn, X_train.values, X_test.values, y_train, y_test)



TRAINING METRICS

    Train Confusion Matrix: 
    [[91  0  0 ...  0  0  0]
 [ 0 86  0 ...  0  0  0]
 [ 0  0 90 ...  0  0  0]
 ...
 [ 0  0  0 ... 93  0  0]
 [ 0  0  0 ...  0 87  0]
 [ 0  0  0 ...  0  0 94]]

    Train Report: 
                                  precision    recall  f1-score   support

                        AIDS       1.00      1.00      1.00        91
                        Acne       1.00      1.00      1.00        86
         Alcoholic hepatitis       1.00      1.00      1.00        90
                     Allergy       1.00      1.00      1.00        82
                   Arthritis       1.00      1.00      1.00        86
                      Asthma       1.00      1.00      1.00        97
        Cervical spondylosis       1.00      1.00      1.00        94
                 Chicken pox       1.00      1.00      1.00       102
         Chronic cholestasis       1.00      1.00      1.00        78
                 Common Cold       1.00      1.00      1.00        9

In [69]:
 # initialize
lgb = LGBMClassifier(random_state=42)

# fit
lgb.fit(X_train, y_train)

doClassification(lgb, X_train, X_test, y_train, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001954 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 3721, number of used features: 130
[LightGBM] [Info] Start training from score -3.710888
[LightGBM] [Info] Start training from score -3.767400
[LightGBM] [Info] Start training from score -3.721938
[LightGBM] [Info] Start training from score -3.815028
[LightGBM] [Info] Start training from score -3.767400
[LightGBM] [Info] Start training from score -3.647037
[LightGBM] [Info] Start training from score -3.678453
[LightGBM] [Info] Start training from score -3.596775
[LightGBM] [Info] Start training from score -3.865039
[LightGBM] [Info] Start training from score -3.721938
[LightGBM] [Info] Start training from score -3.721938
[LightGBM] [Info] Start training from score -3.755840












TRAINING METRICS

    Train Confusion Matrix: 
    [[91  0  0 ...  0  0  0]
 [ 0 86  0 ...  0  0  0]
 [ 0  0 90 ...  0  0  0]
 ...
 [ 0  0  0 ... 93  0  0]
 [ 0  0  0 ...  0 87  0]
 [ 0  0  0 ...  0  0 94]]

    Train Report: 
                                  precision    recall  f1-score   support

                        AIDS       1.00      1.00      1.00        91
                        Acne       1.00      1.00      1.00        86
         Alcoholic hepatitis       1.00      1.00      1.00        90
                     Allergy       1.00      1.00      1.00        82
                   Arthritis       1.00      1.00      1.00        86
                      Asthma       1.00      1.00      1.00        97
        Cervical spondylosis       1.00      1.00      1.00        94
                 Chicken pox       1.00      1.00      1.00       102
         Chronic cholestasis       1.00      1.00      1.00        78
                 Common Cold       1.00      1.00      1.00        9

In [71]:
# Let's go with Logistic Regression for it;s high accuracy and explainability
# initialize
final_model = LogisticRegression()

# fit
final_model.fit(X, y)

In [72]:
 import pickle

filename = 'final_model_lr.h5'
pickle.dump(final_model, open(filename, 'wb'))

In [73]:
df.to_csv("clean_data.csv", index = False)