In [1]:
# Import Libraries and Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
from sklearn.model_selection import train_test_split    # To split the dataset into train and test sets
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder   # For scaling and encoding categorical variables
from sklearn.compose import ColumnTransformer           # To apply transformers to specific columns
from sklearn.impute import SimpleImputer                # For handling missing data
from sklearn.ensemble import RandomForestClassifier     # Random Forest Classifier
from sklearn.linear_model import LogisticRegression     # Logistic Regression
from sklearn.svm import SVC                            # Support Vector Classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score # For evaluating the model
import joblib
from xgboost import XGBClassifier

In [3]:
# Load the disease -> symptoms table from Resources/disease_symptoms.csv.
# The path is relative, so it is resolved against the current working directory.
disease_df = pd.read_csv(filepath_or_buffer='Resources/disease_symptoms.csv')

# Preview the first 5 rows of the dataframe
disease_df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/disease_symptoms.csv'

In [None]:
# Remove every row of the disease_df dataframe that has a null in any column
disease_df = disease_df.dropna(axis=0, how='any')

In [None]:
# Report the dimensions of disease_df as a (rows, columns) tuple
disease_df.shape

(108, 2)

In [None]:
# Check the data type pandas inferred for each column of disease_df
disease_df.dtypes

Disease     object
Symptoms    object
dtype: object

In [None]:
# Summary statistics for disease_df; for object (string) columns this reports
# count, number of unique values, the most frequent value and its frequency
disease_df.describe()

Unnamed: 0,Disease,Symptoms
count,108,108
unique,108,107
top,Yeast Infection,"pain, vaginal discharge, burning sensation, it..."
freq,1,2


In [None]:
# Print the index range, per-column non-null counts, dtypes and memory usage
disease_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Disease   108 non-null    object
 1   Symptoms  108 non-null    object
dtypes: object(2)
memory usage: 1.8+ KB


In [None]:
# Relabel the 'Disease' column as 'Prognosis' (rebinding the name rather than
# mutating in place), then preview the result
disease_df = disease_df.rename(columns={'Disease': 'Prognosis'})
disease_df.head()

Unnamed: 0,Prognosis,Symptoms
0,Malaria,"fever, headache, weakness, cough, vomiting"
1,Flu,"fever, headache, runny nose, muscle aches, fat..."
2,Measles,"fever, red rash, conjunctivitis, cough, rhinitis"
3,Pneumonia,"sputum production, fever, chest pain, cough, c..."
4,Tinnitus,"hissing, buzzing, clicking, roaring, ringing i..."


In [None]:
# Reshape disease_df to long form: one row per (Prognosis, symptom) pair.

# Step 1: Split the comma-separated Symptoms string and give each symptom its own row.
# Split on the bare comma and strip each token afterwards, so inconsistent spacing
# ("a,b" or "a ,  b") still yields clean symptom names instead of unsplit or padded strings.
disease_df = (
    disease_df
    .assign(Symptoms=disease_df['Symptoms'].str.split(','))  # string -> list of tokens
    .explode('Symptoms')                                     # list -> one row per token
)
disease_df['Symptoms'] = disease_df['Symptoms'].str.strip()

# Discard empty tokens left behind by stray or trailing commas
disease_df = disease_df[disease_df['Symptoms'].ne('')]

# Step 2: explode() repeats the source row's index label, so rebuild a clean 0..n-1 index
disease_df = disease_df.reset_index(drop=True)
disease_df.tail()

Unnamed: 0,Prognosis,Symptoms
506,Whiplash,neck pain
507,Yeast Infection,pain
508,Yeast Infection,vaginal discharge
509,Yeast Infection,burning sensation
510,Yeast Infection,itching


In [None]:
## Split Symptoms column into separate rows while keeping Prognosis intact
# disease_df = disease_df.set_index(['Prognosis']).stack().str.split(',', expand=True).stack().unstack(-2).reset_index(-1, drop=True).reset_index()
# disease_df.head()

In [None]:
# #split up the Symptoms column into individual columns
# disease_df = disease_df.join(disease_df['Symptoms'].str.split(',', expand=True).add_prefix('Symptom_')) 
# disease_df.head()

In [None]:
#make the Symptoms column into a list
# disease_df['Symptoms'] = disease_df['Symptoms'].str.split(',')

In [None]:
# #one hot encode the Symptoms column
# disease_df = pd.get_dummies(disease_df, columns=['Symptoms'])
# disease_df.head()

In [None]:
# Load the free-text symptom descriptions from Resources/Symptom2Disease.csv
# (relative path, resolved against the current working directory)
symptom_to_disease_df = pd.read_csv(filepath_or_buffer='Resources/Symptom2Disease.csv')

# Preview the last 5 rows of the dataframe
symptom_to_disease_df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,label,text
1195,295,diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,diabetes,I regularly experience these intense urges and...
1198,298,diabetes,"I have trouble breathing, especially outside. ..."
1199,299,diabetes,I constantly sneeze and have a dry cough. My i...


In [None]:
# Remove every row of the symptom_to_disease_df dataframe that has a null in any column,
# then preview the first rows
symptom_to_disease_df = symptom_to_disease_df.dropna(axis=0, how='any')
symptom_to_disease_df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [None]:
# Drop the 'Unnamed: 0' column from the symptom_to_disease dataframe.
# errors='ignore' plus reassignment keeps this cell safe to re-run: once the column
# is gone, a second execution is a no-op instead of raising KeyError.
symptom_to_disease_df = symptom_to_disease_df.drop(columns=['Unnamed: 0'], errors='ignore')
symptom_to_disease_df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [None]:
# Use the same column names as disease_df: label -> Prognosis, text -> Symptoms
symptom_to_disease_df = symptom_to_disease_df.rename(
    columns={
        'label': 'Prognosis',
        'text': 'Symptoms',
    }
)
symptom_to_disease_df.head()

Unnamed: 0,Prognosis,Symptoms
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [None]:
# Report the dimensions of symptom_to_disease_df as a (rows, columns) tuple
symptom_to_disease_df.shape

(1200, 2)

In [None]:
# Check the data type pandas inferred for each column of symptom_to_disease_df
symptom_to_disease_df.dtypes

Prognosis    object
Symptoms     object
dtype: object

In [None]:
# Summary statistics for symptom_to_disease_df; for object (string) columns this
# reports count, number of unique values, the most frequent value and its frequency
symptom_to_disease_df.describe()

Unnamed: 0,Prognosis,Symptoms
count,1200,1200
unique,24,1153
top,Psoriasis,"I've been feeling extremely scratchy, sick, an..."
freq,50,4


In [None]:
# Load the wide symptom-indicator training table from Resources/Final_Train_Data.csv
# (relative path, resolved against the current working directory)
final_train_df = pd.read_csv(filepath_or_buffer='Resources/Final_Train_Data.csv')

# Preview the first 5 rows of the dataframe
final_train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,pain chest,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat sweating increased,palpitation,...,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts,prognosis
0,0,0,1,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,hypertensive disease
1,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,diabetes
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"depression mental , depressive disorder"
3,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"coronary arteriosclerosis ,coronary heart disease"
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pneumonia


In [None]:
# Drop the 'Unnamed: 0' column from the final_train_df dataframe
# (presumably a leftover index column from when the CSV was written -- TODO confirm).
# errors='ignore' plus reassignment means re-running this cell does not raise KeyError.
final_train_df = final_train_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Use the prognosis column as the row index of final_train_df.
# Guarded so a second execution, when 'prognosis' is already the index and no longer
# a column, is a no-op instead of a KeyError.
if 'prognosis' in final_train_df.columns:
    final_train_df = final_train_df.set_index('prognosis')

# Display the first 5 rows of the dataframe
final_train_df.head()

Unnamed: 0_level_0,pain chest,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
prognosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hypertensive disease,0,1,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
diabetes,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
"depression mental , depressive disorder",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"coronary arteriosclerosis ,coronary heart disease",1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
pneumonia,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Remove every row of final_train_df that has a null in any column
# (rebinding the name rather than mutating in place), then preview the result
final_train_df = final_train_df.dropna()
final_train_df.head()

Unnamed: 0_level_0,pain chest,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
prognosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hypertensive disease,0,1,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
diabetes,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
"depression mental , depressive disorder",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"coronary arteriosclerosis ,coronary heart disease",1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
pneumonia,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Print the index range, column count, dtypes and memory usage of final_train_df
final_train_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2564 entries, hypertensive disease to decubitus ulcer
Columns: 400 entries, pain chest to homicidal thoughts
dtypes: int64(400)
memory usage: 7.8+ MB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# symptom column of final_train_df
final_train_df.describe()

Unnamed: 0,pain chest,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
count,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,...,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0,2564.0
mean,0.064353,0.124415,0.022621,0.074103,0.026911,0.050312,0.015991,0.040952,0.035101,0.060062,...,0.00351,0.00273,0.00195,0.00312,0.00273,0.00156,0.00195,0.00156,0.00117,0.00234
std,0.245428,0.330119,0.148721,0.261989,0.161855,0.218631,0.125463,0.198217,0.184072,0.237649,...,0.059154,0.052189,0.044125,0.055782,0.052189,0.039475,0.044125,0.039475,0.034193,0.048327
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Report the dimensions of final_train_df as a (rows, columns) tuple
final_train_df.shape

(2564, 400)