In [140]:
#Data manipulation and preprocessing 
import pandas as pd
import numpy as np
#Data exploration
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
#Data balancing
from imblearn.over_sampling import RandomOverSampler
#Modelling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

#Model Evaluation
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix,classification_report,accuracy_score,roc_curve
import warnings
warnings.filterwarnings("ignore")

In [62]:
hiv = pd.read_excel('Fever_dataset.xlsx', sheet_name = 'HIV' ) #HIV data
hiv.head(2)

Unnamed: 0,S/N,Name,Age,Gender,Symptoms,Diagnosis
0,1,Amaka Garuba,57,Female,"Backache, Vomiting, Bitter Taste, Frequent Sto...",HIV
1,2,Ezekiel Okeke,41,Male,"Frequent Stooling, Bitter Taste, Weight Loss, ...",HIV


In [63]:
tb = pd.read_excel('Fever_dataset.xlsx', sheet_name = 'Tuberculosis') #Tuberculosis data
tb.head(2)

Unnamed: 0,S/N,Name,Age,Gender,Symptoms,Diagnosis
0,1,Chinonso Ogunleye,34,Male,"Night Sweats, Weight Loss, Fever, Chronic Cough",TB
1,2,Chioma Balogun,42,Male,"Chronic Cough, Night Sweats, Fever, Weight Loss",TB


In [64]:
malaria = pd.read_excel('Fever_dataset.xlsx', sheet_name = 'Malaria') #Malaria dataset
malaria.head(2)

Unnamed: 0,S/N,Name,Age,Gender,Symptoms,Diagnosis
0,1,Chioma Garuba,33,Female,"Fever, Bitterness of the Mouth, Anorexia, Mala...",Malaria
1,2,Adebayo Mohammed,66,Male,"Vomiting, Loss of Appetite, Bitterness of the ...",Malaria


In [65]:
herpes = pd.read_excel('Fever_dataset.xlsx', sheet_name = 'Hepatitis B') #Herpatitis B dataset
herpes.head(2)

Unnamed: 0,S/N,Name,Age,Gender,Symptoms,Diagnosis
0,1,Olamide Obi,18,Female,"Fever, Headache, Yellowing Skin and Eyes, Yell...",HBV
1,2,Ifeanyi Ogunleye,59,Male,"Headache, Yellowing Skin and Eyes, Fever, Fatigue",HBV


In [105]:
df = pd.concat([hiv,tb,malaria,herpes], ignore_index = True)
df.tail()

Unnamed: 0,S/N,Name,Age,Gender,Symptoms,Diagnosis
1195,296,Ibrahim Onyeka,29,Male,"Dark Urine, Yellow Skin, Yellowing Skin and Ey...",HBV
1196,297,Adesuwa Ogunleye,69,Female,"Headache, Yellow Skin, Yellowing Skin and Eyes...",HBV
1197,298,Oluwatosin Balogun,52,Male,"Fever, Muscle and Joint Pain, Headache, Fatigu...",HBV
1198,299,Blessing Ibrahim,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,300,Kelechi Akintola,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [106]:
df.drop(['S/N','Name'],axis = 1,inplace = True)
df.tail()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
1195,29,Male,"Dark Urine, Yellow Skin, Yellowing Skin and Ey...",HBV
1196,69,Female,"Headache, Yellow Skin, Yellowing Skin and Eyes...",HBV
1197,52,Male,"Fever, Muscle and Joint Pain, Headache, Fatigu...",HBV
1198,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [107]:
# Split symptoms into lists
df['Symptoms'] = df['Symptoms'].str.split(',').apply(lambda x: [symptom.strip() for symptom in x])

# Get a set of all unique symptoms
all_symptoms = set(symptom for sublist in df['Symptoms'] for symptom in sublist)

In [108]:
all_symptoms = list(all_symptoms) # Convert the set to a list
len(all_symptoms)

20

In [109]:
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"[Backache, Vomiting, Bitter Taste, Frequent St...",HIV
1,41,Male,"[Frequent Stooling, Bitter Taste, Weight Loss,...",HIV
2,31,Female,"[Cough, Rashes, Weight Loss, Vomiting, Fever]",HIV
3,36,Female,"[Weight Loss, Vomiting, Bitter Taste, Cough]",HIV
4,44,Male,"[Fever, Weight Loss, Rashes, Bitter Taste]",HIV


In [110]:
# Initialize a DataFrame for the binary matrix
binary_df = pd.DataFrame(0, index=df.index, columns=all_symptoms)

# Populate the binary matrix
for idx, symptoms in enumerate(df['Symptoms']):
    binary_df.loc[idx, symptoms] = 1

# Display the binary matrix
binary_df.head()

Unnamed: 0,Vomiting,Yellow Skin,Rashes,Bitterness of the Mouth,Bitter Taste,Loss of Appetite,Frequent Stooling,Fever,Weight Loss,Backache,Night Sweats,Dark Urine,Muscle and Joint Pain,Headache,Chronic Cough,Malaise,Fatigue,Cough,Anorexia,Yellowing Skin and Eyes
0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0


In [112]:
binary_df.iloc[3]

Vomiting                   1
Yellow Skin                0
Rashes                     0
Bitterness of the Mouth    0
Bitter Taste               1
Loss of Appetite           0
Frequent Stooling          0
Fever                      0
Weight Loss                1
Backache                   0
Night Sweats               0
Dark Urine                 0
Muscle and Joint Pain      0
Headache                   0
Chronic Cough              0
Malaise                    0
Fatigue                    0
Cough                      1
Anorexia                   0
Yellowing Skin and Eyes    0
Name: 3, dtype: int64

In [113]:
data = pd.concat([df.drop('Symptoms',axis = 1),binary_df],axis = 1)
data.tail(3)

Unnamed: 0,Age,Gender,Diagnosis,Vomiting,Yellow Skin,Rashes,Bitterness of the Mouth,Bitter Taste,Loss of Appetite,Frequent Stooling,...,Night Sweats,Dark Urine,Muscle and Joint Pain,Headache,Chronic Cough,Malaise,Fatigue,Cough,Anorexia,Yellowing Skin and Eyes
1197,52,Male,HBV,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,1
1198,51,Female,HBV,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
1199,30,Male,HBV,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1


In [116]:
data['Gender'] = data['Gender'].replace({'Male':0,'Female':1})
data.tail(3)

Unnamed: 0,Age,Gender,Diagnosis,Vomiting,Yellow Skin,Rashes,Bitterness of the Mouth,Bitter Taste,Loss of Appetite,Frequent Stooling,...,Night Sweats,Dark Urine,Muscle and Joint Pain,Headache,Chronic Cough,Malaise,Fatigue,Cough,Anorexia,Yellowing Skin and Eyes
1197,52,0,HBV,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,1
1198,51,1,HBV,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
1199,30,0,HBV,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1


In [131]:
data.duplicated().sum()
data = data.drop_duplicates()

In [117]:
data.columns

Index(['Age', 'Gender', 'Diagnosis', 'Vomiting', 'Yellow Skin', 'Rashes',
       'Bitterness of the Mouth', 'Bitter Taste', 'Loss of Appetite',
       'Frequent Stooling', 'Fever', 'Weight Loss', 'Backache', 'Night Sweats',
       'Dark Urine', 'Muscle and Joint Pain', 'Headache', 'Chronic Cough',
       'Malaise', 'Fatigue', 'Cough', 'Anorexia', 'Yellowing Skin and Eyes'],
      dtype='object')

In [132]:
x = data.drop('Diagnosis',axis = 1)
y = data['Diagnosis']

In [133]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 40)
print(len(x_train))
print(len(x_test))

853
214


In [134]:
x_train.head()

Unnamed: 0,Age,Gender,Vomiting,Yellow Skin,Rashes,Bitterness of the Mouth,Bitter Taste,Loss of Appetite,Frequent Stooling,Fever,...,Night Sweats,Dark Urine,Muscle and Joint Pain,Headache,Chronic Cough,Malaise,Fatigue,Cough,Anorexia,Yellowing Skin and Eyes
1072,64,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
701,29,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,1,0,0,1,0
43,54,0,0,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,1,0,0
911,54,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,1
380,49,0,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0


In [135]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
rfc_pred = rfc.predict(x_test)
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        47
         HIV       1.00      1.00      1.00        60
     Malaria       1.00      1.00      1.00        65
          TB       1.00      1.00      1.00        42

    accuracy                           1.00       214
   macro avg       1.00      1.00      1.00       214
weighted avg       1.00      1.00      1.00       214



In [136]:
knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
knn_pred = knn.predict(x_test)
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        47
         HIV       1.00      0.98      0.99        60
     Malaria       1.00      1.00      1.00        65
          TB       0.98      1.00      0.99        42

    accuracy                           1.00       214
   macro avg       0.99      1.00      0.99       214
weighted avg       1.00      1.00      1.00       214



In [137]:
data.columns

Index(['Age', 'Gender', 'Diagnosis', 'Vomiting', 'Yellow Skin', 'Rashes',
       'Bitterness of the Mouth', 'Bitter Taste', 'Loss of Appetite',
       'Frequent Stooling', 'Fever', 'Weight Loss', 'Backache', 'Night Sweats',
       'Dark Urine', 'Muscle and Joint Pain', 'Headache', 'Chronic Cough',
       'Malaise', 'Fatigue', 'Cough', 'Anorexia', 'Yellowing Skin and Eyes'],
      dtype='object')

In [138]:
cat_features = ['Gender', 'Vomiting', 'Yellow Skin', 'Rashes',
       'Bitterness of the Mouth', 'Bitter Taste', 'Loss of Appetite',
       'Frequent Stooling', 'Fever', 'Weight Loss', 'Backache', 'Night Sweats',
       'Dark Urine', 'Muscle and Joint Pain', 'Headache', 'Chronic Cough',
       'Malaise', 'Fatigue', 'Cough', 'Anorexia', 'Yellowing Skin and Eyes']

In [139]:
cat= CatBoostClassifier(iterations=500, 
                           depth=6, 
                           learning_rate=0.1, 
                           loss_function='MultiClass', 
                           verbose=True)
# Training the model
cat.fit(x_train, y_train, cat_features=cat_features, eval_set=(x_test, y_test), plot=True)
# Making predictions
cat_pred = cat.predict(x_test)
print(classification_report(y_test,cat_pred))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.1683985	test: 1.1675931	best: 1.1675931 (0)	total: 27.4ms	remaining: 13.7s
1:	learn: 1.0126088	test: 1.0200041	best: 1.0200041 (1)	total: 55.3ms	remaining: 13.8s
2:	learn: 0.8801469	test: 0.8876848	best: 0.8876848 (2)	total: 75.4ms	remaining: 12.5s
3:	learn: 0.7721161	test: 0.7816034	best: 0.7816034 (3)	total: 106ms	remaining: 13.2s
4:	learn: 0.6883264	test: 0.7006454	best: 0.7006454 (4)	total: 131ms	remaining: 13s
5:	learn: 0.6275166	test: 0.6333446	best: 0.6333446 (5)	total: 154ms	remaining: 12.7s
6:	learn: 0.5634694	test: 0.5685714	best: 0.5685714 (6)	total: 176ms	remaining: 12.4s
7:	learn: 0.5061571	test: 0.5172878	best: 0.5172878 (7)	total: 204ms	remaining: 12.5s
8:	learn: 0.4576704	test: 0.4686439	best: 0.4686439 (8)	total: 232ms	remaining: 12.6s
9:	learn: 0.4125988	test: 0.4238888	best: 0.4238888 (9)	total: 255ms	remaining: 12.5s
10:	learn: 0.3768463	test: 0.3877921	best: 0.3877921 (10)	total: 282ms	remaining: 12.5s
11:	learn: 0.3425298	test: 0.3534172	best: 0.353417

In [142]:
scores = cross_val_score(cat, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

0:	learn: 1.1674954	total: 18.1ms	remaining: 9.06s
1:	learn: 0.9904737	total: 38.7ms	remaining: 9.64s
2:	learn: 0.8811066	total: 56.5ms	remaining: 9.37s
3:	learn: 0.7770937	total: 74.3ms	remaining: 9.21s
4:	learn: 0.6948679	total: 92.3ms	remaining: 9.13s
5:	learn: 0.6168967	total: 110ms	remaining: 9.06s
6:	learn: 0.5532004	total: 128ms	remaining: 9.04s
7:	learn: 0.4989581	total: 149ms	remaining: 9.13s
8:	learn: 0.4503659	total: 171ms	remaining: 9.34s
9:	learn: 0.4096810	total: 188ms	remaining: 9.22s
10:	learn: 0.3729532	total: 203ms	remaining: 9.04s
11:	learn: 0.3393797	total: 218ms	remaining: 8.86s
12:	learn: 0.3097748	total: 239ms	remaining: 8.94s
13:	learn: 0.2850203	total: 258ms	remaining: 8.95s
14:	learn: 0.2586518	total: 277ms	remaining: 8.96s
15:	learn: 0.2360299	total: 293ms	remaining: 8.87s
16:	learn: 0.2201579	total: 303ms	remaining: 8.61s
17:	learn: 0.2025384	total: 317ms	remaining: 8.49s
18:	learn: 0.1874545	total: 333ms	remaining: 8.44s
19:	learn: 0.1748946	total: 348ms	re

In [145]:
scores = cross_val_score(rfc, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [1. 1. 1. 1. 1.]
Mean Accuracy: 1.0


In [146]:
scores = cross_val_score(knn, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [0.9953271  0.9953271  1.         0.99530516 0.99061033]
Mean Accuracy: 0.9953139397130448
