In [38]:
#Data manipulation and preprocessing 
import pandas as pd
import numpy as np
#Data exploration
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
#Data balancing
from imblearn.over_sampling import RandomOverSampler
#Modelling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

#Model Evaluation
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix,classification_report,accuracy_score,roc_curve
import warnings
warnings.filterwarnings("ignore")

In [39]:
hiv = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'HIV' ) #HIV data
hiv.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,57,Female,"Backache, Vomiting, Bitter Taste, Frequent Sto...",HIV
1,2,41,Male,"Frequent Stooling, Bitter Taste, Weight Loss, ...",HIV


In [40]:
tb = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Tuberculosis') #Tuberculosis data
tb.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,34,Male,"Night Sweats, Weight Loss, Fever, Chronic Cough",TB
1,2,42,Male,"Chronic Cough, Night Sweats, Fever, Weight Loss",TB


In [41]:
malaria = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Malaria') #Malaria dataset
malaria.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,33,Female,"Fever, Bitterness of the Mouth, Anorexia, Mala...",Malaria
1,2,66,Male,"Vomiting, Loss of Appetite, Bitterness of the ...",Malaria


In [42]:
herpes = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Hepatitis B') #Herpatitis B dataset
herpes.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,18,Female,"Fever, Headache, Yellowing Skin and Eyes, Yell...",HBV
1,2,59,Male,"Headache, Yellowing Skin and Eyes, Fever, Fatigue",HBV


In [77]:
df = pd.concat([hiv,tb,malaria,herpes], ignore_index = True)
df.tail(5)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
1195,296,29,Male,"Dark Urine, Yellow Skin, Yellowing Skin and Ey...",HBV
1196,297,69,Female,"Headache, Yellow Skin, Yellowing Skin and Eyes...",HBV
1197,298,52,Male,"Fever, Muscle and Joint Pain, Headache, Fatigu...",HBV
1198,299,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,300,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [78]:
df.drop(['S/N'],axis = 1,inplace = True)
df.tail(2)

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
1198,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [79]:
df=df.drop_duplicates(ignore_index = True)
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"Backache, Vomiting, Bitter Taste, Frequent Sto...",HIV
1,41,Male,"Frequent Stooling, Bitter Taste, Weight Loss, ...",HIV
2,31,Female,"Cough, Rashes, Weight Loss, Vomiting, Fever",HIV
3,36,Female,"Weight Loss, Vomiting, Bitter Taste, Cough",HIV
4,44,Male,"Fever, Weight Loss, Rashes, Bitter Taste",HIV


In [80]:
# Split symptoms into lists
df['Symptoms'] = df['Symptoms'].str.split(',').apply(lambda x: [symptom.strip() for symptom in x])

# Get a set of all unique symptoms
all_symptoms = set(symptom for sublist in df['Symptoms'] for symptom in sublist)

In [81]:
all_symptoms = list(all_symptoms) # Convert the set to a list
len(all_symptoms)

20

In [82]:
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"[Backache, Vomiting, Bitter Taste, Frequent St...",HIV
1,41,Male,"[Frequent Stooling, Bitter Taste, Weight Loss,...",HIV
2,31,Female,"[Cough, Rashes, Weight Loss, Vomiting, Fever]",HIV
3,36,Female,"[Weight Loss, Vomiting, Bitter Taste, Cough]",HIV
4,44,Male,"[Fever, Weight Loss, Rashes, Bitter Taste]",HIV


In [83]:
#One hot encoding
# Initialize a DataFrame for the binary matrix
binary_df = pd.DataFrame(0, index=df.index, columns=all_symptoms)

# Populate the binary matrix
for idx, symptoms in enumerate(df['Symptoms']):
    binary_df.loc[idx, symptoms] = 1

# Display the binary matrix
binary_df.head()

Unnamed: 0,Yellow Skin,Fever,Bitter Taste,Yellowing Skin and Eyes,Rashes,Vomiting,Bitterness of the Mouth,Muscle and Joint Pain,Backache,Weight Loss,Dark Urine,Frequent Stooling,Cough,Chronic Cough,Malaise,Fatigue,Night Sweats,Loss of Appetite,Anorexia,Headache
0,0,0,1,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0
4,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [84]:
binary_df.iloc[3]

Yellow Skin                0
Fever                      0
Bitter Taste               1
Yellowing Skin and Eyes    0
Rashes                     0
Vomiting                   1
Bitterness of the Mouth    0
Muscle and Joint Pain      0
Backache                   0
Weight Loss                1
Dark Urine                 0
Frequent Stooling          0
Cough                      1
Chronic Cough              0
Malaise                    0
Fatigue                    0
Night Sweats               0
Loss of Appetite           0
Anorexia                   0
Headache                   0
Name: 3, dtype: int64

In [85]:
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"[Backache, Vomiting, Bitter Taste, Frequent St...",HIV
1,41,Male,"[Frequent Stooling, Bitter Taste, Weight Loss,...",HIV
2,31,Female,"[Cough, Rashes, Weight Loss, Vomiting, Fever]",HIV
3,36,Female,"[Weight Loss, Vomiting, Bitter Taste, Cough]",HIV
4,44,Male,"[Fever, Weight Loss, Rashes, Bitter Taste]",HIV


In [86]:
data = pd.concat([df.drop('Symptoms',axis = 1),binary_df],axis = 1)
data.tail(5)

Unnamed: 0,Age,Gender,Diagnosis,Yellow Skin,Fever,Bitter Taste,Yellowing Skin and Eyes,Rashes,Vomiting,Bitterness of the Mouth,...,Dark Urine,Frequent Stooling,Cough,Chronic Cough,Malaise,Fatigue,Night Sweats,Loss of Appetite,Anorexia,Headache
1185,29,Male,HBV,1,1,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1186,69,Female,HBV,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1187,52,Male,HBV,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1188,51,Female,HBV,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1189,30,Male,HBV,1,1,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [87]:
data['Gender'] = data['Gender'].replace({'Male':0,'Female':1})
data.tail(5)

Unnamed: 0,Age,Gender,Diagnosis,Yellow Skin,Fever,Bitter Taste,Yellowing Skin and Eyes,Rashes,Vomiting,Bitterness of the Mouth,...,Dark Urine,Frequent Stooling,Cough,Chronic Cough,Malaise,Fatigue,Night Sweats,Loss of Appetite,Anorexia,Headache
1185,29,0,HBV,1,1,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1186,69,1,HBV,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1187,52,0,HBV,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1188,51,1,HBV,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1189,30,0,HBV,1,1,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [88]:
x = data.drop('Diagnosis',axis = 1)
y = data['Diagnosis']

In [89]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 40)
print(len(x_train))
print(len(x_test))

952
238


In [90]:
x_train.head()

Unnamed: 0,Age,Gender,Yellow Skin,Fever,Bitter Taste,Yellowing Skin and Eyes,Rashes,Vomiting,Bitterness of the Mouth,Muscle and Joint Pain,...,Dark Urine,Frequent Stooling,Cough,Chronic Cough,Malaise,Fatigue,Night Sweats,Loss of Appetite,Anorexia,Headache
503,18,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
262,26,0,0,1,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
526,25,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
512,70,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
31,21,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [91]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
print(classification_report(y_test,knn_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        64
         HIV       1.00      1.00      1.00        54
     Malaria       1.00      1.00      1.00        66
          TB       1.00      1.00      1.00        54

    accuracy                           1.00       238
   macro avg       1.00      1.00      1.00       238
weighted avg       1.00      1.00      1.00       238



In [25]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        54
         HIV       1.00      1.00      1.00        59
     Malaria       1.00      1.00      1.00        72
          TB       1.00      1.00      1.00        55

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



In [26]:
data.columns

Index(['S/N', 'Age', 'Gender', 'Diagnosis', 'Yellow Skin', 'Fever',
       'Bitter Taste', 'Yellowing Skin and Eyes', 'Rashes', 'Vomiting',
       'Bitterness of the Mouth', 'Muscle and Joint Pain', 'Backache',
       'Weight Loss', 'Dark Urine', 'Frequent Stooling', 'Cough',
       'Chronic Cough', 'Malaise', 'Fatigue', 'Night Sweats',
       'Loss of Appetite', 'Anorexia', 'Headache'],
      dtype='object')

In [27]:
cat_features = ['Gender', 'Vomiting', 'Yellow Skin', 'Rashes',
       'Bitterness of the Mouth', 'Bitter Taste', 'Loss of Appetite',
       'Frequent Stooling', 'Fever', 'Weight Loss', 'Backache', 'Night Sweats',
       'Dark Urine', 'Muscle and Joint Pain', 'Headache', 'Chronic Cough',
       'Malaise', 'Fatigue', 'Cough', 'Anorexia', 'Yellowing Skin and Eyes']

In [28]:
cat= CatBoostClassifier(iterations=500, 
                           depth=6, 
                           learning_rate=0.1, 
                           loss_function='MultiClass', 
                           verbose=True)
# Training the model
cat.fit(x_train, y_train, cat_features=cat_features, eval_set=(x_test, y_test), plot=True)
# Making predictions
cat_pred = cat.predict(x_test)
print(classification_report(y_test,cat_pred))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.1542944	test: 1.1585680	best: 1.1585680 (0)	total: 186ms	remaining: 1m 32s
1:	learn: 0.9851983	test: 0.9978213	best: 0.9978213 (1)	total: 204ms	remaining: 50.9s
2:	learn: 0.8568488	test: 0.8703318	best: 0.8703318 (2)	total: 221ms	remaining: 36.7s
3:	learn: 0.7536996	test: 0.7671739	best: 0.7671739 (3)	total: 239ms	remaining: 29.6s
4:	learn: 0.6671400	test: 0.6831988	best: 0.6831988 (4)	total: 255ms	remaining: 25.3s
5:	learn: 0.5938718	test: 0.6107682	best: 0.6107682 (5)	total: 272ms	remaining: 22.4s
6:	learn: 0.5275975	test: 0.5434960	best: 0.5434960 (6)	total: 290ms	remaining: 20.4s
7:	learn: 0.4737735	test: 0.4906508	best: 0.4906508 (7)	total: 307ms	remaining: 18.9s
8:	learn: 0.4293652	test: 0.4444935	best: 0.4444935 (8)	total: 325ms	remaining: 17.7s
9:	learn: 0.3925621	test: 0.4078644	best: 0.4078644 (9)	total: 342ms	remaining: 16.7s
10:	learn: 0.3540733	test: 0.3694682	best: 0.3694682 (10)	total: 358ms	remaining: 15.9s
11:	learn: 0.3206635	test: 0.3352634	best: 0.335263

In [29]:
scores = cross_val_score(cat, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

0:	learn: 1.1791953	total: 11ms	remaining: 5.48s
1:	learn: 1.0066053	total: 29.1ms	remaining: 7.24s
2:	learn: 0.8714423	total: 48.4ms	remaining: 8.01s
3:	learn: 0.7641258	total: 218ms	remaining: 27s
4:	learn: 0.6751066	total: 322ms	remaining: 31.8s
5:	learn: 0.6059898	total: 359ms	remaining: 29.6s
6:	learn: 0.5486764	total: 400ms	remaining: 28.1s
7:	learn: 0.4922186	total: 438ms	remaining: 26.9s
8:	learn: 0.4443795	total: 477ms	remaining: 26s
9:	learn: 0.4018042	total: 549ms	remaining: 26.9s
10:	learn: 0.3642937	total: 580ms	remaining: 25.8s
11:	learn: 0.3293061	total: 611ms	remaining: 24.8s
12:	learn: 0.2988943	total: 635ms	remaining: 23.8s
13:	learn: 0.2753178	total: 660ms	remaining: 22.9s
14:	learn: 0.2519833	total: 682ms	remaining: 22.1s
15:	learn: 0.2291850	total: 703ms	remaining: 21.3s
16:	learn: 0.2087528	total: 724ms	remaining: 20.6s
17:	learn: 0.1924544	total: 752ms	remaining: 20.1s
18:	learn: 0.1771381	total: 777ms	remaining: 19.7s
19:	learn: 0.1635575	total: 799ms	remaining:

In [30]:
scores = cross_val_score(rfc, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [1. 1. 1. 1. 1.]
Mean Accuracy: 1.0


In [31]:
scores = cross_val_score(knn, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [0.275      0.25       0.27916667 0.24583333 0.24583333]
Mean Accuracy: 0.25916666666666666
