In [2]:
#Data manipulation and preprocessing 
import pandas as pd
import numpy as np
#Data exploration
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
#Data balancing
from imblearn.over_sampling import RandomOverSampler
#Modelling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

#Model Evaluation
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix,classification_report,accuracy_score,roc_curve
import warnings
warnings.filterwarnings("ignore")

In [3]:
hiv = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'HIV' ) #HIV data
hiv.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,57,Female,"Backache, Vomiting, Bitter Taste, Frequent Sto...",HIV
1,2,41,Male,"Frequent Stooling, Bitter Taste, Weight Loss, ...",HIV


In [4]:
tb = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Tuberculosis') #Tuberculosis data
tb.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,34,Male,"Night Sweats, Weight Loss, Fever, Chronic Cough",TB
1,2,42,Male,"Chronic Cough, Night Sweats, Fever, Weight Loss",TB


In [5]:
malaria = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Malaria') #Malaria dataset
malaria.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,33,Female,"Fever, Bitterness of the Mouth, Anorexia, Mala...",Malaria
1,2,66,Male,"Vomiting, Loss of Appetite, Bitterness of the ...",Malaria


In [6]:
herpes = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Hepatitis B') #Herpatitis B dataset
herpes.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,18,Female,"Fever, Headache, Yellowing Skin and Eyes, Yell...",HBV
1,2,59,Male,"Headache, Yellowing Skin and Eyes, Fever, Fatigue",HBV


In [7]:
df = pd.concat([hiv,tb,malaria,herpes], ignore_index = True)
df.tail(5)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
1195,296,29,Male,"Dark Urine, Yellow Skin, Yellowing Skin and Ey...",HBV
1196,297,69,Female,"Headache, Yellow Skin, Yellowing Skin and Eyes...",HBV
1197,298,52,Male,"Fever, Muscle and Joint Pain, Headache, Fatigu...",HBV
1198,299,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,300,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [8]:
df.drop(['S/N'],axis = 1,inplace = True)
df.tail(2)

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
1198,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [9]:
#df=df.drop_duplicates(ignore_index = True)
#df.head()

In [10]:
# Split symptoms into lists
df['Symptoms'] = df['Symptoms'].str.split(',').apply(lambda x: [symptom.strip() for symptom in x])

# Get a set of all unique symptoms
all_symptoms = set(symptom for sublist in df['Symptoms'] for symptom in sublist)

In [11]:
all_symptoms = list(all_symptoms) # Convert the set to a list
len(all_symptoms)

20

In [12]:
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"[Backache, Vomiting, Bitter Taste, Frequent St...",HIV
1,41,Male,"[Frequent Stooling, Bitter Taste, Weight Loss,...",HIV
2,31,Female,"[Cough, Rashes, Weight Loss, Vomiting, Fever]",HIV
3,36,Female,"[Weight Loss, Vomiting, Bitter Taste, Cough]",HIV
4,44,Male,"[Fever, Weight Loss, Rashes, Bitter Taste]",HIV


In [13]:
#One hot encoding
# Initialize a DataFrame for the binary matrix
binary_df = pd.DataFrame(0, index=df.index, columns=all_symptoms)

# Populate the binary matrix
for idx, symptoms in enumerate(df['Symptoms']):
    binary_df.loc[idx, symptoms] = 1

# Display the binary matrix
binary_df.head()

Unnamed: 0,Frequent Stooling,Cough,Fatigue,Backache,Muscle and Joint Pain,Weight Loss,Yellowing Skin and Eyes,Night Sweats,Malaise,Chronic Cough,Headache,Bitter Taste,Yellow Skin,Vomiting,Dark Urine,Bitterness of the Mouth,Rashes,Loss of Appetite,Fever,Anorexia
0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
2,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0
3,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0


In [14]:
binary_df.iloc[3]

Frequent Stooling          0
Cough                      1
Fatigue                    0
Backache                   0
Muscle and Joint Pain      0
Weight Loss                1
Yellowing Skin and Eyes    0
Night Sweats               0
Malaise                    0
Chronic Cough              0
Headache                   0
Bitter Taste               1
Yellow Skin                0
Vomiting                   1
Dark Urine                 0
Bitterness of the Mouth    0
Rashes                     0
Loss of Appetite           0
Fever                      0
Anorexia                   0
Name: 3, dtype: int64

In [15]:
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"[Backache, Vomiting, Bitter Taste, Frequent St...",HIV
1,41,Male,"[Frequent Stooling, Bitter Taste, Weight Loss,...",HIV
2,31,Female,"[Cough, Rashes, Weight Loss, Vomiting, Fever]",HIV
3,36,Female,"[Weight Loss, Vomiting, Bitter Taste, Cough]",HIV
4,44,Male,"[Fever, Weight Loss, Rashes, Bitter Taste]",HIV


In [16]:
data = pd.concat([df.drop('Symptoms',axis = 1),binary_df],axis = 1)
data.tail(5)

Unnamed: 0,Age,Gender,Diagnosis,Frequent Stooling,Cough,Fatigue,Backache,Muscle and Joint Pain,Weight Loss,Yellowing Skin and Eyes,...,Headache,Bitter Taste,Yellow Skin,Vomiting,Dark Urine,Bitterness of the Mouth,Rashes,Loss of Appetite,Fever,Anorexia
1195,29,Male,HBV,0,0,1,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
1196,69,Female,HBV,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,1,0
1197,52,Male,HBV,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,1,0
1198,51,Female,HBV,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,1,0
1199,30,Male,HBV,0,0,1,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0


In [17]:
data['Gender'] = data['Gender'].replace({'Male':0,'Female':1})
data.tail(5)

Unnamed: 0,Age,Gender,Diagnosis,Frequent Stooling,Cough,Fatigue,Backache,Muscle and Joint Pain,Weight Loss,Yellowing Skin and Eyes,...,Headache,Bitter Taste,Yellow Skin,Vomiting,Dark Urine,Bitterness of the Mouth,Rashes,Loss of Appetite,Fever,Anorexia
1195,29,0,HBV,0,0,1,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
1196,69,1,HBV,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,1,0
1197,52,0,HBV,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,1,0
1198,51,1,HBV,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,1,0
1199,30,0,HBV,0,0,1,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0


In [18]:
x = data.drop('Diagnosis',axis = 1)
y = data['Diagnosis']

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 40)
print(len(x_train))
print(len(x_test))

960
240


In [20]:
x_train.head()

Unnamed: 0,Age,Gender,Frequent Stooling,Cough,Fatigue,Backache,Muscle and Joint Pain,Weight Loss,Yellowing Skin and Eyes,Night Sweats,...,Headache,Bitter Taste,Yellow Skin,Vomiting,Dark Urine,Bitterness of the Mouth,Rashes,Loss of Appetite,Fever,Anorexia
866,61,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1
652,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
975,38,0,0,0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0
318,56,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
31,21,0,0,1,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [22]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
print(classification_report(y_test,knn_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        54
         HIV       1.00      1.00      1.00        59
     Malaria       1.00      1.00      1.00        72
          TB       1.00      1.00      1.00        55

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



In [23]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        54
         HIV       1.00      1.00      1.00        59
     Malaria       1.00      1.00      1.00        72
          TB       1.00      1.00      1.00        55

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



In [24]:
data.columns

Index(['Age', 'Gender', 'Diagnosis', 'Frequent Stooling', 'Cough', 'Fatigue',
       'Backache', 'Muscle and Joint Pain', 'Weight Loss',
       'Yellowing Skin and Eyes', 'Night Sweats', 'Malaise', 'Chronic Cough',
       'Headache', 'Bitter Taste', 'Yellow Skin', 'Vomiting', 'Dark Urine',
       'Bitterness of the Mouth', 'Rashes', 'Loss of Appetite', 'Fever',
       'Anorexia'],
      dtype='object')

In [25]:
cat_features = ['Gender', 'Vomiting', 'Yellow Skin', 'Rashes',
       'Bitterness of the Mouth', 'Bitter Taste', 'Loss of Appetite',
       'Frequent Stooling', 'Fever', 'Weight Loss', 'Backache', 'Night Sweats',
       'Dark Urine', 'Muscle and Joint Pain', 'Headache', 'Chronic Cough',
       'Malaise', 'Fatigue', 'Cough', 'Anorexia', 'Yellowing Skin and Eyes']

In [27]:
cat= CatBoostClassifier(iterations=500, 
                           depth=6, 
                           learning_rate=0.1, 
                           loss_function='MultiClass', 
                           verbose=True)
# Training the model
cat.fit(x_train, y_train, cat_features=cat_features, eval_set=(x_test, y_test), plot=True)
# Making predictions
cat_pred = cat.predict(x_test)
print(classification_report(y_test,cat_pred))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.1574023	test: 1.1640663	best: 1.1640663 (0)	total: 16.5ms	remaining: 8.22s
1:	learn: 0.9860052	test: 0.9955249	best: 0.9955249 (1)	total: 28.6ms	remaining: 7.11s
2:	learn: 0.8589021	test: 0.8723254	best: 0.8723254 (2)	total: 35.7ms	remaining: 5.92s
3:	learn: 0.7673267	test: 0.7860056	best: 0.7860056 (3)	total: 43.8ms	remaining: 5.43s
4:	learn: 0.6751066	test: 0.6938083	best: 0.6938083 (4)	total: 53.4ms	remaining: 5.28s
5:	learn: 0.5962683	test: 0.6152296	best: 0.6152296 (5)	total: 61.9ms	remaining: 5.09s
6:	learn: 0.5383453	test: 0.5591777	best: 0.5591777 (6)	total: 68.7ms	remaining: 4.84s
7:	learn: 0.4837711	test: 0.5030666	best: 0.5030666 (7)	total: 74.5ms	remaining: 4.58s
8:	learn: 0.4388053	test: 0.4564021	best: 0.4564021 (8)	total: 81.4ms	remaining: 4.44s
9:	learn: 0.3953558	test: 0.4108000	best: 0.4108000 (9)	total: 91.2ms	remaining: 4.47s
10:	learn: 0.3579092	test: 0.3718834	best: 0.3718834 (10)	total: 97.9ms	remaining: 4.35s
11:	learn: 0.3250519	test: 0.3383328	best

In [28]:
scores = cross_val_score(cat, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

0:	learn: 1.1780697	total: 5.82ms	remaining: 2.9s
1:	learn: 1.0061782	total: 14.8ms	remaining: 3.69s
2:	learn: 0.8863481	total: 21.9ms	remaining: 3.63s
3:	learn: 0.7887559	total: 27.6ms	remaining: 3.42s
4:	learn: 0.6954874	total: 35.4ms	remaining: 3.5s
5:	learn: 0.6127261	total: 43.7ms	remaining: 3.6s
6:	learn: 0.5544383	total: 52.3ms	remaining: 3.68s
7:	learn: 0.4992361	total: 62ms	remaining: 3.81s
8:	learn: 0.4517530	total: 70.5ms	remaining: 3.84s
9:	learn: 0.4092681	total: 78.5ms	remaining: 3.85s
10:	learn: 0.3707855	total: 85.3ms	remaining: 3.79s
11:	learn: 0.3364043	total: 93.6ms	remaining: 3.8s
12:	learn: 0.3062223	total: 102ms	remaining: 3.83s
13:	learn: 0.2798961	total: 111ms	remaining: 3.86s
14:	learn: 0.2561687	total: 118ms	remaining: 3.8s
15:	learn: 0.2339450	total: 129ms	remaining: 3.9s
16:	learn: 0.2150943	total: 137ms	remaining: 3.9s
17:	learn: 0.1976701	total: 149ms	remaining: 3.98s
18:	learn: 0.1806930	total: 158ms	remaining: 4.01s
19:	learn: 0.1670881	total: 167ms	rema

In [29]:
scores = cross_val_score(rfc, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [1. 1. 1. 1. 1.]
Mean Accuracy: 1.0


In [30]:
scores = cross_val_score(knn, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [1.         0.99583333 1.         0.99583333 0.9875    ]
Mean Accuracy: 0.9958333333333333
