In [1]:
#Data manipulation and preprocessing 
import pandas as pd
import numpy as np
#Data exploration
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
#Data balancing
from imblearn.over_sampling import RandomOverSampler
#Modelling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

#Model Evaluation
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix,classification_report,accuracy_score,roc_curve
import warnings
warnings.filterwarnings("ignore")

In [2]:
hiv = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'HIV' ) #HIV data
hiv.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,57,Female,"Backache, Vomiting, Bitter Taste, Frequent Sto...",HIV
1,2,41,Male,"Frequent Stooling, Bitter Taste, Weight Loss, ...",HIV


In [3]:
tb = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Tuberculosis') #Tuberculosis data
tb.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,34,Male,"Night Sweats, Weight Loss, Fever, Chronic Cough",TB
1,2,42,Male,"Chronic Cough, Night Sweats, Fever, Weight Loss",TB


In [4]:
malaria = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Malaria') #Malaria dataset
malaria.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,33,Female,"Fever, Bitterness of the Mouth, Anorexia, Mala...",Malaria
1,2,66,Male,"Vomiting, Loss of Appetite, Bitterness of the ...",Malaria


In [5]:
herpes = pd.read_excel('Fever_dataset_copy.xlsx', sheet_name = 'Hepatitis B') #Herpatitis B dataset
herpes.head(2)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
0,1,18,Female,"Fever, Headache, Yellowing Skin and Eyes, Yell...",HBV
1,2,59,Male,"Headache, Yellowing Skin and Eyes, Fever, Fatigue",HBV


In [6]:
df = pd.concat([hiv,tb,malaria,herpes], ignore_index = True)
df.tail(5)

Unnamed: 0,S/N,Age,Gender,Symptoms,Diagnosis
1195,296,29,Male,"Dark Urine, Yellow Skin, Yellowing Skin and Ey...",HBV
1196,297,69,Female,"Headache, Yellow Skin, Yellowing Skin and Eyes...",HBV
1197,298,52,Male,"Fever, Muscle and Joint Pain, Headache, Fatigu...",HBV
1198,299,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,300,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [7]:
df.drop(['S/N'],axis = 1,inplace = True)
df.tail(2)

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
1198,51,Female,"Muscle and Joint Pain, Yellowing Skin and Eyes...",HBV
1199,30,Male,"Yellow Skin, Fever, Fatigue, Yellowing Skin an...",HBV


In [8]:
#df=df.drop_duplicates(ignore_index = True)
#df.head()

In [9]:
# Split symptoms into lists
df['Symptoms'] = df['Symptoms'].str.split(',').apply(lambda x: [symptom.strip() for symptom in x])

# Get a set of all unique symptoms
all_symptoms = set(symptom for sublist in df['Symptoms'] for symptom in sublist)

In [10]:
all_symptoms = list(all_symptoms) # Convert the set to a list
len(all_symptoms)

20

In [11]:
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"[Backache, Vomiting, Bitter Taste, Frequent St...",HIV
1,41,Male,"[Frequent Stooling, Bitter Taste, Weight Loss,...",HIV
2,31,Female,"[Cough, Rashes, Weight Loss, Vomiting, Fever]",HIV
3,36,Female,"[Weight Loss, Vomiting, Bitter Taste, Cough]",HIV
4,44,Male,"[Fever, Weight Loss, Rashes, Bitter Taste]",HIV


In [12]:
#One hot encoding
# Initialize a DataFrame for the binary matrix
binary_df = pd.DataFrame(0, index=df.index, columns=all_symptoms)

# Populate the binary matrix
for idx, symptoms in enumerate(df['Symptoms']):
    binary_df.loc[idx, symptoms] = 1

# Display the binary matrix
binary_df.head()

Unnamed: 0,Frequent Stooling,Rashes,Yellowing Skin and Eyes,Vomiting,Headache,Chronic Cough,Anorexia,Fatigue,Fever,Bitterness of the Mouth,Malaise,Muscle and Joint Pain,Loss of Appetite,Cough,Weight Loss,Backache,Yellow Skin,Night Sweats,Bitter Taste,Dark Urine
0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0
4,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0


In [13]:
binary_df.iloc[3]

Frequent Stooling          0
Rashes                     0
Yellowing Skin and Eyes    0
Vomiting                   1
Headache                   0
Chronic Cough              0
Anorexia                   0
Fatigue                    0
Fever                      0
Bitterness of the Mouth    0
Malaise                    0
Muscle and Joint Pain      0
Loss of Appetite           0
Cough                      1
Weight Loss                1
Backache                   0
Yellow Skin                0
Night Sweats               0
Bitter Taste               1
Dark Urine                 0
Name: 3, dtype: int64

In [14]:
df.head()

Unnamed: 0,Age,Gender,Symptoms,Diagnosis
0,57,Female,"[Backache, Vomiting, Bitter Taste, Frequent St...",HIV
1,41,Male,"[Frequent Stooling, Bitter Taste, Weight Loss,...",HIV
2,31,Female,"[Cough, Rashes, Weight Loss, Vomiting, Fever]",HIV
3,36,Female,"[Weight Loss, Vomiting, Bitter Taste, Cough]",HIV
4,44,Male,"[Fever, Weight Loss, Rashes, Bitter Taste]",HIV


In [15]:
data = pd.concat([df.drop('Symptoms',axis = 1),binary_df],axis = 1)
data.tail(5)

Unnamed: 0,Age,Gender,Diagnosis,Frequent Stooling,Rashes,Yellowing Skin and Eyes,Vomiting,Headache,Chronic Cough,Anorexia,...,Malaise,Muscle and Joint Pain,Loss of Appetite,Cough,Weight Loss,Backache,Yellow Skin,Night Sweats,Bitter Taste,Dark Urine
1195,29,Male,HBV,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1196,69,Female,HBV,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1197,52,Male,HBV,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1198,51,Female,HBV,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1199,30,Male,HBV,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [16]:
data['Gender'] = data['Gender'].replace({'Male':0,'Female':1})
data.tail(5)

Unnamed: 0,Age,Gender,Diagnosis,Frequent Stooling,Rashes,Yellowing Skin and Eyes,Vomiting,Headache,Chronic Cough,Anorexia,...,Malaise,Muscle and Joint Pain,Loss of Appetite,Cough,Weight Loss,Backache,Yellow Skin,Night Sweats,Bitter Taste,Dark Urine
1195,29,0,HBV,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1196,69,1,HBV,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1197,52,0,HBV,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1198,51,1,HBV,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1199,30,0,HBV,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [17]:
x = data.drop('Diagnosis',axis = 1)
y = data['Diagnosis']

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 40)
print(len(x_train))
print(len(x_test))

960
240


In [19]:
x_train.head()

Unnamed: 0,Age,Gender,Frequent Stooling,Rashes,Yellowing Skin and Eyes,Vomiting,Headache,Chronic Cough,Anorexia,Fatigue,...,Malaise,Muscle and Joint Pain,Loss of Appetite,Cough,Weight Loss,Backache,Yellow Skin,Night Sweats,Bitter Taste,Dark Urine
866,61,1,0,0,0,0,0,0,1,0,...,1,0,1,0,0,0,0,0,0,0
652,25,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
975,38,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
318,56,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
31,21,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,1,0


In [20]:
x_train.columns

Index(['Age', 'Gender', 'Frequent Stooling', 'Rashes',
       'Yellowing Skin and Eyes', 'Vomiting', 'Headache', 'Chronic Cough',
       'Anorexia', 'Fatigue', 'Fever', 'Bitterness of the Mouth', 'Malaise',
       'Muscle and Joint Pain', 'Loss of Appetite', 'Cough', 'Weight Loss',
       'Backache', 'Yellow Skin', 'Night Sweats', 'Bitter Taste',
       'Dark Urine'],
      dtype='object')

In [21]:
#sweating at night, a general feeling of discomfort

In [22]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
print(classification_report(y_test,knn_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        54
         HIV       1.00      1.00      1.00        59
     Malaria       1.00      1.00      1.00        72
          TB       1.00      1.00      1.00        55

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



In [23]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

         HBV       1.00      1.00      1.00        54
         HIV       1.00      1.00      1.00        59
     Malaria       1.00      1.00      1.00        72
          TB       1.00      1.00      1.00        55

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



In [26]:
knn_pred

array(['TB', 'Malaria', 'HBV', 'Malaria', 'TB', 'HIV', 'HIV', 'HBV', 'TB',
       'HBV', 'Malaria', 'HBV', 'Malaria', 'HIV', 'Malaria', 'Malaria',
       'Malaria', 'Malaria', 'HIV', 'TB', 'Malaria', 'HBV', 'Malaria',
       'Malaria', 'HBV', 'TB', 'TB', 'Malaria', 'HIV', 'Malaria', 'HIV',
       'HIV', 'HBV', 'Malaria', 'TB', 'Malaria', 'TB', 'HIV', 'TB',
       'Malaria', 'HIV', 'HIV', 'HBV', 'TB', 'HIV', 'HBV', 'Malaria',
       'Malaria', 'HBV', 'Malaria', 'TB', 'TB', 'TB', 'Malaria', 'HIV',
       'Malaria', 'HBV', 'TB', 'HIV', 'HBV', 'TB', 'HBV', 'HBV', 'HIV',
       'Malaria', 'HIV', 'TB', 'TB', 'TB', 'Malaria', 'TB', 'HBV',
       'Malaria', 'HIV', 'Malaria', 'HBV', 'Malaria', 'TB', 'Malaria',
       'HBV', 'HBV', 'HBV', 'HBV', 'HBV', 'TB', 'HIV', 'Malaria',
       'Malaria', 'HBV', 'HIV', 'Malaria', 'Malaria', 'TB', 'HBV', 'HBV',
       'TB', 'HBV', 'TB', 'HIV', 'Malaria', 'HBV', 'HIV', 'TB', 'TB',
       'Malaria', 'TB', 'HBV', 'HBV', 'HIV', 'HIV', 'TB', 'HIV', 'TB',
       '

In [25]:
rfc_pred

array(['TB', 'Malaria', 'HBV', 'Malaria', 'TB', 'HIV', 'HIV', 'HBV', 'TB',
       'HBV', 'Malaria', 'HBV', 'Malaria', 'HIV', 'Malaria', 'Malaria',
       'Malaria', 'Malaria', 'HIV', 'TB', 'Malaria', 'HBV', 'Malaria',
       'Malaria', 'HBV', 'TB', 'TB', 'Malaria', 'HIV', 'Malaria', 'HIV',
       'HIV', 'HBV', 'Malaria', 'TB', 'Malaria', 'TB', 'HIV', 'TB',
       'Malaria', 'HIV', 'HIV', 'HBV', 'TB', 'HIV', 'HBV', 'Malaria',
       'Malaria', 'HBV', 'Malaria', 'TB', 'TB', 'TB', 'Malaria', 'HIV',
       'Malaria', 'HBV', 'TB', 'HIV', 'HBV', 'TB', 'HBV', 'HBV', 'HIV',
       'Malaria', 'HIV', 'TB', 'TB', 'TB', 'Malaria', 'TB', 'HBV',
       'Malaria', 'HIV', 'Malaria', 'HBV', 'Malaria', 'TB', 'Malaria',
       'HBV', 'HBV', 'HBV', 'HBV', 'HBV', 'TB', 'HIV', 'Malaria',
       'Malaria', 'HBV', 'HIV', 'Malaria', 'Malaria', 'TB', 'HBV', 'HBV',
       'TB', 'HBV', 'TB', 'HIV', 'Malaria', 'HBV', 'HIV', 'TB', 'TB',
       'Malaria', 'TB', 'HBV', 'HBV', 'HIV', 'HIV', 'TB', 'HIV', 'TB',
       '

In [23]:
data.columns

Index(['Age', 'Gender', 'Diagnosis', 'Headache', 'Muscle and Joint Pain',
       'Frequent Stooling', 'Yellow Skin', 'Night Sweats', 'Fever', 'Vomiting',
       'Loss of Appetite', 'Fatigue', 'Anorexia', 'Weight Loss',
       'Bitter Taste', 'Cough', 'Yellowing Skin and Eyes', 'Malaise',
       'Chronic Cough', 'Backache', 'Dark Urine', 'Rashes',
       'Bitterness of the Mouth'],
      dtype='object')

In [27]:
cat_features = ['Gender', 'Vomiting', 'Yellow Skin', 'Rashes',
       'Bitterness of the Mouth', 'Bitter Taste', 'Loss of Appetite',
       'Frequent Stooling', 'Fever', 'Weight Loss', 'Backache', 'Night Sweats',
       'Dark Urine', 'Muscle and Joint Pain', 'Headache', 'Chronic Cough',
       'Malaise', 'Fatigue', 'Cough', 'Anorexia', 'Yellowing Skin and Eyes']

In [28]:
cat= CatBoostClassifier(iterations=500, 
                           depth=6, 
                           learning_rate=0.1, 
                           loss_function='MultiClass', 
                           verbose=True)
# Training the model
cat.fit(x_train, y_train, cat_features=cat_features, eval_set=(x_test, y_test), plot=True)
# Making predictions
cat_pred = cat.predict(x_test)
print(classification_report(y_test,cat_pred))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.1612172	test: 1.1597045	best: 1.1597045 (0)	total: 178ms	remaining: 1m 29s
1:	learn: 1.0022533	test: 0.9984672	best: 0.9984672 (1)	total: 197ms	remaining: 49.1s
2:	learn: 0.8739890	test: 0.8702799	best: 0.8702799 (2)	total: 216ms	remaining: 35.8s
3:	learn: 0.7618962	test: 0.7582269	best: 0.7582269 (3)	total: 236ms	remaining: 29.2s
4:	learn: 0.6774657	test: 0.6725363	best: 0.6725363 (4)	total: 255ms	remaining: 25.2s
5:	learn: 0.6032112	test: 0.6008142	best: 0.6008142 (5)	total: 274ms	remaining: 22.5s
6:	learn: 0.5422613	test: 0.5429444	best: 0.5429444 (6)	total: 294ms	remaining: 20.7s
7:	learn: 0.4854244	test: 0.4887056	best: 0.4887056 (7)	total: 315ms	remaining: 19.3s
8:	learn: 0.4379987	test: 0.4418673	best: 0.4418673 (8)	total: 337ms	remaining: 18.4s
9:	learn: 0.3956680	test: 0.3996826	best: 0.3996826 (9)	total: 358ms	remaining: 17.5s
10:	learn: 0.3579744	test: 0.3637370	best: 0.3637370 (10)	total: 592ms	remaining: 26.3s
11:	learn: 0.3247602	test: 0.3300750	best: 0.330075

In [26]:
scores = cross_val_score(cat, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

0:	learn: 1.1452947	total: 24.4ms	remaining: 12.2s
1:	learn: 0.9902319	total: 47.5ms	remaining: 11.8s
2:	learn: 0.8873942	total: 92.6ms	remaining: 15.3s
3:	learn: 0.7836288	total: 121ms	remaining: 15s
4:	learn: 0.6983244	total: 147ms	remaining: 14.5s
5:	learn: 0.6171983	total: 173ms	remaining: 14.2s
6:	learn: 0.5511940	total: 219ms	remaining: 15.5s
7:	learn: 0.4944180	total: 273ms	remaining: 16.8s
8:	learn: 0.4435991	total: 321ms	remaining: 17.5s
9:	learn: 0.4021203	total: 349ms	remaining: 17.1s
10:	learn: 0.3659870	total: 381ms	remaining: 16.9s
11:	learn: 0.3328541	total: 406ms	remaining: 16.5s
12:	learn: 0.3022789	total: 430ms	remaining: 16.1s
13:	learn: 0.2765938	total: 453ms	remaining: 15.7s
14:	learn: 0.2554059	total: 492ms	remaining: 15.9s
15:	learn: 0.2327389	total: 564ms	remaining: 17.1s
16:	learn: 0.2135934	total: 611ms	remaining: 17.3s
17:	learn: 0.1957278	total: 637ms	remaining: 17.1s
18:	learn: 0.1799888	total: 661ms	remaining: 16.7s
19:	learn: 0.1667228	total: 697ms	remain

In [29]:
scores = cross_val_score(rfc, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [1. 1. 1. 1. 1.]
Mean Accuracy: 1.0


In [30]:
scores = cross_val_score(knn, x, y, cv=5)
print(f"Accuracy Scores for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

Accuracy Scores for each fold: [1.         0.99583333 1.         0.99583333 0.9875    ]
Mean Accuracy: 0.9958333333333333


In [30]:
import pickle 
pickle.dump(cat,open('model/model.pkl','wb'))
