In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier

In [36]:
# Load the raw symptom / patient-profile table and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; adjust DATA_PATH
# when running the notebook elsewhere.
DATA_PATH = "D:/jup/Disease_symptom_and_patient_profile_dataset.csv"
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [37]:
# Summarise column names, dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               349 non-null    object
 1   Fever                 349 non-null    object
 2   Cough                 349 non-null    object
 3   Fatigue               349 non-null    object
 4   Difficulty Breathing  349 non-null    object
 5   Age                   349 non-null    int64 
 6   Gender                349 non-null    object
 7   Blood Pressure        349 non-null    object
 8   Cholesterol Level     349 non-null    object
 9   Outcome Variable      349 non-null    object
dtypes: int64(1), object(9)
memory usage: 27.4+ KB


In [38]:
# Count rows that are exact repeats (all columns equal) of an earlier row.
data.duplicated().sum()

49

In [39]:
# Keep only the first occurrence of every repeated row, then re-count the
# duplicates to confirm none are left.
is_repeat = data.duplicated(keep='first')
data = data[~is_repeat]
data.duplicated().sum()

0

In [40]:
# Renumber the rows 0..n-1 and discard the old index labels (drop=True keeps
# the old index from being added back as a column).
# Rebinding the name instead of using ``inplace=True`` leaves any other
# reference to the previous frame untouched and keeps the cell re-runnable.
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Eczema,Yes,No,No,No,25,Female,Normal,Normal,Positive


In [41]:
# Remove the 'Disease' column; only the remaining columns are carried forward.
# NOTE(review): duplicates were removed while 'Disease' was still present, so
# rows that differed only by disease are identical again after this step
# (rows 1 and 2 of the preview below). Confirm whether a second
# de-duplication is wanted before the train/test split.
data = data.drop('Disease', axis=1)
data.head()

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Yes,No,No,No,25,Female,Normal,Normal,Positive


In [42]:
# Encode into an independent copy so `data` keeps its original string labels.
data_encoded = data.copy(deep=True)
data_encoded.head()

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Yes,No,No,No,25,Female,Normal,Normal,Positive


In [43]:
# One (still unfitted) LabelEncoder per object-dtype column, keyed by column
# name, so each column's mapping can be looked up again later.
encoders = {}
for column_name in data.columns:
    if data[column_name].dtype == 'O':
        encoders[column_name] = LabelEncoder()
encoders

{'Fever': LabelEncoder(),
 'Cough': LabelEncoder(),
 'Fatigue': LabelEncoder(),
 'Difficulty Breathing': LabelEncoder(),
 'Gender': LabelEncoder(),
 'Blood Pressure': LabelEncoder(),
 'Cholesterol Level': LabelEncoder(),
 'Outcome Variable': LabelEncoder()}

In [44]:
# Fit each encoder on its column of `data` and store the resulting integer
# codes in the working copy; non-text columns are left as they are.
categorical_columns = [name for name in data.columns if data[name].dtype == 'O']
for name in categorical_columns:
    encoder = encoders[name]
    data_encoded[name] = encoder.fit_transform(data[name])
data_encoded.head()

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,1,0,1,1,19,0,1,2,1
1,0,1,1,0,25,0,2,2,0
2,0,1,1,0,25,0,2,2,0
3,1,1,0,1,25,1,2,2,1
4,1,0,0,0,25,0,2,2,1


In [45]:
# Separate the label column (y) from the feature columns (X).
TARGET = 'Outcome Variable'
y = data_encoded[TARGET]
X = data_encoded.drop(columns=[TARGET])

In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)
X_train, X_test, y_train, y_test = split_parts

In [47]:
# (rows, columns) of the training features.
X_train.shape

(240, 8)

In [48]:
# (rows, columns) of the held-out features.
X_test.shape

(60, 8)

In [49]:
# Hyper-parameters of the XGBoost classifier, gathered in one place so they
# are easy to read and tweak; the seed fixes the row/column subsampling.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 3,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.5,
    'random_state': 42,
}
xgb = XGBClassifier(**xgb_params)

In [50]:
# Train the XGBoost classifier on the training split.
xgb.fit(X_train, y_train)

In [51]:
# Score the fitted model on the held-out split.
xgb.score(X_test , y_test)

0.75

In [52]:
# Predict the held-out rows and compute the accuracy explicitly from the
# predicted labels.
y_pred = xgb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.75

In [53]:
# Predict the outcome for a new, previously unseen patient record

In [54]:
# Show the distinct labels of every text column, i.e. the values that are
# valid when a new record is written by hand.
text_columns = [name for name in data.columns if data[name].dtype == 'O']
for col in text_columns:
    print(f"{col}: {data[col].unique()}\n")

Fever: ['Yes' 'No']

Cough: ['No' 'Yes']

Fatigue: ['Yes' 'No']

Difficulty Breathing: ['Yes' 'No']

Gender: ['Female' 'Male']

Blood Pressure: ['Low' 'Normal' 'High']

Cholesterol Level: ['Normal' 'Low' 'High']

Outcome Variable: ['Positive' 'Negative']



In [55]:
# One hand-written patient record; the keys follow the feature columns of the
# training data, in the same order.
patient = {
    'Fever': 'No',
    'Cough': 'Yes',
    'Fatigue': 'No',
    'Difficulty Breathing': 'No',
    'Age': 50,
    'Gender': 'Male',
    'Blood Pressure': 'Low',
    'Cholesterol Level': 'High',
}
new_data = pd.DataFrame([patient])
new_data

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level
0,No,Yes,No,No,50,Male,Low,High


In [56]:
# Encode the new record with the encoders stored in `encoders` (transform
# only, no refitting). The target column is skipped because the new record
# has no outcome.
feature_text_columns = [
    name
    for name in data.columns
    if name != 'Outcome Variable' and data[name].dtype == 'O'
]
new_data_encoded = new_data.copy()
for name in feature_text_columns:
    new_data_encoded[name] = encoders[name].transform(new_data[name])
new_data_encoded

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level
0,0,1,0,0,50,1,1,0


In [57]:
# Predict the encoded outcome for the new record; the result is an array with
# one entry per input row.
new_pred = xgb.predict(new_data_encoded)
new_pred

array([0])

In [58]:
# Decode the prediction(s) with the fitted target encoder instead of comparing
# against a hard-coded 1: the integer that stands for 'Positive' is decided by
# the encoder, and `if array == 1` raises ValueError as soon as more than one
# row is predicted. Looping prints one message per predicted row.
predicted_labels = encoders['Outcome Variable'].inverse_transform(new_pred)
for predicted_label in predicted_labels:
    if predicted_label == 'Positive':
        print("🔴 You must consult a doctor")
    else:
        print("🟢 Normal")

🟢 Normal


In [59]:
# Rebuild the held-out rows (features + outcome) with their original string
# labels and print them one per line, so they can be checked by hand.
test_data = pd.concat([X_test, y_test], axis=1)
text_columns = [name for name in data.columns if data[name].dtype == 'O']
for name in text_columns:
    test_data[name] = encoders[name].inverse_transform(test_data[name])


def _quote_if_text(value):
    """Return ``value`` in single quotes when it is a str, otherwise ``str(value)``."""
    if isinstance(value, str):
        return f"'{value}'"
    return str(value)


for index, row in test_data.iterrows():
    rendered_row = ", ".join(_quote_if_text(value) for value in row.values)
    print(f"{index} : ", rendered_row)


273 :  'Yes', 'No', 'Yes', 'Yes', 65, 'Male', 'High', 'High', 'Positive'
68 :  'No', 'Yes', 'No', 'No', 35, 'Female', 'High', 'High', 'Positive'
115 :  'No', 'No', 'Yes', 'No', 40, 'Female', 'High', 'High', 'Positive'
256 :  'Yes', 'Yes', 'Yes', 'No', 60, 'Female', 'High', 'Normal', 'Positive'
74 :  'No', 'No', 'Yes', 'No', 35, 'Male', 'High', 'High', 'Positive'
184 :  'Yes', 'Yes', 'Yes', 'No', 50, 'Male', 'Normal', 'High', 'Negative'
207 :  'No', 'No', 'Yes', 'No', 50, 'Female', 'High', 'High', 'Positive'
103 :  'No', 'No', 'Yes', 'Yes', 40, 'Male', 'High', 'High', 'Negative'
241 :  'Yes', 'Yes', 'No', 'No', 60, 'Male', 'High', 'High', 'Negative'
72 :  'No', 'No', 'Yes', 'No', 35, 'Male', 'High', 'High', 'Positive'
149 :  'Yes', 'No', 'Yes', 'No', 45, 'Male', 'High', 'Normal', 'Negative'
55 :  'Yes', 'Yes', 'Yes', 'No', 35, 'Male', 'High', 'Normal', 'Negative'
157 :  'No', 'Yes', 'Yes', 'No', 45, 'Female', 'High', 'High', 'Negative'
15 :  'Yes', 'Yes', 'Yes', 'Yes', 29, 'Male', 'High

In [60]:
# Compare against AdaBoost: accuracy on the same train/test split

In [61]:
# AdaBoost classifier used as a comparison model; the seed keeps it repeatable.
# NOTE(review): confirm that the `algorithm` argument is accepted by the
# installed scikit-learn version.
adc_params = {
    'n_estimators': 50,
    'learning_rate': 0.1,
    'algorithm': 'SAMME',
    'random_state': 42,
}
adc = AdaBoostClassifier(**adc_params)

In [62]:
# Train the AdaBoost classifier on the same training split as the XGBoost model.
adc.fit(X_train, y_train)

In [63]:
# Score the AdaBoost model on the same held-out split, for comparison with XGBoost.
adc.score(X_test, y_test)

0.7