In [4]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [5]:
# reading the dataset
data = pd.read_csv('/workspaces/thyroid-disease-detection/dataset/hypothyroid.csv')

In [6]:
# shape of the dataset
data.shape

(3772, 30)

In [7]:
# displaying the first five rows of the dataset
data.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,P
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,P
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,P
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,P


In [8]:
# information about the dataset 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3772 non-null   object
 1   sex                        3772 non-null   object
 2   on thyroxine               3772 non-null   object
 3   query on thyroxine         3772 non-null   object
 4   on antithyroid medication  3772 non-null   object
 5   sick                       3772 non-null   object
 6   pregnant                   3772 non-null   object
 7   thyroid surgery            3772 non-null   object
 8   I131 treatment             3772 non-null   object
 9   query hypothyroid          3772 non-null   object
 10  query hyperthyroid         3772 non-null   object
 11  lithium                    3772 non-null   object
 12  goitre                     3772 non-null   object
 13  tumor                      3772 non-null   object
 14  hypopitu

In [9]:
# description of the dataset
data.describe()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
count,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,...,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772
unique,94,3,2,2,2,2,2,2,2,2,...,2,242,2,147,2,235,1,1,5,2
top,59,F,f,f,f,f,f,f,f,f,...,t,?,t,?,t,?,f,?,other,P
freq,95,2480,3308,3722,3729,3625,3719,3719,3713,3538,...,3541,231,3385,387,3387,385,3772,3772,2201,3481


In [10]:
# dropping duplicated records
data.drop_duplicates(inplace=True)

In [11]:
# label encoding the categorical features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in data.columns:
    if data[i].dtype == 'object':
        data[i] = le.fit_transform(data[i])

In [12]:
# scaling the dataset
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
columns_to_scale = ['age', 'TT4', 'T4U', 'FTI']
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

In [13]:
# splitting the dataset into dependent and independent features
X = data.drop(['binaryClass'], axis=1)
y = data['binaryClass']

In [14]:
# splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
# importing the machine learning model
from xgboost import XGBClassifier

In [16]:
# importing the evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [17]:
# creating a function to train and evaluate the models
def train_and_evaluate(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Accuracy Score: ', accuracy_score(y_test, y_pred))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
    print('Classification Report: \n', classification_report(y_test, y_pred))

In [18]:
# training and evaluating the model
print('XGBoost Classifier: \n')
train_and_evaluate(XGBClassifier())

XGBoost Classifier: 

Accuracy Score:  0.9913793103448276
Confusion Matrix: 
 [[ 60   5]
 [  3 860]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.92      0.94        65
           1       0.99      1.00      1.00       863

    accuracy                           0.99       928
   macro avg       0.97      0.96      0.97       928
weighted avg       0.99      0.99      0.99       928

