In [50]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report,f1_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [4]:
df = Thy_Orig_Df = pd. read_csv("hypothyroid.csv")

In [5]:
Thy_Orig_Df.shape

(3772, 30)

In [6]:
df.columns

Index(['age', 'sex', 'on thyroxine', 'query on thyroxine',
       'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery',
       'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
       'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
       'FTI measured', 'FTI', 'TBG measured', 'TBG', 'referral source',
       'binaryClass'],
      dtype='object')

In [7]:
df['binaryClass'].unique()

array(['P', 'N'], dtype=object)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3772 non-null   object
 1   sex                        3772 non-null   object
 2   on thyroxine               3772 non-null   object
 3   query on thyroxine         3772 non-null   object
 4   on antithyroid medication  3772 non-null   object
 5   sick                       3772 non-null   object
 6   pregnant                   3772 non-null   object
 7   thyroid surgery            3772 non-null   object
 8   I131 treatment             3772 non-null   object
 9   query hypothyroid          3772 non-null   object
 10  query hyperthyroid         3772 non-null   object
 11  lithium                    3772 non-null   object
 12  goitre                     3772 non-null   object
 13  tumor                      3772 non-null   object
 14  hypopitu

In [9]:
df.describe()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
count,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,...,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772
unique,94,3,2,2,2,2,2,2,2,2,...,2,242,2,147,2,235,1,1,5,2
top,59,F,f,f,f,f,f,f,f,f,...,t,?,t,?,t,?,f,?,other,P
freq,95,2480,3308,3722,3729,3625,3719,3719,3713,3538,...,3541,231,3385,387,3387,385,3772,3772,2201,3481


In [10]:
#####Data Cleaning#####

In [11]:
df.replace('?', np.nan, inplace = True)

In [12]:
df.drop(['T3 measured','TSH measured','TT4 measured','T4U measured','FTI measured','TBG measured','TBG','referral source','on thyroxine','query on thyroxine','on antithyroid medication','query hypothyroid', 'query hyperthyroid','hypopituitary', 'psych'],axis=1,inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              3771 non-null   object
 1   sex              3622 non-null   object
 2   sick             3772 non-null   object
 3   pregnant         3772 non-null   object
 4   thyroid surgery  3772 non-null   object
 5   I131 treatment   3772 non-null   object
 6   lithium          3772 non-null   object
 7   goitre           3772 non-null   object
 8   tumor            3772 non-null   object
 9   TSH              3403 non-null   object
 10  T3               3003 non-null   object
 11  TT4              3541 non-null   object
 12  T4U              3385 non-null   object
 13  FTI              3387 non-null   object
 14  binaryClass      3772 non-null   object
dtypes: object(15)
memory usage: 442.2+ KB


In [14]:
cols = ['age','FTI','TSH','T3','TT4','T4U']
for i in cols:
    df[i] = pd.to_numeric(df[i])

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              3771 non-null   float64
 1   sex              3622 non-null   object 
 2   sick             3772 non-null   object 
 3   pregnant         3772 non-null   object 
 4   thyroid surgery  3772 non-null   object 
 5   I131 treatment   3772 non-null   object 
 6   lithium          3772 non-null   object 
 7   goitre           3772 non-null   object 
 8   tumor            3772 non-null   object 
 9   TSH              3403 non-null   float64
 10  T3               3003 non-null   float64
 11  TT4              3541 non-null   float64
 12  T4U              3385 non-null   float64
 13  FTI              3387 non-null   float64
 14  binaryClass      3772 non-null   object 
dtypes: float64(6), object(9)
memory usage: 442.2+ KB


In [16]:
#Handling Missing values

In [17]:
miss_cols = ['FTI','TSH','T3','TT4','T4U']
for i in miss_cols:
    df[i] = df[i].fillna(df[i].mean())

In [18]:
df['sex'] = df['sex'].fillna('f')

In [19]:
df.isnull().sum()

age                1
sex                0
sick               0
pregnant           0
thyroid surgery    0
I131 treatment     0
lithium            0
goitre             0
tumor              0
TSH                0
T3                 0
TT4                0
T4U                0
FTI                0
binaryClass        0
dtype: int64

In [20]:
df.dropna(inplace = True)

In [21]:
df.isnull().sum()

age                0
sex                0
sick               0
pregnant           0
thyroid surgery    0
I131 treatment     0
lithium            0
goitre             0
tumor              0
TSH                0
T3                 0
TT4                0
T4U                0
FTI                0
binaryClass        0
dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3771 entries, 0 to 3771
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              3771 non-null   float64
 1   sex              3771 non-null   object 
 2   sick             3771 non-null   object 
 3   pregnant         3771 non-null   object 
 4   thyroid surgery  3771 non-null   object 
 5   I131 treatment   3771 non-null   object 
 6   lithium          3771 non-null   object 
 7   goitre           3771 non-null   object 
 8   tumor            3771 non-null   object 
 9   TSH              3771 non-null   float64
 10  T3               3771 non-null   float64
 11  TT4              3771 non-null   float64
 12  T4U              3771 non-null   float64
 13  FTI              3771 non-null   float64
 14  binaryClass      3771 non-null   object 
dtypes: float64(6), object(9)
memory usage: 471.4+ KB


In [23]:
#One hot encoder
df['sex'].dtype

dtype('O')

In [24]:
df.columns

Index(['age', 'sex', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment',
       'lithium', 'goitre', 'tumor', 'TSH', 'T3', 'TT4', 'T4U', 'FTI',
       'binaryClass'],
      dtype='object')

In [25]:
#To find categorical columns
cat_cols = []
for col in df.columns:
    if df[col].dtype == 'O':
        cat_cols.append(col)
cat_cols.remove('binaryClass')
cat_cols

['sex',
 'sick',
 'pregnant',
 'thyroid surgery',
 'I131 treatment',
 'lithium',
 'goitre',
 'tumor']

In [26]:
#Assigning numerical value to category
for col in cat_cols:
    df[col] = df[col].astype('category')
    df[col] = df[col].cat.codes

In [27]:
df = pd.get_dummies(df, columns=cat_cols)

In [28]:
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',4000)


In [29]:
df.columns

Index(['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'binaryClass', 'sex_0',
       'sex_1', 'sex_2', 'sick_0', 'sick_1', 'pregnant_0', 'pregnant_1',
       'thyroid surgery_0', 'thyroid surgery_1', 'I131 treatment_0',
       'I131 treatment_1', 'lithium_0', 'lithium_1', 'goitre_0', 'goitre_1',
       'tumor_0', 'tumor_1'],
      dtype='object')

In [30]:
df.drop(columns = ['sex_2', 'sick_1', 'pregnant_1', 'thyroid surgery_1','I131 treatment_1', 'lithium_1', 'goitre_1', 'tumor_1'], axis = 1, inplace = True)

In [31]:
df.columns


Index(['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'binaryClass', 'sex_0',
       'sex_1', 'sick_0', 'pregnant_0', 'thyroid surgery_0',
       'I131 treatment_0', 'lithium_0', 'goitre_0', 'tumor_0'],
      dtype='object')

In [32]:
df['binaryClass'] = df['binaryClass'].astype('category')
df['binaryClass'] = df['binaryClass'].cat.codes

In [33]:
df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            3762, 3763, 3764, 3765, 3766, 3767, 3768, 3769, 3770, 3771],
           dtype='int64', length=3771)

In [34]:
df.index = list(np.arange(0,3771))

In [35]:
#Split the data frame
df_X = df.drop('binaryClass', axis = 1)
df_Y = df["binaryClass"]

In [36]:
ssplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)


In [37]:
for train_index,test_index in ssplit.split(df_X, df_Y):
    X_train, X_test = df_X.loc[train_index], df_X.loc[test_index]
    y_train, y_test = df_Y.loc[train_index], df_Y.loc[test_index]



In [38]:
numerical_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

In [39]:
#Scaling

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3771 entries, 0 to 3770
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                3771 non-null   float64
 1   TSH                3771 non-null   float64
 2   T3                 3771 non-null   float64
 3   TT4                3771 non-null   float64
 4   T4U                3771 non-null   float64
 5   FTI                3771 non-null   float64
 6   binaryClass        3771 non-null   int8   
 7   sex_0              3771 non-null   uint8  
 8   sex_1              3771 non-null   uint8  
 9   sick_0             3771 non-null   uint8  
 10  pregnant_0         3771 non-null   uint8  
 11  thyroid surgery_0  3771 non-null   uint8  
 12  I131 treatment_0   3771 non-null   uint8  
 13  lithium_0          3771 non-null   uint8  
 14  goitre_0           3771 non-null   uint8  
 15  tumor_0            3771 non-null   uint8  
dtypes: float64(6), int8(1), 

In [None]:
#Model building

In [81]:
models = {
    LogisticRegression(max_iter=500, fit_intercept= True):'Logistic Regression',
    SVC(kernel= 'rbf'):"Support Vector Machine",
    RandomForestClassifier():'Random Forest'
}


In [82]:
for m in models.keys():
    m.fit(X_train_scaled, y_train)
    print( m.score(X_train_scaled, y_train))

0.9512599469496021
0.9559018567639257
1.0


In [83]:
for model,name in models.items():
     print(f"Accuracy Score for {name} is : ",model.score(X_test_scaled,y_test)*100,"%")

Accuracy Score for Logistic Regression is :  94.83443708609272 %
Accuracy Score for Support Vector Machine is :  95.62913907284768 %
Accuracy Score for Random Forest is :  97.6158940397351 %


In [58]:
#Applying Grid Search CV
models = {
    LogisticRegression():'Logistic Regression',
    SVC():"Support Vector Machine",
    RandomForestClassifier():'Random Forest'
}
parameters = {'cv':[5]}

In [73]:
m = LogisticRegression()
parameters = {'max_iter':[500,600], 'fit_intercept': [True, False]}
clf = GridSearchCV(m, parameters,  scoring='accuracy')
clf.fit(X_train_scaled, y_train)
#print(f"Accuracy Score for Logistic is : ",clf.score(X_test_scaled,y_test)*100,"%")
print(f"Accuracy Score for Logistic is : ",clf.score(X_test_scaled,y_test)*100,"%")
print(clf.best_score_)
print(clf.best_params_)

Accuracy Score for Logistic is :  94.83443708609272 %
0.9519230558026643
{'fit_intercept': True, 'max_iter': 500}


In [80]:
m = SVC()
parameters = {'kernel': ['linear', 'rbf']}
clf = GridSearchCV(m, parameters,  scoring='accuracy')
clf.fit(X_train_scaled, y_train)
#print(f"Accuracy Score for Logistic is : ",clf.score(X_test_scaled,y_test)*100,"%")
print(f"Accuracy Score for SVM is : ",clf.score(X_test_scaled,y_test)*100,"%")
print(clf.best_score_)
print(clf.best_params_)

Accuracy Score for SVM is :  95.62913907284768 %
0.952254730761205
{'kernel': 'rbf'}
