In [1]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('hypothyroid.csv', na_values = '?')
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,P
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,P
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,P
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,P
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,P


In [6]:
# display all column names
list(df)

['age',
 'sex',
 'on thyroxine',
 'query on thyroxine',
 'on antithyroid medication',
 'sick',
 'pregnant',
 'thyroid surgery',
 'I131 treatment',
 'query hypothyroid',
 'query hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'TSH measured',
 'TSH',
 'T3 measured',
 'T3',
 'TT4 measured',
 'TT4',
 'T4U measured',
 'T4U',
 'FTI measured',
 'FTI',
 'TBG measured',
 'TBG',
 'referral source',
 'binaryClass']

In [7]:
# rename column names more appropriately (without spaces, shorter)
df1 = df.rename({'on thyroxine':'on_thyroxine',
           'query on thyroxine':'q_on_thyroxine',
           'on antithyroid medication':'on_antithyroid_medi',
           'thyroid surgery':'thyroid_surg',
           'I131 treatment':'I131_trtmt',
           'query hypothyroid': 'q_hypothyroid',
           'query hyperthyroid': 'q_hyperthyroid',
           'TSH measured':'TSH_m',
           'T3 measured': 'T3_m',
           'TT4 measured': 'TT4_m',
           'T4U measured': 'T4U_m',
           'FTI measured': 'FTI_m',
           'TBG measured': 'TBG_m',
           'referral source': 'refer_src',
           'binaryClass': 'target'
          }, axis=1)

In [8]:
# check for any null values
df1.isnull().sum()  # null values stated by '?'

age                       1
sex                     150
on_thyroxine              0
q_on_thyroxine            0
on_antithyroid_medi       0
sick                      0
pregnant                  0
thyroid_surg              0
I131_trtmt                0
q_hypothyroid             0
q_hyperthyroid            0
lithium                   0
goitre                    0
tumor                     0
hypopituitary             0
psych                     0
TSH_m                     0
TSH                     369
T3_m                      0
T3                      769
TT4_m                     0
TT4                     231
T4U_m                     0
T4U                     387
FTI_m                     0
FTI                     385
TBG_m                     0
TBG                    3772
refer_src                 0
target                    0
dtype: int64

In [9]:
# see how many entries (rows) and features (columns)
df1.shape

(3772, 30)

In [10]:
# describes the data for each column 
df1.describe()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG
count,3771.0,3403.0,3003.0,3541.0,3385.0,3387.0,0.0
mean,51.735879,5.086766,2.0135,108.319345,0.995,110.469649,
std,20.084958,24.52147,0.827434,35.604248,0.195457,33.089698,
min,1.0,0.005,0.05,2.0,0.25,2.0,
25%,36.0,0.5,1.6,88.0,0.88,93.0,
50%,54.0,1.4,2.0,103.0,0.98,107.0,
75%,67.0,2.7,2.4,124.0,1.08,124.0,
max,455.0,530.0,10.6,430.0,2.32,395.0,


In [11]:
# displays number of unique values for each feature
df1.nunique()

age                     93
sex                      2
on_thyroxine             2
q_on_thyroxine           2
on_antithyroid_medi      2
sick                     2
pregnant                 2
thyroid_surg             2
I131_trtmt               2
q_hypothyroid            2
q_hyperthyroid           2
lithium                  2
goitre                   2
tumor                    2
hypopituitary            2
psych                    2
TSH_m                    2
TSH                    287
T3_m                     2
T3                      69
TT4_m                    2
TT4                    241
T4U_m                    2
T4U                    146
FTI_m                    2
FTI                    234
TBG_m                    1
TBG                      0
refer_src                5
target                   2
dtype: int64

In [12]:
# drop rows with missing values

# axis:  0 - to drop rows, 1 - to drop columns

new_df = df1.dropna(how='any',
                  subset=['sex'])  # drop rows which have missing sex values

In [13]:
new_df.isnull().sum()  # shows there are no more rows with missing sex values

age                       1
sex                       0
on_thyroxine              0
q_on_thyroxine            0
on_antithyroid_medi       0
sick                      0
pregnant                  0
thyroid_surg              0
I131_trtmt                0
q_hypothyroid             0
q_hyperthyroid            0
lithium                   0
goitre                    0
tumor                     0
hypopituitary             0
psych                     0
TSH_m                     0
TSH                     352
T3_m                      0
T3                      745
TT4_m                     0
TT4                     217
T4U_m                     0
T4U                     367
FTI_m                     0
FTI                     365
TBG_m                     0
TBG                    3622
refer_src                 0
target                    0
dtype: int64

In [14]:
new_df.shape

(3622, 30)

In [15]:
# drop columns that contain only missing values
df_test = new_df.dropna( axis=1, how='all')

In [16]:
df_test.shape

(3622, 29)

In [17]:
df_test

Unnamed: 0,age,sex,on_thyroxine,q_on_thyroxine,on_antithyroid_medi,sick,pregnant,thyroid_surg,I131_trtmt,q_hypothyroid,...,T3,TT4_m,TT4,T4U_m,T4U,FTI_m,FTI,TBG_m,refer_src,target
0,41.0,F,f,f,f,f,f,f,f,f,...,2.5,t,125.0,t,1.14,t,109.0,f,SVHC,P
1,23.0,F,f,f,f,f,f,f,f,f,...,2.0,t,102.0,f,,f,,f,other,P
2,46.0,M,f,f,f,f,f,f,f,f,...,,t,109.0,t,0.91,t,120.0,f,other,P
3,70.0,F,t,f,f,f,f,f,f,f,...,1.9,t,175.0,f,,f,,f,other,P
4,70.0,F,f,f,f,f,f,f,f,f,...,1.2,t,61.0,t,0.87,t,70.0,f,SVI,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,F,f,f,f,f,f,f,f,f,...,,f,,f,,f,,f,other,P
3768,68.0,F,f,f,f,f,f,f,f,f,...,2.1,t,124.0,t,1.08,t,114.0,f,SVI,P
3769,74.0,F,f,f,f,f,f,f,f,f,...,1.8,t,112.0,t,1.07,t,105.0,f,other,P
3770,72.0,M,f,f,f,f,f,f,f,f,...,2.0,t,82.0,t,0.94,t,87.0,f,SVI,P


In [18]:
# Assign independent and dependent variables, dropping unecessary columns
X = df_test.drop(columns=['TSH_m', 'T3_m', 'TT4_m', 'T4U_m', 'FTI_m',
                          'TBG_m', 'refer_src' , 'target'])
y = df_test['target']
# split data before further preprocessing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                   random_state=1)

In [19]:
# reset indexes of train and test sets

dfs = [X_train, y_train, X_test, y_test]

for data in dfs:
    data.reset_index(drop=True, inplace=True)

In [20]:
X_train.isnull().sum()

age                      1
sex                      0
on_thyroxine             0
q_on_thyroxine           0
on_antithyroid_medi      0
sick                     0
pregnant                 0
thyroid_surg             0
I131_trtmt               0
q_hypothyroid            0
q_hyperthyroid           0
lithium                  0
goitre                   0
tumor                    0
hypopituitary            0
psych                    0
TSH                    262
T3                     562
TT4                    163
T4U                    282
FTI                    281
dtype: int64

In [21]:
print(df1['sex'].value_counts())

F    2480
M    1142
Name: sex, dtype: int64


In [22]:
# dealing with missing values through imputation
from sklearn.impute import SimpleImputer, KNNImputer

Simp_mean = SimpleImputer(strategy='mean')

measures = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
# impute all missing metric values using average(mean)
for measure in measures:
    X_train[f'{measure}'] = Simp_mean.fit_transform(X_train[[measure]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[f'{measure}'] = Simp_mean.fit_transform(X_train[[measure]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[f'{measure}'] = Simp_mean.fit_transform(X_train[[measure]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[f'{measure}'] = Simp_mean.fit_transform(X_train[[measure]])
A

In [23]:
X_train.isnull().sum()

age                    1
sex                    0
on_thyroxine           0
q_on_thyroxine         0
on_antithyroid_medi    0
sick                   0
pregnant               0
thyroid_surg           0
I131_trtmt             0
q_hypothyroid          0
q_hyperthyroid         0
lithium                0
goitre                 0
tumor                  0
hypopituitary          0
psych                  0
TSH                    0
T3                     0
TT4                    0
T4U                    0
FTI                    0
dtype: int64

In [24]:
# drop single row with missing age value
X_train = X_train.dropna(how='any', subset=['age'])

In [25]:
X_train.head()

Unnamed: 0,age,sex,on_thyroxine,q_on_thyroxine,on_antithyroid_medi,sick,pregnant,thyroid_surg,I131_trtmt,q_hypothyroid,...,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI
0,39.0,M,f,f,f,f,f,f,f,f,...,f,f,f,f,t,1.0,1.9,83.0,0.9,92.0
1,75.0,M,f,f,f,f,f,f,f,f,...,f,f,f,f,f,0.035,1.2,98.0,0.83,118.0
2,40.0,M,f,f,f,f,f,f,f,f,...,f,f,f,f,f,1.3,2.3,87.0,0.96,90.0
3,75.0,F,t,f,f,f,f,f,f,f,...,f,f,f,f,f,1.6,1.6,102.0,0.94,109.0
4,72.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,f,0.015,2.9,198.0,0.91,217.0


In [26]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

le = LabelEncoder()
le.fit(X_train['on_thyroxine'])

LabelEncoder()

In [27]:
le.classes_

array(['f', 't'], dtype=object)

In [28]:
le.transform(['f', 't'])

array([0, 1])

In [29]:
list(X_train)

['age',
 'sex',
 'on_thyroxine',
 'q_on_thyroxine',
 'on_antithyroid_medi',
 'sick',
 'pregnant',
 'thyroid_surg',
 'I131_trtmt',
 'q_hypothyroid',
 'q_hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'TSH',
 'T3',
 'TT4',
 'T4U',
 'FTI']

In [30]:
# encode binary columns
ft_col = ['on_thyroxine', 'q_on_thyroxine', 'on_antithyroid_medi',
          'sick', 'pregnant', 'thyroid_surg', 'I131_trtmt',
          'q_hypothyroid', 'q_hyperthyroid', 'lithium', 'goitre',
          'tumor', 'hypopituitary', 'psych']
le.fit(X_train['on_thyroxine'])
for col in ft_col:
    if type(X_train.loc[0, col]) == str: #checks if already encoded
        X_train[col] = le.transform(X_train[col])
     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.transform(X_train[col])


In [31]:
le.fit(y_train)
if type(y_train[0]) == str:
    y_train = le.transform(y_train)

Next step:
OneHotEncode sex column

In [32]:
X_train

Unnamed: 0,age,sex,on_thyroxine,q_on_thyroxine,on_antithyroid_medi,sick,pregnant,thyroid_surg,I131_trtmt,q_hypothyroid,...,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI
0,39.0,M,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1.000,1.900000,83.0,0.90,92.0
1,75.0,M,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.035,1.200000,98.0,0.83,118.0
2,40.0,M,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.300,2.300000,87.0,0.96,90.0
3,75.0,F,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1.600,1.600000,102.0,0.94,109.0
4,72.0,F,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.015,2.900000,198.0,0.91,217.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2711,62.0,F,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0.025,2.200000,183.0,1.03,178.0
2712,59.0,F,0,0,0,0,0,0,0,1,...,0,0,0,0,0,2.800,2.005938,134.0,1.05,128.0
2713,71.0,F,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.300,0.900000,126.0,0.85,148.0
2714,24.0,F,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.400,2.005938,142.0,1.20,118.0


In [33]:
sex_ohe = pd.get_dummies(X_train['sex'])

In [34]:
sex_ohe

Unnamed: 0,F,M
0,0,1
1,0,1
2,0,1
3,1,0
4,1,0
...,...,...
2711,1,0
2712,1,0
2713,1,0
2714,1,0


In [35]:
X_train['sex'] = sex_ohe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['sex'] = sex_ohe


In [39]:
pd.concat([X_train, sex_ohe], axis=1)

Unnamed: 0,age,sex,on_thyroxine,q_on_thyroxine,on_antithyroid_medi,sick,pregnant,thyroid_surg,I131_trtmt,q_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,F,M
0,39.0,0,0,0,0,0,0,0,0,0,...,0,0,1,1.000,1.900000,83.0,0.90,92.0,0,1
1,75.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.035,1.200000,98.0,0.83,118.0,0,1
2,40.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1.300,2.300000,87.0,0.96,90.0,0,1
3,75.0,1,1,0,0,0,0,0,0,0,...,0,0,0,1.600,1.600000,102.0,0.94,109.0,1,0
4,72.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0.015,2.900000,198.0,0.91,217.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2711,62.0,1,1,1,0,0,0,0,0,0,...,0,0,0,0.025,2.200000,183.0,1.03,178.0,1,0
2712,59.0,1,0,0,0,0,0,0,0,1,...,0,0,0,2.800,2.005938,134.0,1.05,128.0,1,0
2713,71.0,1,0,0,0,0,0,0,0,0,...,0,0,0,1.300,0.900000,126.0,0.85,148.0,1,0
2714,24.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0.400,2.005938,142.0,1.20,118.0,1,0


In [30]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)
ohe.fit(X_train[['sex']])



In [40]:
X_train.shape

(2715, 21)

In [123]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)
ohe.fit(X_train[['sex']])

OneHotEncoder(sparse=False)

In [124]:
ohe.transform(X_train[['sex']])

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [135]:
if type(X_train.loc[0, 'sex']) == str:
    X_train[['sex']] = ohe.transform(X_train[['sex']])

In [139]:
X_train

Unnamed: 0,age,sex,on_thyroxine,q_on_thyroxine,on_antithyroid_medi,sick,pregnant,thyroid_surg,I131_trtmt,q_hypothyroid,...,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI
0,39.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1.000,1.900000,83.0,0.90,92.0
1,75.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.035,1.200000,98.0,0.83,118.0
2,40.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.300,2.300000,87.0,0.96,90.0
3,75.0,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1.600,1.600000,102.0,0.94,109.0
4,72.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.015,2.900000,198.0,0.91,217.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2710,62.0,1.0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0.025,2.200000,183.0,1.03,178.0
2711,59.0,1.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,2.800,2.005938,134.0,1.05,128.0
2712,71.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.300,0.900000,126.0,0.85,148.0
2713,24.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.400,2.005938,142.0,1.20,118.0


In [136]:
X_train.loc[0, 'sex']
# get_dummies could potentially be used to 
# visualise the one-hot-encoded columns

0.0

In [8]:
import seaborn as sns

correlation = X.corr()

In [9]:
sns.heatmap(correlation, xticklabels=correlation.columns, 
            yticklabels=correlation.columns, annot=True)

ValueError: zero-size array to reduction operation fmin which has no identity

In [10]:
X_new = X.drop(columns=['on thyroxine', 'query on thyroxine', 'on antithyroid medication','sick'])

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TSH measured,TSH,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI
0,41,F,f,f,f,f,f,f,f,f,...,t,1.3,t,2.5,t,125,t,1.14,t,109
1,23,F,f,f,f,f,f,f,f,f,...,t,4.1,t,2,t,102,f,?,f,?
2,46,M,f,f,f,f,f,f,f,f,...,t,0.98,f,?,t,109,t,0.91,t,120
3,70,F,t,f,f,f,f,f,f,f,...,t,0.16,t,1.9,t,175,f,?,f,?
4,70,F,f,f,f,f,f,f,f,f,...,t,0.72,t,1.2,t,61,t,0.87,t,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,f,?,f,?
3768,68,F,f,f,f,f,f,f,f,f,...,t,1,t,2.1,t,124,t,1.08,t,114
3769,74,F,f,f,f,f,f,f,f,f,...,t,5.1,t,1.8,t,112,t,1.07,t,105
3770,72,M,f,f,f,f,f,f,f,f,...,t,0.7,t,2,t,82,t,0.94,t,87
