In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the thyroid dataset from a CSV file (path is relative to the working directory).
df3 = pd.read_csv('thyroid_filtered_categories.csv')

In [3]:
# Display the freshly loaded frame (the rendered output is truncated by pandas).
df3

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,target,target_category
0,29,F,f,f,f,f,f,f,f,t,...,f,f,0.3,0.0,0.0,0.00,0.0,other,-,normal
1,29,F,f,f,f,f,f,f,f,f,...,f,f,1.6,1.9,128.0,0.00,0.0,other,-,normal
2,41,F,f,f,f,f,f,f,f,f,...,f,f,0.0,0.0,0.0,0.00,0.0,other,-,normal
3,36,F,f,f,f,f,f,f,f,f,...,f,f,0.0,0.0,0.0,0.00,0.0,other,-,normal
4,32,F,f,f,f,f,f,f,f,f,...,f,f,0.0,0.0,0.0,0.00,0.0,other,S,hyperthyroid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7789,70,F,f,f,f,f,f,f,f,f,...,f,f,0.0,0.0,88.0,0.74,119.0,SVI,-,normal
7790,56,M,f,f,f,f,f,f,f,f,...,f,f,0.0,0.0,64.0,0.83,77.0,SVI,-,normal
7791,22,M,f,f,f,f,f,f,f,f,...,f,f,0.0,0.0,91.0,0.92,99.0,SVI,-,normal
7792,47,F,f,f,f,f,f,f,f,f,...,f,f,0.0,0.0,75.0,0.85,88.0,other,-,normal


In [4]:
# Summarise column dtypes and non-null counts of the loaded frame.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7794 entries, 0 to 7793
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  7794 non-null   int64  
 1   sex                  7794 non-null   object 
 2   on_thyroxine         7794 non-null   object 
 3   query_on_thyroxine   7794 non-null   object 
 4   on_antithyroid_meds  7794 non-null   object 
 5   sick                 7794 non-null   object 
 6   pregnant             7794 non-null   object 
 7   thyroid_surgery      7794 non-null   object 
 8   I131_treatment       7794 non-null   object 
 9   query_hypothyroid    7794 non-null   object 
 10  query_hyperthyroid   7794 non-null   object 
 11  lithium              7794 non-null   object 
 12  goitre               7794 non-null   object 
 13  tumor                7794 non-null   object 
 14  hypopituitary        7794 non-null   object 
 15  psych                7794 non-null   o

In [6]:
# Gather the names of all object-dtype columns, then print the distinct
# values found in each of them.
obj_col = [name for name, dtype in df3.dtypes.items() if dtype == 'object']
for name in obj_col:
    print(f'Unique values in {name}: {df3[name].unique()}')

Unique values in sex: ['F' 'M']
Unique values in on_thyroxine: ['f' 't']
Unique values in query_on_thyroxine: ['f' 't']
Unique values in on_antithyroid_meds: ['f' 't']
Unique values in sick: ['f' 't']
Unique values in pregnant: ['f' 't']
Unique values in thyroid_surgery: ['f' 't']
Unique values in I131_treatment: ['f' 't']
Unique values in query_hypothyroid: ['t' 'f']
Unique values in query_hyperthyroid: ['f' 't']
Unique values in lithium: ['f' 't']
Unique values in goitre: ['f' 't']
Unique values in tumor: ['f' 't']
Unique values in hypopituitary: ['f']
Unique values in psych: ['f' 't']
Unique values in referral_source: ['other' 'SVI' 'SVHC' 'STMW' 'SVHD' 'WEST']
Unique values in target: ['-' 'S' 'F' 'AK' 'G' 'A' 'Q' 'C|I' 'O' 'H|K' 'D' 'GK' 'P' 'FK' 'B' 'GI'
 'C' 'GKJ' 'OI' 'D|R' 'E']
Unique values in target_category: ['normal' 'hyperthyroid' 'hypothyroid']


In [7]:
# Show the names of the object-dtype columns collected by the scan above.
obj_col

['sex',
 'on_thyroxine',
 'query_on_thyroxine',
 'on_antithyroid_meds',
 'sick',
 'pregnant',
 'thyroid_surgery',
 'I131_treatment',
 'query_hypothyroid',
 'query_hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'referral_source',
 'target',
 'target_category']

In [9]:
# Remove the raw `target` code column; `target_category` is the column that
# gets encoded as the label further down.
# The guard makes this cell safe to re-run: once the text column
# `target_category` has been dropped, any column called `target` is the encoded
# label produced by the later rename and must not be removed. Rebinding with
# errors='ignore' (instead of inplace=True) also avoids a KeyError when the raw
# column is already gone.
if 'target_category' in df3.columns:
    df3 = df3.drop(columns=['target'], errors='ignore')
df3.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U',
       'FTI', 'referral_source', 'target_category'],
      dtype='object')

In [10]:
# Re-display the frame after the `target` column has been removed.
df3

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,target_category
0,29,F,f,f,f,f,f,f,f,t,...,f,f,f,0.3,0.0,0.0,0.00,0.0,other,normal
1,29,F,f,f,f,f,f,f,f,f,...,f,f,f,1.6,1.9,128.0,0.00,0.0,other,normal
2,41,F,f,f,f,f,f,f,f,f,...,f,f,f,0.0,0.0,0.0,0.00,0.0,other,normal
3,36,F,f,f,f,f,f,f,f,f,...,f,f,f,0.0,0.0,0.0,0.00,0.0,other,normal
4,32,F,f,f,f,f,f,f,f,f,...,f,f,f,0.0,0.0,0.0,0.00,0.0,other,hyperthyroid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7789,70,F,f,f,f,f,f,f,f,f,...,f,f,f,0.0,0.0,88.0,0.74,119.0,SVI,normal
7790,56,M,f,f,f,f,f,f,f,f,...,f,f,f,0.0,0.0,64.0,0.83,77.0,SVI,normal
7791,22,M,f,f,f,f,f,f,f,f,...,f,f,f,0.0,0.0,91.0,0.92,99.0,SVI,normal
7792,47,F,f,f,f,f,f,f,f,f,...,f,f,f,0.0,0.0,75.0,0.85,88.0,other,normal


In [11]:
# Integer codes for each family of text columns.
sex_map = {'F': 0, 'M': 1}
obj_col_map = {'f': 0, 't': 1}
target_map = {'hyperthyroid': 0, 'hypothyroid': 1, 'normal': 2}

# Columns that hold 'f' / 't' flags.
binary_cols = ['on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_meds',
               'sick', 'pregnant', 'thyroid_surgery', 'I131_treatment',
               'query_hypothyroid', 'query_hyperthyroid', 'lithium',
               'goitre', 'tumor', 'hypopituitary', 'psych']

# Source column -> mapping, in the order the `<column>_encoded` columns are
# appended to the frame (sex, the binary flags, then the label).
encoding_plan = {
    'sex': sex_map,
    **{col: obj_col_map for col in binary_cols},
    'target_category': target_map,
}

for col, mapping in encoding_plan.items():
    encoded = df3[col].map(mapping)
    # Series.map yields NaN for any value that is not a key of the mapping,
    # which would silently turn the column into floats with missing values.
    # Fail loudly instead and report the offending values.
    unmapped_mask = encoded.isna()
    if unmapped_mask.any():
        unexpected = df3.loc[unmapped_mask, col].unique().tolist()
        raise ValueError(
            f'Column {col!r} contains values without an encoding: {unexpected}'
        )
    df3[f'{col}_encoded'] = encoded

In [12]:
# Inspect the frame now that the `*_encoded` columns have been appended.
df3

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,thyroid_surgery_encoded,I131_treatment_encoded,query_hypothyroid_encoded,query_hyperthyroid_encoded,lithium_encoded,goitre_encoded,tumor_encoded,hypopituitary_encoded,psych_encoded,target_category_encoded
0,29,F,f,f,f,f,f,f,f,t,...,0,0,1,0,0,0,0,0,0,2
1,29,F,f,f,f,f,f,f,f,f,...,0,0,0,0,0,0,0,0,0,2
2,41,F,f,f,f,f,f,f,f,f,...,0,0,0,1,0,0,0,0,0,2
3,36,F,f,f,f,f,f,f,f,f,...,0,0,0,0,0,0,0,0,0,2
4,32,F,f,f,f,f,f,f,f,f,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7789,70,F,f,f,f,f,f,f,f,f,...,0,0,0,0,0,0,0,0,0,2
7790,56,M,f,f,f,f,f,f,f,f,...,0,0,0,1,0,0,0,0,0,2
7791,22,M,f,f,f,f,f,f,f,f,...,0,0,0,0,0,0,0,0,0,2
7792,47,F,f,f,f,f,f,f,f,f,...,0,0,0,0,0,0,0,0,0,2


In [13]:
# Re-scan the frame for object-dtype columns, printing the distinct values of
# each one and recording its name in `obj_col`.
obj_col = []
for column_name, column in df3.items():
    if column.dtype != 'object':
        continue
    print(f'Unique values in {column_name}: {column.unique()}')
    obj_col.append(column_name)

Unique values in sex: ['F' 'M']
Unique values in on_thyroxine: ['f' 't']
Unique values in query_on_thyroxine: ['f' 't']
Unique values in on_antithyroid_meds: ['f' 't']
Unique values in sick: ['f' 't']
Unique values in pregnant: ['f' 't']
Unique values in thyroid_surgery: ['f' 't']
Unique values in I131_treatment: ['f' 't']
Unique values in query_hypothyroid: ['t' 'f']
Unique values in query_hyperthyroid: ['f' 't']
Unique values in lithium: ['f' 't']
Unique values in goitre: ['f' 't']
Unique values in tumor: ['f' 't']
Unique values in hypopituitary: ['f']
Unique values in psych: ['f' 't']
Unique values in referral_source: ['other' 'SVI' 'SVHC' 'STMW' 'SVHD' 'WEST']
Unique values in target_category: ['normal' 'hyperthyroid' 'hypothyroid']


In [14]:
# List every column: the original ones plus their `_encoded` counterparts.
df3.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U',
       'FTI', 'referral_source', 'target_category', 'sex_encoded',
       'on_thyroxine_encoded', 'query_on_thyroxine_encoded',
       'on_antithyroid_meds_encoded', 'sick_encoded', 'pregnant_encoded',
       'thyroid_surgery_encoded', 'I131_treatment_encoded',
       'query_hypothyroid_encoded', 'query_hyperthyroid_encoded',
       'lithium_encoded', 'goitre_encoded', 'tumor_encoded',
       'hypopituitary_encoded', 'psych_encoded', 'target_category_encoded'],
      dtype='object')

In [15]:
# Drop the original text columns that now have an `_encoded` counterpart.
# The list is built from `binary_cols` (defined in the encoding cell) so the
# set of encoded columns and the set of dropped columns cannot drift apart,
# and the frame is rebound instead of being mutated in place.
# A missing column still raises KeyError on purpose: after the later renames
# the names 'sex', 'on_thyroxine', ... refer to the encoded columns, and an
# accidental re-run must not silently delete them.
df3 = df3.drop(columns=['sex', *binary_cols, 'target_category'])
df3.columns

Index(['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral_source',
       'sex_encoded', 'on_thyroxine_encoded', 'query_on_thyroxine_encoded',
       'on_antithyroid_meds_encoded', 'sick_encoded', 'pregnant_encoded',
       'thyroid_surgery_encoded', 'I131_treatment_encoded',
       'query_hypothyroid_encoded', 'query_hyperthyroid_encoded',
       'lithium_encoded', 'goitre_encoded', 'tumor_encoded',
       'hypopituitary_encoded', 'psych_encoded', 'target_category_encoded'],
      dtype='object')

In [17]:
# List the distinct values of `referral_source` before it is encoded.
df3['referral_source'].unique()

array(['other', 'SVI', 'SVHC', 'STMW', 'SVHD', 'WEST'], dtype=object)

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Replace every referral_source label with an integer code, overwriting the
# column. The fitted encoder is kept in `le` so the label behind each code can
# be looked up afterwards.
le = LabelEncoder().fit(df3['referral_source'])
df3['referral_source'] = le.transform(df3['referral_source'])

In [25]:
# Check the column names after referral_source was overwritten with its codes.
df3.columns

Index(['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral_source',
       'sex_encoded', 'on_thyroxine_encoded', 'query_on_thyroxine_encoded',
       'on_antithyroid_meds_encoded', 'sick_encoded', 'pregnant_encoded',
       'thyroid_surgery_encoded', 'I131_treatment_encoded',
       'query_hypothyroid_encoded', 'query_hyperthyroid_encoded',
       'lithium_encoded', 'goitre_encoded', 'tumor_encoded',
       'hypopituitary_encoded', 'psych_encoded', 'target_category_encoded'],
      dtype='object')

In [26]:
# Check the distinct values now stored in `referral_source`.
df3['referral_source'].unique()

array([5, 3, 1, 0, 2, 4])

In [27]:
# Give the encoded sex column the plain name `sex`.
df3 = df3.rename(columns={'sex_encoded': 'sex'})

In [30]:
# Strip the `_encoded` suffix from each binary flag column and give the
# encoded label column the short name `target`.
df3 = df3.rename(columns={
    **{f'{flag}_encoded': flag for flag in (
        'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_meds',
        'sick', 'pregnant', 'thyroid_surgery', 'I131_treatment',
        'query_hypothyroid', 'query_hyperthyroid', 'lithium',
        'goitre', 'tumor', 'hypopituitary', 'psych',
    )},
    'target_category_encoded': 'target',
})

In [31]:
# Verify the column names after the renames.
df3.columns

Index(['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral_source', 'sex',
       'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_meds', 'sick',
       'pregnant', 'thyroid_surgery', 'I131_treatment', 'query_hypothyroid',
       'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary',
       'psych', 'target'],
      dtype='object')

In [32]:
# Re-check dtypes and non-null counts now that the text columns are encoded.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7794 entries, 0 to 7793
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  7794 non-null   int64  
 1   TSH                  7794 non-null   float64
 2   T3                   7794 non-null   float64
 3   TT4                  7794 non-null   float64
 4   T4U                  7794 non-null   float64
 5   FTI                  7794 non-null   float64
 6   referral_source      7794 non-null   int64  
 7   sex                  7794 non-null   int64  
 8   on_thyroxine         7794 non-null   int64  
 9   query_on_thyroxine   7794 non-null   int64  
 10  on_antithyroid_meds  7794 non-null   int64  
 11  sick                 7794 non-null   int64  
 12  pregnant             7794 non-null   int64  
 13  thyroid_surgery      7794 non-null   int64  
 14  I131_treatment       7794 non-null   int64  
 15  query_hypothyroid    7794 non-null   i

In [34]:
# Display the encoded frame before it is written to disk.
df3

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,referral_source,sex,on_thyroxine,query_on_thyroxine,...,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,target
0,29,0.3,0.0,0.0,0.00,0.0,5,0,0,0,...,0,0,1,0,0,0,0,0,0,2
1,29,1.6,1.9,128.0,0.00,0.0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,41,0.0,0.0,0.0,0.00,0.0,5,0,0,0,...,0,0,0,1,0,0,0,0,0,2
3,36,0.0,0.0,0.0,0.00,0.0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,32,0.0,0.0,0.0,0.00,0.0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7789,70,0.0,0.0,88.0,0.74,119.0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,2
7790,56,0.0,0.0,64.0,0.83,77.0,3,1,0,0,...,0,0,0,1,0,0,0,0,0,2
7791,22,0.0,0.0,91.0,0.92,99.0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,2
7792,47,0.0,0.0,75.0,0.85,88.0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [35]:
# Write the encoded frame to CSV; index=False leaves the row index out of the file.
df3.to_csv('thyroid_encoded.csv', index=False)