<a href="https://colab.research.google.com/github/GA239/DS_course/blob/master/HW7/tsk1/hypothyroid_preprocessed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [200]:
import pandas as pd
import os
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures

In [201]:
# read dataset
DRIVE = '/content/drive/My Drive'
hypothyroid = os.path.join(DRIVE, 'dataset_57_hypothyroid.csv')

In [202]:
df = pd.read_csv(hypothyroid)
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,f,?,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.1,t,2,t,102,f,?,f,?,f,?,other,negative
2,46,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.98,f,?,t,109,t,0.91,t,120,f,?,other,negative
3,70,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.16,t,1.9,t,175,f,?,f,?,f,?,other,negative
4,70,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.72,t,1.2,t,61,t,0.87,t,70,f,?,SVI,negative


In [203]:
TARGET_COLUMN = 'Class'

In [204]:
df = df[df[TARGET_COLUMN] != 'secondary_hypothyroid']

In [205]:
df = pd.concat([df, pd.get_dummies(df.Class, prefix=TARGET_COLUMN)], axis=1)
df.drop([TARGET_COLUMN], axis=1, inplace=True)

In [206]:
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class_compensated_hypothyroid,Class_negative,Class_primary_hypothyroid
0,41,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,f,?,SVHC,0,1,0
1,23,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.1,t,2,t,102,f,?,f,?,f,?,other,0,1,0
2,46,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.98,f,?,t,109,t,0.91,t,120,f,?,other,0,1,0
3,70,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.16,t,1.9,t,175,f,?,f,?,f,?,other,0,1,0
4,70,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.72,t,1.2,t,61,t,0.87,t,70,f,?,SVI,0,1,0


In [207]:
df.shape

(3770, 32)

In [208]:
df = df.drop_duplicates()

In [209]:
df.shape

(3709, 32)

In [210]:
df = df.replace(to_replace={'f':0,'t':1, 'y':1, 'n':0, '?': np.nan, 'M':0, 'F':1})
df['referral_source'] = LabelEncoder().fit_transform(df['referral_source'])

In [211]:
numeric_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']
categorial_columns = list(set(data.columns) - set(numeric_columns) - {TARGET_COLUMN})

In [212]:
df[numeric_columns] = df[numeric_columns].astype('float64', errors='ignore')
df[categorial_columns] = df[categorial_columns].astype('category', errors='ignore')

In [213]:
df = df.drop(columns=['TBG_measured', 'TBG'])

In [214]:
target_columns = [TARGET_COLUMN + i for i in ['_compensated_hypothyroid',	'_negative',	'_primary_hypothyroid']]
X = df.drop(columns=target_columns)
Y = df[target_columns]

In [215]:
features_most_frequent = ['sex', 'age']
features_mean_or_median = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
other_features = list(set(X.columns) - set(features_most_frequent + features_mean_or_median)) 

In [216]:
imputer = ColumnTransformer(transformers=[
    ('imputer_most_frequent', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), features_most_frequent),
    ('imputer_mean_or_median', SimpleImputer(missing_values=np.nan, strategy='median'), features_mean_or_median),
    ('imputer_other', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), other_features)
])

In [217]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [218]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape 

((2967, 27), (742, 27), (2967, 3), (742, 3))

In [219]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
Y_train.reset_index(drop=True, inplace=True)
Y_test.reset_index(drop=True, inplace=True)

X_columns = X_train.columns
Y_columns = Y_train.columns

In [220]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape 

((2967, 27), (742, 27), (2967, 3), (742, 3))

In [221]:
X_train.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,referral_source
0,66.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.32,1,2.5,1,169.0,1,1.14,1,149.0,3
1,68.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,6.7,1,1.4,1,101.0,0,,0,,3
2,76.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.69,1,2.3,1,138.0,1,1.04,1,133.0,3
3,82.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.5,0,,1,125.0,1,0.84,1,149.0,4
4,37.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.25,1,2.0,1,114.0,1,0.9,1,128.0,3


In [222]:
Y_train

Unnamed: 0,Class_compensated_hypothyroid,Class_negative,Class_primary_hypothyroid
0,0,1,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
2962,0,1,0
2963,0,1,0
2964,1,0,0
2965,0,1,0


In [223]:
imputer.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('imputer_most_frequent',
                                 SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               missing_values=nan,
                                               strategy='most_frequent',
                                               verbose=0),
                                 ['sex', 'age']),
                                ('imputer_mean_or_median',
                                 SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               mi...
                                               strategy='most_frequent',
                                               verbose=0),
                                 ['lithium', 'qu

In [224]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [225]:
standart_scaler = RobustScaler()
standart_scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [226]:
X_train = standart_scaler.transform(X_train)
X_test = standart_scaler.transform(X_test)

In [227]:
X_train.shape

(2967, 27)

In [231]:
p_featuers = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True, order='C')
p_featuers.fit(X_train)

X_train = p_featuers.transform(X_train)
X_test = p_featuers.transform(X_test)

In [234]:
X_test = pd.DataFrame(X_test)
X_train = pd.DataFrame(X_train)

In [235]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377
0,0.662491,0.701606,-0.191478,0.655774,1.715805,0.773602,1.199889,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,0.487645,0.309261,-0.130923,-0.119829,-0.233176,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,0.464808,-0.126853,0.434444,1.136706,0.512505,0.794915,-0.043949,-0.173775,0.201397,0.142952,-0.13423,-0.103731,-0.073422,...,0.016063,0.050102,0.034342,-0.040318,0.027941,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,-0.036901,0.004282,0.058594,0.028609,0.028609,0.089232,0.061163,-0.071807,0.004614,0.002253,0.002253,0.007027,0.004816,-0.005655,0.03083,0.03083,0.096162,0.065914,-0.077384,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777
1,-1.509455,0.800348,0.075313,-0.820912,-0.209401,-0.081537,-0.102384,-0.066339,-0.262305,0.303999,0.21578,4.935481,-0.156578,-0.110826,-0.099351,0.487645,-3.233512,-0.130923,-0.119829,-0.233176,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,-3.247268,-1.208089,-0.113681,1.23913,0.316081,0.123076,0.154544,0.100135,0.395938,-0.458872,-0.32571,-7.449885,0.236347,0.167287,...,0.016063,0.050102,0.034342,0.425143,0.027941,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,0.389117,0.004282,0.058594,0.028609,0.028609,0.089232,0.061163,0.757185,0.004614,0.002253,0.002253,0.007027,0.004816,0.059626,0.03083,0.03083,0.096162,0.065914,0.815992,0.015053,0.046951,0.032182,0.39841,0.046951,0.032182,0.39841,0.100379,1.242669,0.851775
2,-1.509455,1.195313,-0.176006,0.387286,0.838138,0.239141,0.703785,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,0.487645,0.309261,-0.130923,-0.119829,-0.233176,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,-1.804271,0.265673,-0.58459,-1.265131,-0.360972,-1.062331,0.100135,0.395938,-0.458872,-0.32571,0.305837,0.236347,0.167287,...,0.016063,0.050102,0.034342,-0.040318,0.027941,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,-0.036901,0.004282,0.058594,0.028609,0.028609,0.089232,0.061163,-0.071807,0.004614,0.002253,0.002253,0.007027,0.004816,-0.005655,0.03083,0.03083,0.096162,0.065914,-0.077384,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777
3,0.662491,1.491537,-0.183951,-0.015447,0.470084,-0.829783,1.199889,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,-2.050674,0.309261,-0.130923,-0.119829,-0.233176,-0.018362,0.664642,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,0.98813,-0.121866,-0.010233,0.311426,-0.549724,0.794915,-0.043949,-0.173775,0.201397,0.142952,-0.13423,-0.103731,-0.073422,...,0.016063,0.050102,0.034342,-0.040318,0.027941,0.0022,-0.079643,0.014702,0.014702,0.045856,0.031432,-0.036901,0.004282,-0.154978,0.028609,0.028609,0.089232,0.061163,-0.071807,-0.012204,0.002253,0.002253,0.007027,0.004816,-0.005655,-0.081545,-0.081545,-0.254346,-0.174339,0.204677,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777
4,0.662491,-0.730144,-0.194405,-0.015447,0.158653,-0.509106,0.548752,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,0.487645,0.309261,-0.130923,-0.119829,4.288608,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,-0.483714,-0.128792,-0.010233,0.105106,-0.337278,0.363543,-0.043949,-0.173775,0.201397,0.142952,-0.13423,-0.103731,-0.073422,...,0.016063,0.050102,0.034342,-0.040318,-0.513899,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,-0.036901,-0.078746,-1.077666,-0.526173,-0.526173,-1.641171,-1.124924,1.320682,0.004614,0.002253,0.002253,0.007027,0.004816,-0.005655,0.03083,0.03083,0.096162,0.065914,-0.077384,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777


In [236]:
X_train.to_csv(os.path.join(DRIVE, 'X_train.csv'))
X_test.to_csv(os.path.join(DRIVE, 'X_test.csv'))
Y_train.to_csv(os.path.join(DRIVE, 'Y_train.csv'))
Y_test.to_csv(os.path.join(DRIVE, 'Y_test.csv'))

In [237]:
Xtr = pd.read_csv(os.path.join(DRIVE, 'X_train.csv'), index_col=0)

In [239]:
Xtr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377
0,0.662491,0.701606,-0.191478,0.655774,1.715805,0.773602,1.199889,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,0.487645,0.309261,-0.130923,-0.119829,-0.233176,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,0.464808,-0.126853,0.434444,1.136706,0.512505,0.794915,-0.043949,-0.173775,0.201397,0.142952,-0.13423,-0.103731,-0.073422,...,0.016063,0.050102,0.034342,-0.040318,0.027941,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,-0.036901,0.004282,0.058594,0.028609,0.028609,0.089232,0.061163,-0.071807,0.004614,0.002253,0.002253,0.007027,0.004816,-0.005655,0.03083,0.03083,0.096162,0.065914,-0.077384,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777
1,-1.509455,0.800348,0.075313,-0.820912,-0.209401,-0.081537,-0.102384,-0.066339,-0.262305,0.303999,0.21578,4.935481,-0.156578,-0.110826,-0.099351,0.487645,-3.233512,-0.130923,-0.119829,-0.233176,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,-3.247268,-1.208089,-0.113681,1.23913,0.316081,0.123076,0.154544,0.100135,0.395938,-0.458872,-0.32571,-7.449885,0.236347,0.167287,...,0.016063,0.050102,0.034342,0.425143,0.027941,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,0.389117,0.004282,0.058594,0.028609,0.028609,0.089232,0.061163,0.757185,0.004614,0.002253,0.002253,0.007027,0.004816,0.059626,0.03083,0.03083,0.096162,0.065914,0.815992,0.015053,0.046951,0.032182,0.39841,0.046951,0.032182,0.39841,0.100379,1.242669,0.851775
2,-1.509455,1.195313,-0.176006,0.387286,0.838138,0.239141,0.703785,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,0.487645,0.309261,-0.130923,-0.119829,-0.233176,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,-1.804271,0.265673,-0.58459,-1.265131,-0.360972,-1.062331,0.100135,0.395938,-0.458872,-0.32571,0.305837,0.236347,0.167287,...,0.016063,0.050102,0.034342,-0.040318,0.027941,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,-0.036901,0.004282,0.058594,0.028609,0.028609,0.089232,0.061163,-0.071807,0.004614,0.002253,0.002253,0.007027,0.004816,-0.005655,0.03083,0.03083,0.096162,0.065914,-0.077384,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777
3,0.662491,1.491537,-0.183951,-0.015447,0.470084,-0.829783,1.199889,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,-2.050674,0.309261,-0.130923,-0.119829,-0.233176,-0.018362,0.664642,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,0.98813,-0.121866,-0.010233,0.311426,-0.549724,0.794915,-0.043949,-0.173775,0.201397,0.142952,-0.13423,-0.103731,-0.073422,...,0.016063,0.050102,0.034342,-0.040318,0.027941,0.0022,-0.079643,0.014702,0.014702,0.045856,0.031432,-0.036901,0.004282,-0.154978,0.028609,0.028609,0.089232,0.061163,-0.071807,-0.012204,0.002253,0.002253,0.007027,0.004816,-0.005655,-0.081545,-0.081545,-0.254346,-0.174339,0.204677,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777
4,0.662491,-0.730144,-0.194405,-0.015447,0.158653,-0.509106,0.548752,-0.066339,-0.262305,0.303999,0.21578,-0.202614,-0.156578,-0.110826,-0.099351,0.487645,0.309261,-0.130923,-0.119829,4.288608,-0.018362,-0.251286,-0.122691,-0.122691,-0.382682,-0.262305,0.307951,-0.483714,-0.128792,-0.010233,0.105106,-0.337278,0.363543,-0.043949,-0.173775,0.201397,0.142952,-0.13423,-0.103731,-0.073422,...,0.016063,0.050102,0.034342,-0.040318,-0.513899,0.0022,0.030111,0.014702,0.014702,0.045856,0.031432,-0.036901,-0.078746,-1.077666,-0.526173,-0.526173,-1.641171,-1.124924,1.320682,0.004614,0.002253,0.002253,0.007027,0.004816,-0.005655,0.03083,0.03083,0.096162,0.065914,-0.077384,0.015053,0.046951,0.032182,-0.037783,0.046951,0.032182,-0.037783,0.100379,-0.117847,-0.080777
