# Predikcia 

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
from sklearn.pipeline import Pipeline

In [2]:
# Display configuration: lift pandas' limits on output width and on the
# number of columns shown, so wide frames are rendered without column elision.
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

In [3]:
from Models import CombineDatasets as d1
from Models import DecodeJSONColumn as d2
from Models import ColumnDropper as d3
from Models import BinaryEncoder as d4
from Models import ClassFixer as d5
from Models import FixDates as d6
from Models import ComputeCurYear as d7
from Models import Replacer as d8
from Models import FillMeasuredColumns as d9
from Models import CountMeasuredHormones as d10
from Models import FillNanNumeric as d11
from Models import PredictNanNumeric as d12
from Models import Classifier as d13
from Models import QuantileReplacer as d14

In [4]:
# Names of the columns handed to BinaryEncoder in the preprocessing pipeline.
# The order of the entries is kept exactly as originally written.
bool_columns = [
    'FTI measured', 'I131 treatment', 'T3 measured',
    'goitre', 'lithium', 'on thyroxine',
    'pregnant', 'tumor', 'query on thyroxine',
    'thyroid surgery', 'TSH measured', 'query hyperthyroid',
    'TT4 measured', 'on antithyroid medication', 'sick',
    'T4U measured', 'psych', 'query hypothyroid',
]

In [5]:
# Preprocessing pipeline. The steps run in the order listed; step names and
# constructor arguments are load-bearing (they identify the steps inside the
# Pipeline), so they are kept exactly as they were.
ppl = Pipeline(
    steps=(
        # --- Combine the inputs and normalise the raw columns ---
        [
            ('combination', d1.CombineDatasets()),
            ('json_decode', d2.DecodeJSONColumn('medical_info')),
            ('drop_columns', d3.ColumnDropper(['hypopituitary', 'TBG measured', 'TBG'])),
            ('binary_fixer', d4.BinaryEncoder(bool_columns)),
            ('class_fix', d5.ClassFixer()),
            ('fix_dates', d6.FixDates()),
            ('compute_cur_year', d7.ComputeCurYear()),
            # Turn the placeholder strings '?.4' and '?' in FTI into NaN.
            ('fti_question_mark_to_nan2', d8.Replacer(['FTI'], '?.4', float('nan'), str)),
            ('fti_question_mark_to_nan', d8.Replacer(['FTI'], '?', float('nan'), str)),
            # NOTE(review): the next two steps replace NaN with NaN; presumably
            # they exist only for the effect of the last argument (str, then
            # float) on the column -- confirm against Replacer.
            ('fti_question_mark_to_nan1', d8.Replacer(['FTI'], np.nan, float('nan'), str)),
            ('fti_question_mark_to_nan3', d8.Replacer(['FTI'], np.nan, float('nan'), float)),
            ('fill_measured_columns', d9.FillMeasuredColumns()),
            ('count_measured', d10.CountMeasuredHormones('measured_hormones')),
        ]
        # --- Fill hormone values ---
        + [
            ('fill_tt4', d11.FillNanNumeric('mean', 'TT4')),
            ('fill_tsh', d11.FillNanNumeric('median', 'TSH')),
        ]
        # T3, T4U and FTI are filled by PredictNanNumeric from the columns
        # listed next to them; FTI comes last because it also uses T4U.
        + [
            ('predicted_' + target.lower(), d12.PredictNanNumeric(target, predictors))
            for target, predictors in (
                ('T3', ['TT4', 'TSH']),
                ('T4U', ['TT4', 'TSH']),
                ('FTI', ['TT4', 'TSH', 'T4U']),
            )
        ]
        # --- Fill pregnant ---
        + [
            ('pregnant_predict', d13.Classifier('pregnant', ['T3', 'T4U', 'TT4'])),
        ]
        # --- Fill other variables ---
        # NOTE(review): 'lithium', 'on thyroxine' and 'query on thyroxine' are
        # listed in bool_columns but are filled with 'mean', which may produce
        # non-binary values -- confirm this is intended.
        + [
            ('fill_tumor', d11.FillNanNumeric('median', 'tumor')),
            ('fill_lithium', d11.FillNanNumeric('mean', 'lithium')),
            ('fill_on_thyroxine', d11.FillNanNumeric('mean', 'on thyroxine')),
            ('fill_goitre', d11.FillNanNumeric('most_frequent', 'goitre')),
            ('fill_q_on_thyroxine', d11.FillNanNumeric('mean', 'query on thyroxine')),
        ]
        # --- Replace outliers ---
        # One QuantileReplacer per numeric column, named 'replace_<column>'.
        + [
            ('replace_' + column.lower(), d14.QuantileReplacer(column))
            for column in ('T3', 'age', 'T4U', 'TSH', 'TT4', 'FTI')
        ]
    )
)

In [6]:
# Load the two raw training tables (personal data and the remaining data).
data_personal_train, data_other_train = (
    pd.read_csv(csv_path)
    for csv_path in ("data/personal_train.csv", "data/other_train.csv")
)

In [7]:
# Fit the preprocessing pipeline on the training tables and transform them.
# The second table is passed through the pipeline's `y` argument.
data_train = ppl.fit_transform(X=data_personal_train, y=data_other_train)

In [8]:
# Preview only the first rows instead of rendering the whole frame.
data_train.head(10)

Unnamed: 0,Unnamed: 0_x,name,address,age,sex,date_of_birth,FTI measured,I131 treatment,T3,T3 measured,T4U,TSH,TT4,Unnamed: 0_y,capital-gain,capital-loss,class,education,education-num,fnlwgt,goitre,hours-per-week,lithium,marital-status,native-country,occupation,on thyroxine,pregnant,query on thyroxine,race,referral source,relationship,thyroid surgery,tumor,workclass,TSH measured,query hyperthyroid,TT4 measured,on antithyroid medication,FTI,sick,T4U measured,psych,query hypothyroid,cur_year,measured_hormones
0,0,Terry Terry,"11818 Lori Crossing Apt. 802\nPughstad, DC 78165",24.0,M,1994-05-02,0,0,0.527957,1,-0.201824,-2.083723,34.383398,1476.0,0.0,0.0,negative,HS-grad,9.0,144844.0,0,40.0,0.0,Married-civ-spouse,United-States,Craft-repair,0.0,0,0.0,White,SVHC,Husband,0,0.0,Private,1,0,1,0,43.352879,0,0,1,0,2018.0,3
1,1,Stephen Lalk,"PSC 4657, Box 5446\nAPO AP 58412",44.0,M,1974-04-29,1,0,0.279809,1,-0.060155,0.714785,29.984892,444.0,0.0,0.0,negative,Bachelors,13.0,335067.0,0,50.0,0.0,Never-married,United-States,Sales,0.0,0,0.0,White,SVHD,Not-in-family,0,0.0,Private,1,0,1,0,38.389782,1,1,0,0,2018.0,5
2,2,Abraham Bruce,Unit 9759 Box 9470\nDPO AP 45549,56.0,M,1962-06-09,0,0,0.365518,0,-0.407999,0.341517,26.105408,1658.0,0.0,0.0,negative,Prof-school,15.0,186035.0,0,30.0,0.0,Married-civ-spouse,United-States,Sales,0.0,0,0.0,White,other,Husband,0,0.0,Self-emp-not-inc,0,0,0,0,39.998206,0,0,0,0,2018.0,0
3,3,Edith Boudreaux,"137 Lewis Flat Suite 762\nWest Elizabeth, AL 3...",71.0,F,1946-12-16,1,0,0.680247,1,0.158978,0.479887,29.984892,2203.0,0.0,0.0,negative,Bachelors,13.0,36601.0,0,50.0,0.0,Never-married,United-States,Prof-specialty,0.0,0,0.0,White,SVI,Not-in-family,0,0.0,Private,1,0,1,0,33.148454,1,1,0,1,2017.0,5
4,4,Janet Washington,"995 Frank Stravenue\nSouth Matthewport, TX 81402",63.0,F,1955-10-15,1,0,0.680247,1,-0.151005,-0.393484,25.749649,2275.0,0.0,0.0,negative,HS-grad,9.0,29810.0,0,50.0,0.0,Never-married,United-States,Other-service,0.0,0,0.0,White,other,Not-in-family,0,0.0,Private,1,0,1,0,34.876007,0,1,0,0,2018.0,5
5,5,Margaret Chabot,"1781 Meredith Skyway Suite 328\nCordovaburgh, ...",63.0,F,1955-05-22,0,1,0.527957,1,-0.407999,1.116333,24.818168,1775.0,0.0,0.0,negative,Some-college,10.0,518530.0,0,40.0,0.0,Never-married,United-States,Adm-clerical,1.0,0,0.0,White,Other,Own-child,0,0.0,Private,1,0,1,0,42.368953,0,0,0,0,2018.0,3
6,6,James Wilson,"2415 Elizabeth Knoll Suite 030\nCordovafort, C...",70.0,M,1948-05-03,1,0,0.000000,1,-0.120636,1.356137,18.673102,859.0,0.0,0.0,negative,Doctorate,16.0,343721.0,0,30.0,0.0,Never-married,?,Prof-specialty,0.0,0,0.0,White,SVI,Not-in-family,0,0.0,Self-emp-not-inc,1,0,1,0,23.459589,0,1,0,0,2018.0,5
7,7,Roy Wilson,"49824 Kim View\nWest Ericborough, VT 42457",54.0,?,1964-06-07,1,0,0.753125,1,-0.010004,1.038081,26.359021,1261.0,4650.0,0.0,negative,12th,8.0,192704.0,0,50.0,0.0,Never-married,United-States,Exec-managerial,0.0,0,0.0,White,SVI,Not-in-family,0,0.0,Private,1,0,1,0,32.168984,0,1,0,0,2018.0,5
8,8,Colleen Satterwhite,"53231 Matthew Spur Apt. 079\nNorth Brian, MA 3...",48.0,F,1970-06-07,1,0,1.276393,1,0.198423,0.603298,28.281215,1913.0,0.0,0.0,negative,Some-college,10.0,153167.0,0,40.0,0.0,Never-married,United-States,?,0.0,0,0.0,Black,SVI,Own-child,0,0.0,?,1,0,1,0,30.569906,0,1,0,0,2018.0,5
9,9,Judy Smith,USNV Gallegos\nFPO AA 99743,55.0,F,1963-06-02,1,0,1.026530,1,-0.030038,0.603298,24.344118,2071.0,0.0,0.0,negative,12th,8.0,176321.0,0,40.0,0.0,Never-married,Mexico,Adm-clerical,0.0,0,0.0,White,SVI,Own-child,0,0.0,Private,1,0,1,0,29.959430,0,1,0,0,2018.0,5


In [9]:
# Load the two raw test tables (personal data and the remaining data).
data_personal_test, data_other_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/personal_test.csv", "data/other_test.csv")
)

In [10]:
# Run the preprocessing pipeline on the test tables; the second table is
# passed through the pipeline's `y` argument.
# NOTE(review): fit_transform re-fits every step on the test data, so anything
# the steps learn in fit (e.g. the 'mean'/'median' fill values) would come from
# this split rather than from the training data -- confirm this is intended.
data_test = ppl.fit_transform(X=data_personal_test, y=data_other_test)

In [11]:
# Sanity check: list the test rows whose FTI is still missing after the pipeline.
fti_missing = data_test['FTI'].isna()
data_test.loc[fti_missing]

Unnamed: 0,Unnamed: 0_x,name,address,age,sex,date_of_birth,FTI measured,I131 treatment,T3,T3 measured,T4U,TSH,TT4,Unnamed: 0_y,capital-gain,capital-loss,education,education-num,fnlwgt,goitre,hours-per-week,lithium,marital-status,native-country,occupation,on thyroxine,pregnant,query on thyroxine,race,referral source,relationship,thyroid surgery,tumor,workclass,TSH measured,query hyperthyroid,TT4 measured,on antithyroid medication,FTI,sick,T4U measured,psych,query hypothyroid,cur_year,measured_hormones


In [12]:
# Load the two raw validation tables (personal data and the remaining data).
data_personal_valid, data_other_valid = (
    pd.read_csv(csv_path)
    for csv_path in ("data/personal_valid.csv", "data/other_valid.csv")
)

In [13]:
# Run the preprocessing pipeline on the validation tables; the second table is
# passed through the pipeline's `y` argument.
# NOTE(review): fit_transform re-fits every step on the validation data, so
# anything the steps learn in fit (e.g. the 'mean'/'median' fill values) would
# come from this split rather than from the training data -- confirm this is
# intended.
data_valid = ppl.fit_transform(X=data_personal_valid, y=data_other_valid)

In [14]:
# Preview only the first rows instead of rendering the whole frame.
data_valid.head(10)

Unnamed: 0,Unnamed: 0_x,name,address,age,sex,date_of_birth,FTI measured,I131 treatment,T3,T3 measured,T4U,TSH,TT4,Unnamed: 0_y,capital-gain,capital-loss,class,education,education-num,fnlwgt,goitre,hours-per-week,lithium,marital-status,native-country,occupation,on thyroxine,pregnant,query on thyroxine,race,referral source,relationship,thyroid surgery,tumor,workclass,TSH measured,query hyperthyroid,TT4 measured,on antithyroid medication,FTI,sick,T4U measured,psych,query hypothyroid,cur_year,measured_hormones
0,0,Walter Acosta,"55866 Nicholson View\nTraviston, LA 41078",52.077098,M,2063-07-13,1,0.0,0.518665,1,-0.069735,0.475444,38.032950,514.0,0.0,0.0,negative,Bachelors,13.0,234690.0,0,40.0,0.0,Married-civ-spouse,Cuba,Adm-clerical,0.0,0,0.0,White,other,Husband,0,0.0,Private,1,0.0,1,0.0,32.562146,0.0,1,0.0,0.0,2118,5
1,1,Frances Pryor,"68243 Nathan Pines Suite 570\nChristinaberg, S...",24.307230,F,1992-04-06,1,0.0,1.492780,1,0.283847,-1.547763,39.954271,842.0,0.0,0.0,increased binding protein,Some-college,10.0,155977.0,0,54.0,0.0,Widowed,United-States,?,1.0,1,0.0,Black,other,Unmarried,0,0.0,?,1,0.0,1,0.0,27.548782,0.0,1,0.0,0.0,2018,5
2,2,Bernadette Depaolo,"876 Nancy Flats Suite 185\nLake Carolynton, TX...",75.868316,F,1936-04-17,1,0.0,0.360945,1,0.080331,0.264054,42.071281,1007.0,0.0,0.0,negative,Bachelors,,313852.0,0,40.0,0.0,Divorced,United-States,Prof-specialty,0.0,0,0.0,White,SVI,Not-in-family,0,0.0,Local-gov,1,1.0,1,0.0,32.205355,0.0,1,0.0,0.0,2018,5
3,3,Gloria Lepley,"36893 Garcia Walk Apt. 689\nBelindastad, AR 60040",60.654183,F,1953-12-20,1,0.0,0.441350,1,-0.049866,1.229446,35.073266,726.0,0.0,0.0,negative,HS-grad,9.0,225135.0,0,40.0,0.0,Never-married,United-States,Machine-op-inspct,0.0,0,0.0,White,SVHC,Unmarried,0,0.0,Private,1,0.0,1,0.0,29.832204,0.0,1,0.0,0.0,2017,5
4,4,Esther Beaty,"3805 Cheryl Meadows Apt. 731\nHarveyport, KY 3...",73.018638,F,1940-11-29,1,0.0,-0.213180,1,-0.029952,1.291912,26.043103,355.0,0.0,0.0,negative,HS-grad,9.0,129802.0,0,12.0,0.0,Widowed,United-States,?,0.0,0,0.0,White,SVI,Not-in-family,0,0.0,?,1,0.0,1,0.0,22.212346,1.0,1,0.0,1.0,2017,5
5,5,Clara Mustard,"4677 Cody Shoal\nBergertown, MD 75030",56.844029,F,1958-07-16,1,0.0,-0.213180,1,0.191813,3.038761,22.547721,272.0,0.0,0.0,negative,10th,6.0,140169.0,0,40.0,0.0,Separated,Mexico,Other-service,1.0,0,0.0,White,other,Unmarried,0,0.0,,1,0.0,1,0.0,21.332194,0.0,1,0.0,0.0,2018,5
6,6,Jesus Reichel,"9077 Mendez View\nStokesborough, VA 21063",55.891030,?,1959-09-03,1,1.0,1.054454,1,-0.029952,-1.340371,36.811657,626.0,0.0,0.0,negative,Some-college,10.0,538243.0,0,40.0,0.0,Separated,United-States,Tech-support,0.0,0,0.0,Black,other,Unmarried,0,0.0,Private,1,0.0,1,0.0,30.756471,0.0,1,0.0,0.0,2018,5
7,7,John Nichols,"484 Jeremy Dale Apt. 588\nDelgadoport, KS 97822",44.439155,?,1971-06-06,1,0.0,0.734860,1,-0.069735,-3.092159,37.302143,291.0,0.0,0.0,negative,Some-college,10.0,433624.0,0,40.0,0.0,Never-married,United-States,Adm-clerical,0.0,0,0.0,White,other,Not-in-family,0,0.0,Private,1,0.0,1,0.0,31.846442,1.0,1,0.0,0.0,2018,5
8,8,Everett Guarino,"444 David Brook\nNew Jennatown, MS 14540",50.168920,M,1965-08-14,1,0.0,0.931805,1,-0.168384,-2.786735,31.746483,134.0,7298.0,0.0,negative,Doctorate,16.0,230329.0,0,40.0,0.0,Married-civ-spouse,United-States,Prof-specialty,0.0,0,0.0,White,other,Husband,0,0.0,State-gov,1,0.0,1,1.0,29.645523,0.0,1,0.0,0.0,2018,5
9,9,Elizabeth Waters,"337 Samantha Viaduct\nMichellechester, NJ 13415",65.412949,F,1949-05-22,1,0.0,-0.213180,1,-0.138915,3.038761,22.547721,978.0,0.0,0.0,negative,Some-college,10.0,252818.0,0,40.0,0.0,Never-married,United-States,Prof-specialty,0.0,0,0.0,White,SVI,Not-in-family,0,0.0,State-gov,1,0.0,1,0.0,21.332194,0.0,1,0.0,0.0,2018,5
