# Predikcia 

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
from sklearn.pipeline import Pipeline

In [2]:
# Lift pandas' display limits: no fixed output width and no cap on shown columns.
for option_name in ("display.width", "display.max_columns"):
    pd.set_option(option_name, None)

In [3]:
from Models import CombineDatasets as d1
from Models import DecodeJSONColumn as d2
from Models import ColumnDropper as d3
from Models import BinaryEncoder as d4
from Models import ClassFixer as d5
from Models import FixDates as d6
from Models import ComputeCurYear as d7
from Models import Replacer as d8
from Models import FillMeasuredColumns as d9
from Models import CountMeasuredHormones as d10
from Models import FillNanNumeric as d11
from Models import PredictNanNumeric as d12
from Models import Classifier as d13
from Models import QuantileReplacer as d14
from Models import Fitter as d15

In [4]:
# Column names handed to the BinaryEncoder step of the transformation pipeline.
# The order of the entries is kept exactly as originally written.
bool_columns = [
    "FTI measured",
    "I131 treatment",
    "T3 measured",
    "goitre",
    "lithium",
    "on thyroxine",
    "pregnant",
    "tumor",
    "query on thyroxine",
    "thyroid surgery",
    "TSH measured",
    "query hyperthyroid",
    "TT4 measured",
    "on antithyroid medication",
    "sick",
    "T4U measured",
    "psych",
    "query hypothyroid",
]

# Zadefinovanie pipelinov, pomocou ktorych budeme spracovavat data

In [5]:
# Transformation pipeline: applies the transformations to the supplied datasets.
transf_ppl = Pipeline(steps=[
    # Dataset combination and decoding of the JSON column.
    ("combination", d1.CombineDatasets()),
    ("json_decode", d2.DecodeJSONColumn("medical_info")),
    # Column removal, binary encoding and class / date clean-up.
    ("drop_columns", d3.ColumnDropper(["hypopituitary", "TBG measured", "TBG"])),
    ("binary_fixer", d4.BinaryEncoder(bool_columns)),
    ("class_fix", d5.ClassFixer()),
    ("fix_dates", d6.FixDates()),
    ("compute_cur_year", d7.ComputeCurYear()),
    # Four Replacer passes over FTI. The step names suggest they turn '?'-style
    # placeholders into NaN; the last two pass np.nan itself as the value to
    # replace, once with str and once with float as the final argument.
    ("fti_question_mark_to_nan2", d8.Replacer(["FTI"], "?.4", float("nan"), str)),
    ("fti_question_mark_to_nan", d8.Replacer(["FTI"], "?", float("nan"), str)),
    ("fti_question_mark_to_nan1", d8.Replacer(["FTI"], np.nan, float("nan"), str)),
    ("fti_question_mark_to_nan3", d8.Replacer(["FTI"], np.nan, float("nan"), float)),
    # Steps working with the "... measured" columns.
    ("fill_measured_columns", d9.FillMeasuredColumns()),
    ("count_measured", d10.CountMeasuredHormones("measured_hormones")),
])
    
# Pipeline that fills in missing values and then replaces outliers.
pred_ppl = Pipeline(steps=[
    # Fill hormones.
    ("fitter", d15.Fitter()),

    # Disabled per-column imputation steps, kept for reference
    # (presumably superseded by the Fitter step above — TODO confirm):
#     ('fill_tt4', d11.FillNanNumeric('mean', 'TT4')),
#     ('fill_tsh', d11.FillNanNumeric('median', 'TSH')),
#     ('predicted_t3', d12.PredictNanNumeric('T3', ['TT4', 'TSH'])),
#     ('predicted_t4u', d12.PredictNanNumeric('T4U', ['TT4', 'TSH'])),
#     ('predicted_fti', d12.PredictNanNumeric('FTI', ['TT4', 'TSH', 'T4U'])),
#     # Fill pregnant
#     ('pregnant_predict', d13.Classifier('pregnant', ['T3', 'T4U', 'TT4'])),
#     # Fill other variables
#     ('fill_tumor', d11.FillNanNumeric('median', 'tumor')),
#     ('fill_lithium', d11.FillNanNumeric('mean', 'lithium')),
#     ('fill_on_thyroxine', d11.FillNanNumeric('mean', 'on thyroxine')),
#     ('fill_goitre', d11.FillNanNumeric('most_frequent', 'goitre')),
#     ('fill_q_on_thyroxine', d11.FillNanNumeric('mean', 'query on thyroxine')),

    # Replace outliers, one QuantileReplacer per numeric column.
    ("replace_t3", d14.QuantileReplacer("T3")),
    ("replace_age", d14.QuantileReplacer("age")),
    ("replace_t4u", d14.QuantileReplacer("T4U")),
    ("replace_tsh", d14.QuantileReplacer("TSH")),
    ("replace_tt4", d14.QuantileReplacer("TT4")),
    ("replace_fti", d14.QuantileReplacer("FTI")),
])

## Nacitanie trenovacich dat a ich spracovanie

In [6]:
# Read the two raw training tables ("personal" and "other") from the data/ folder.
data_personal_train, data_other_train = (
    pd.read_csv(csv_path)
    for csv_path in ("data/personal_train.csv", "data/other_train.csv")
)

In [7]:
# Fit the transformation pipeline on the training tables. The second table is
# passed in the `y` position of Pipeline.fit. fit() returns the pipeline itself,
# so `ppl` refers to the same object as `transf_ppl`.
ppl = transf_ppl.fit(data_personal_train, data_other_train)
# NOTE(review): transform(None) presumably works because the first step hands back
# the data it captured during fit — confirm against CombineDatasets.
data_train = ppl.transform(None)

In [8]:
# Run the missing-value / outlier pipeline on the transformed training data,
# passing the same object as both X and y.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: tuple indices must be integers or slices, not str", so ppl2 was not
# assigned in the saved run. The cause is not visible from this notebook.
ppl2 = pred_ppl.fit_transform(data_train, data_train)

test- 2237
test- 2237
fit
2
2237


TypeError: tuple indices must be integers or slices, not str

## Nacitanie testovacich dat a ich spracovanie

In [9]:
# Read the two raw test tables ("personal" and "other") from the data/ folder.
data_personal_test, data_other_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/personal_test.csv", "data/other_test.csv")
)

In [10]:
# Re-fit the transformation pipeline on the test tables and transform them in one
# call; the second table again travels in the `y` position.
data_test = ppl.fit_transform(data_personal_test, data_other_test)

In [11]:
# Fit the missing-value / outlier pipeline with the training data as X and the
# test data as y.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: tuple indices must be integers or slices, not str", so `lol` was not
# assigned in the saved run. The cause is not visible from this notebook.
# NOTE(review): `lol` is a non-descriptive name — consider renaming it once all
# of its usages have been checked.
lol = pred_ppl.fit(data_train, data_test)
# Disabled follow-up call, kept for reference:
#sss = lol.transform(None)

test- 2237
test- 564
fit
2
564


TypeError: tuple indices must be integers or slices, not str

In [None]:
# Display the size of the transformed test data.
len(data_test)

## Nacitanie a spracovanie validacnych dat

In [None]:
# Read the two raw validation tables ("personal" and "other") from the data/ folder.
data_personal_valid, data_other_valid = (
    pd.read_csv(csv_path)
    for csv_path in ("data/personal_valid.csv", "data/other_valid.csv")
)

In [None]:
# Re-fit the transformation pipeline on the validation tables and transform them;
# the second table is passed in the `y` position.
data_valid = ppl.fit_transform(data_personal_valid, data_other_valid)