
### Integração dos Dados das bases Hypo e Hyper
 
 
As duas bases de dados (Hypo/Hyper) contêm 29 atributos, a classe e o ID de cada paciente. 
O id do paciente é repetido nas duas bases, mudando apenas a classificação.
 
Iremos definir o problema como descobrir se o paciente tem algum problema na tireoide (Hypo/Hyper) ou se é saudável. Um paciente saudável é aquele que contém "negative" em ambas as bases; caso contrário, o paciente será definido como doente.



In [1]:
import os

import numpy as np
import pandas as pd

### Lendo as bases de dados

In [2]:
# Column labels, in file order, for the header-less data files.
# Each lab value (TSH, T3, TT4, T4U, FTI, TBG) is preceded by its
# "<name>_measured" column; the last column is the label.
names = [
    "age",
    "sex",
    "on_thyroxine",
    "query_on_thyroxine",
    "on_antithyroid_medication",
    "sick",
    "pregnant",
    "thyroid_surgery",
    "I131_treatment",
    "query_hypothyroid",
    "query_hyperthyroid",
    "lithium",
    "goitre",
    "tumor",
    "hypopituitary",
    "psych",
    "TSH_measured",
    "TSH",
    "T3_measured",
    "T3",
    "TT4_measured",
    "TT4",
    "T4U_measured",
    "T4U",
    "FTI_measured",
    "FTI",
    "TBG_measured",
    "TBG",
    "referral_source",
    "target",
]

In [3]:
def read_dataset(file, columns=None):
    """
    Read one header-less, comma-separated file into a DataFrame.

    Parameters
    ----------
    file : str, path object or file-like object
        Anything accepted by ``pd.read_csv``.
    columns : list of str, optional
        Column labels to assign. When omitted, the notebook-level ``names``
        list is used, which keeps existing ``read_dataset(path)`` calls
        working unchanged.

    Returns
    -------
    pd.DataFrame
        The parsed table. Values are kept as pandas parses them; "?"
        placeholders are NOT converted to NaN here.
    """
    if columns is None:
        # Fall back to the shared column list defined earlier in the notebook.
        columns = names

    return pd.read_csv(file, sep=",", names=columns, header=None)

In [4]:
# Load the ".data" and ".test" files of each base (hypo first, then hyper).
hypo_data, hypo_test = map(
    read_dataset, ("./data/allhypo.data", "./data/allhypo.test")
)
hyper_data, hyper_test = map(
    read_dataset, ("./data/allhyper.data", "./data/allhyper.test")
)

In [5]:
# Preview the first five rows of the hypo table.
hypo_data.head(n=5)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,negative.|3733
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,negative.|1442
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,negative.|2965
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,negative.|806
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,negative.|2807


In [6]:
# Preview the first five rows of the hyper table.
hyper_data.head(n=5)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,negative.|3733
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,negative.|1442
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,negative.|2965
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,negative.|806
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,negative.|2807


##### Ajustando os valores da última coluna: `negative.|2807` -> `negative` (classe) e `2807` (id)
 

In [7]:
def apply_id_target(df):
    """
    Split the raw ``target`` column ("label.|id") into ``target`` and ``id``.

    The frame is modified in place: ``target`` keeps only the text before the
    first "." and a new integer ``id`` column receives the number found after
    the "|". Calling the function again on an already-processed frame is a
    no-op, so the cell can be re-run safely.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a string ``target`` column such as "negative.|2807".

    Returns
    -------
    None
    """
    # Already split on a previous run: the raw text is gone, so parsing it
    # again would raise IndexError.
    if "id" in df.columns:
        return

    raw_target = df["target"].copy()

    # Parse both pieces before touching the frame so a malformed value cannot
    # leave it half-updated.
    labels = raw_target.apply(lambda value: value.split(".")[0])
    ids = raw_target.apply(lambda value: int(value.split("|")[1]))

    df["target"] = labels
    df["id"] = ids
    
# Split "label.|id" in place on all four tables (same order as they are listed).
for frame in (hyper_data, hyper_test, hypo_data, hypo_test):
    apply_id_target(frame)

### Checando a integridade das bases

Checar se as duas bases têm os mesmos IDs e se, para cada ID (linha), temos os mesmos valores em cada atributo.

In [23]:
# Compare every column except the label, which is expected to differ between
# the two bases. Columns are selected by name so the check keeps working if
# the column order ever changes.
feature_columns = [col for col in hyper_data.columns if col != "target"]
# Order both tables by patient id so that row i refers to the same patient.
hyper_values = hyper_data.sort_values("id", kind="stable")[feature_columns].to_numpy()
hypo_values = hypo_data.sort_values("id", kind="stable")[feature_columns].to_numpy()
# An element-wise comparison only makes sense for equally shaped arrays.
assert hyper_values.shape == hypo_values.shape, (
    f"shape mismatch: hyper {hyper_values.shape} vs hypo {hypo_values.shape}"
)
# Now test whether every value (ids included) matches between the two bases.
test = hypo_values == hyper_values
test.all()

True

### Integrando as bases

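Agora iremos integrar as duas bases de dados, `hyper_data` e `hypo_data`, usando o `id` do paciente como chave. No resultado, `target_x` é a classe da base Hyper e `target_y` é a classe da base Hypo.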

In [26]:
# Attach the hypo label to the hyper table, matching patients by id.
# Both sides have a "target" column, so pandas suffixes them:
# target_x comes from hyper_data and target_y from hypo_data.
# validate="one_to_one" makes the merge fail loudly if an id is duplicated on
# either side instead of silently multiplying rows.
tyroid_data = pd.merge(
    hyper_data,
    hypo_data[["id", "target"]],
    on="id",
    how="inner",
    validate="one_to_one",
)
tyroid_data.head(5)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target_x,id,target_y
0,41,F,f,f,f,f,f,f,f,f,...,t,1.14,t,109,f,?,SVHC,negative,3733,negative
1,23,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,other,negative,1442,negative
2,46,M,f,f,f,f,f,f,f,f,...,t,0.91,t,120,f,?,other,negative,2965,negative
3,70,F,t,f,f,f,f,f,f,f,...,f,?,f,?,f,?,other,negative,806,negative
4,70,F,f,f,f,f,f,f,f,f,...,t,0.87,t,70,f,?,SVI,negative,2807,negative


In [27]:
# Same integration for the ".test" files: attach the hypo label to the hyper
# table by patient id. target_x comes from hyper_test, target_y from hypo_test.
# validate="one_to_one" makes the merge fail loudly if an id is duplicated on
# either side instead of silently multiplying rows.
tyroid_test = pd.merge(
    hyper_test,
    hypo_test[["id", "target"]],
    on="id",
    how="inner",
    validate="one_to_one",
)
tyroid_test.head(5)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target_x,id,target_y
0,35,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,other,negative,219,negative
1,63,M,f,f,f,f,f,f,f,f,...,t,0.96,t,113,f,?,SVI,negative,2059,negative
2,25,F,f,f,f,f,f,f,f,f,...,t,0.82,t,75,f,?,SVHD,negative,399,negative
3,53,F,f,f,f,f,f,f,f,t,...,t,1.03,t,141,f,?,other,negative,1911,negative
4,92,F,f,f,f,f,f,f,f,f,...,t,0.84,t,143,f,?,SVI,negative,487,negative


### Salvando a nova base

In [30]:
# Make sure the output folder exists so the notebook also runs on a fresh checkout.
os.makedirs("./data/preprocessados", exist_ok=True)
# index=False keeps the pandas row index out of the saved files.
tyroid_data.to_csv("./data/preprocessados/tyroid_integrado_data", index=False)
tyroid_test.to_csv("./data/preprocessados/tyroid_integrado_test", index=False)