In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, QuantileTransformer
from sklearn.impute import SimpleImputer
import numpy as np

In [5]:
# Standard-library imports used by this cell.
import os
import sys
import warnings
from datetime import datetime
from os.path import dirname

# Silence every warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')

# dirname('../src') evaluates to '..', so it is the notebook's parent
# directory that lands on the import path; that is what allows the
# `src` package below to be resolved.
sys.path.append(dirname('../src'))
from src.utils import utils

# Load the dataset stored at ../Data/feature_eng.pkl through the project
# helper (presumably returns a pandas DataFrame -- confirm in src/utils)
# and preview the first five rows (the default of head()).
data = utils.load_df("../Data/feature_eng.pkl")
data.head()

Unnamed: 0,ventana,sexo,sum_num_consultas,avg_num_consultas_by_year,dm,renal,preglucosa,num_med_preglucosa,avg_preglucosa,min_preglucosa,...,comp_aten_med_qx_no_clasif,num_med_comp_aten_med_qx_no_clasif,avg_comp_aten_med_qx_no_clasif,sec_traumatismos_envenenamiento_causas_ext,num_med_sec_traumatismos_envenenamiento_causas_ext,avg_sec_traumatismos_envenenamiento_causas_ext,ot_efec_causas_ext_comp_traumatismos,num_med_ot_efec_causas_ext_comp_traumatismos,avg_ot_efec_causas_ext_comp_traumatismos,label
0,1,M,25,12.5,0,0,1,9.0,150.0,108.0,...,0,,,0,,,0,,,0
1,2,M,24,12.0,0,0,1,4.0,158.0,109.0,...,0,,,0,,,0,,,0
2,3,M,21,10.5,0,0,1,1.0,253.0,253.0,...,0,,,0,,,0,,,0
3,4,M,20,10.0,0,0,1,4.0,155.75,80.0,...,0,,,0,,,0,,,1
4,1,M,4,2.0,0,0,0,,,,...,0,,,0,,,0,,,0


- Eliminando columnas que no aportan información (aquellas en las que todos los valores son nulos)

In [6]:
# Boolean Series indexed by column name: True where every row is null.
is_all_null = data.isna().all()

# Keep the sub-frame of those columns under its original notebook-level name.
subset_df = data.loc[:, is_all_null]

# Names of the entirely-null columns, as a plain list; shown as cell output.
col_null = subset_df.columns.tolist()
col_null

['num_med_ldl',
 'avg_ldl',
 'min_ldl',
 'max_ldl',
 'num_med_plaquetas',
 'avg_plaquetas',
 'min_plaquetas',
 'max_plaquetas',
 'num_med_tuberculosis',
 'avg_tuberculosis',
 'num_med_ot_enf_bacterianas',
 'avg_ot_enf_bacterianas',
 'num_med_inf_trans_pred_sexual',
 'avg_inf_trans_pred_sexual',
 'num_med_rickettsiosis_y_ot_enf__protozoarios',
 'avg_rickettsiosis_y_ot_enf__protozoarios',
 'num_med_tumores_malig_labio_bucal_faringe',
 'avg_tumores_malig_labio_bucal_faringe',
 'num_med_tumores_malig_organos',
 'avg_tumores_malig_organos',
 'num_med_tumores_malig_org_genitourinarios',
 'avg_tumores_malig_org_genitourinarios',
 'num_med_tumores_malig_otros_sitios_ne',
 'avg_tumores_malig_otros_sitios_ne',
 'num_med_tumores_malig_tejido_linf_org_hematop',
 'avg_tumores_malig_tejido_linf_org_hematop',
 'num_med_tumores_comp_incierto_desc',
 'avg_tumores_comp_incierto_desc',
 'num_med_enf_sangre_org_hematop',
 'avg_enf_sangre_org_hematop',
 'num_med_desnutricion_ot_deficiencias',
 'avg_desnutr

In [7]:
# Remove the entirely-null columns collected in `col_null`.
# `data` is rebound to the result instead of being mutated with
# inplace=True, and errors="ignore" skips labels that are no longer present,
# so re-running this cell does not raise KeyError for columns that an
# earlier run already dropped.
data = data.drop(columns=col_null, errors="ignore")

# Show the remaining column names as the cell output.
list(data.columns)

['ventana',
 'sexo',
 'sum_num_consultas',
 'avg_num_consultas_by_year',
 'dm',
 'renal',
 'preglucosa',
 'num_med_preglucosa',
 'avg_preglucosa',
 'min_preglucosa',
 'max_preglucosa',
 'postglucosa',
 'num_med_postglucosa',
 'avg_postglucosa',
 'min_postglucosa',
 'max_postglucosa',
 'colesterol',
 'num_med_colesterol',
 'avg_colesterol',
 'min_colesterol',
 'max_colesterol',
 'trigliceridos',
 'num_med_trigliceridos',
 'avg_trigliceridos',
 'min_trigliceridos',
 'max_trigliceridos',
 'hdl',
 'num_med_hdl',
 'avg_hdl',
 'min_hdl',
 'max_hdl',
 'ldl',
 'presion_arterial',
 'num_med_presion_a',
 'avg_sistolica_a',
 'min_sistolica_a',
 'max_sistolica_a',
 'avg_diastolica_a',
 'min_diastolica_a',
 'max_diastolica_a',
 'hba1c',
 'num_med_hba1c',
 'avg_hba1c',
 'min_hba1c',
 'max_hba1c',
 'plaquetas',
 'creatinina',
 'num_med_creatinina',
 'avg_creatinina',
 'min_creatinina',
 'max_creatinina',
 'acido_urico',
 'num_med_acido_urico',
 'avg_acido_urico',
 'min_acido_urico',
 'max_acido_urico