In [1]:
import polars as pl

In [None]:
# Lista de colunas de datas para converter
date_cols = ['DT_NOTIFIC', 'DT_SIN_PRI', 'DT_INVEST', 'DT_ALRM', 'DT_GRAV']

# Lista de Sintomas e Comorbidades (Campos que geralmente são 1=Sim, 2=Não)
sintomas_cols = [
    'FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA', 'VOMITO', 'NAUSEA', 
    'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE', 'ARTRALGIA', 'PETEQUIA_N', 
    'LEUCOPENIA', 'DOR_RETRO'
]

comorbidades_cols = [
    'DIABETES', 'HEMATOLOG', 'HEPATOPAT', 'RENAL', 'HIPERTENSA', 
    'ACIDO_PEPT', 'AUTO_IMUNE'
]

sinais_alarme_cols = [
    'ALRM_HIPOT', 'ALRM_PLAQ', 'ALRM_VOM', 'ALRM_SANG', 'ALRM_HEMAT', 
    'ALRM_ABDOM', 'ALRM_LETAR', 'ALRM_HEPAT', 'ALRM_LIQ'
]

cols_binarias = [
    'ALRM_HIPOT', 'ALRM_PLAQ', 'ALRM_VOM', 'ALRM_SANG', 'ALRM_HEMAT', 
    'ALRM_ABDOM', 'ALRM_LETAR', 'ALRM_HEPAT', 'ALRM_LIQ',
    'GRAV_PULSO', 'GRAV_CONV', 'GRAV_ENCH', 'GRAV_INSUF', 'GRAV_TAQUI',
    'GRAV_EXTRE', 'GRAV_HIPOT', 'GRAV_HEMAT', 'GRAV_MELEN', 'GRAV_METRO',
    'GRAV_SANG', 'GRAV_AST', 'GRAV_MIOC', 'GRAV_CONSC', 'GRAV_ORGAO'
]



In [3]:
df = pl.read_csv("./data/dengue_tratado.csv", null_values=['', ' ', 'nan'], ignore_errors=True)
df.shape

(9752880, 85)

In [4]:
cols_existentes = [c for c in cols_binarias  if c in df.columns]

In [None]:
df[].head()

In [None]:
df_clean = df.with_columns([
    pl.col(col).str.to_date(format="%Y%m%d", strict=False).alias(col)
    for col in df.columns if col.startswith("DT_")
]).with_columns([
    # Regra do SINAN: 
    # Começa com 4: Anos (4030 = 30 anos)
    # Começa com 3: Meses (3005 = 5 meses -> 0 anos)
    # Começa com 2: Dias  (2010 = 10 dias -> 0 anos)
    pl.col("NU_IDADE_N").cast(pl.Int32, strict=False),
    pl.when(pl.col("NU_IDADE_N") >= 4000)
    .then(pl.col("NU_IDADE_N") - 4000)
    .when(pl.col("NU_IDADE_N") >= 3000)
    .then(0)
    .when(pl.col("NU_IDADE_N") >= 2000)
    .then(0)
    .otherwise(None)
    .alias("IDADE_ANOS")
]).with_columns([
    pl.col("IDADE_ANOS").cut(
        breaks=[10, 20, 40, 60, 80],
        labels=["0-10", "11-20", "21-40", "41-60", "61-80", "80+"]
    ).cast(pl.Utf8).alias("FAIXA_ETARIA"),
    
    (pl.col("DT_INVEST") - pl.col("DT_SIN_PRI")).dt.total_days().fill_null(0).clip(lower_bound=0).alias("DIAS_SINTOMA_INVESTIGACAO"),
    (pl.col("DT_INVEST") - pl.col("DT_ALRM")).dt.total_days().fill_null(0).clip(lower_bound=0).alias("DIAS_ALARME_INVESTIGACAO"),
    (pl.col("DT_INVEST") - pl.col("DT_GRAV")).dt.total_days().fill_null(0).clip(lower_bound=0).alias("DIAS_GRAVE_INVESTIGACAO"),
    
    pl.when(pl.col("CS_GESTANT").is_in([1, 2, 3, 4]))
      .then(1)
      .otherwise(0)
      .alias("IS_GESTANTE"),
    pl.col("CS_SEXO").cast(pl.Categorical)
   
]).with_columns([
    pl.when(pl.col(c).cast(pl.Int32, strict=False) == 1).then(1).otherwise(0).alias(f"BIN_{c}")
    for c in cols_existentes
])

In [6]:
if 'df' in globals():
    del df

df_final = df_clean.with_columns([
    pl.when(pl.col("EVOLUCAO").is_in([2, 3]))
    .then(1)
    .otherwise(0)
    .alias("TARGET_OBITO")
])

if 'df_clean' in globals():
    del df_clean
  

In [None]:
  
cols_to_keep = [
    "TARGET_OBITO", "IDADE_ANOS", "FAIXA_ETARIA", "DIAS_SINTOMA_INVESTIGACAO", "DIAS_ALRME_INVESTIGACAO", "DIAS_GRAVE_INVESTIGACAO", "CS_SEXO", "SG_UF", "IS_GESTANTE"
] + [f"BIN_{c}" for c in cols_binarias  if c in df_final.columns] + sintomas_cols + comorbidades_cols

df_model = df_final.select(cols_to_keep)

if 'df_final' in globals():
    del df_final

In [9]:
df_model.tail(10)

TARGET_OBITO,IDADE_ANOS,FAIXA_ETARIA,DIAS_SINTOMA_INVESTIGACAO,DIAS_ALRME_INVESTIGACAO,DIAS_GRAVE_INVESTIGACAO,CS_SEXO,SG_UF,IS_GESTANTE,BIN_ALRM_HIPOT,BIN_ALRM_PLAQ,BIN_ALRM_VOM,BIN_ALRM_SANG,BIN_ALRM_HEMAT,BIN_ALRM_ABDOM,BIN_ALRM_LETAR,BIN_ALRM_HEPAT,BIN_ALRM_LIQ,BIN_GRAV_PULSO,BIN_GRAV_CONV,BIN_GRAV_ENCH,BIN_GRAV_INSUF,BIN_GRAV_TAQUI,BIN_GRAV_EXTRE,BIN_GRAV_HIPOT,BIN_GRAV_HEMAT,BIN_GRAV_MELEN,BIN_GRAV_METRO,BIN_GRAV_SANG,BIN_GRAV_AST,BIN_GRAV_MIOC,BIN_GRAV_CONSC,BIN_GRAV_ORGAO,ALRM_HIPOT,ALRM_PLAQ,ALRM_VOM,ALRM_SANG,ALRM_HEMAT,ALRM_ABDOM,ALRM_LETAR,ALRM_HEPAT,ALRM_LIQ,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE
i32,i64,str,i64,i64,i64,cat,i64,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64
0,19,"""11-20""",0,0,0,"""M""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,39,"""21-40""",0,0,0,"""M""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,45,"""41-60""",0,0,0,"""F""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,40,"""21-40""",0,0,0,"""M""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,33,"""21-40""",0,0,0,"""F""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,53,"""41-60""",0,0,0,"""M""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,1,2,2
0,9,"""0-10""",0,0,0,"""M""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,53,"""41-60""",0,0,0,"""F""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,1,"""0-10""",0,0,0,"""M""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",2,2,2,2,2,2,2
0,32,"""21-40""",0,0,0,"""M""",35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",1,2,2,2,2,2,2
