# Downloading data from the SINAN database

In [8]:
import os
from glob import glob

import pandas as pd
from pysus.online_data import SINAN, FTP_Inspect, parquets_to_dataframe

SINAN is a database of reported cases of certain diseases that Brazilian law requires to be reported. Unfortunately, the data available for free download corresponds only to the investigated cases, not the totality of the reported cases. Nevertheless, it's an interesting dataset.

To find out which diseases these are, we can use PySUS:

In [2]:
# Show the disease names that PySUS exposes for the SINAN database
SINAN.list_diseases()

['Animais Peçonhentos',
 'Botulismo',
 'Cancer',
 'Chagas',
 'Chikungunya',
 'Colera',
 'Coqueluche',
 'Contact Communicable Disease',
 'Acidentes de Trabalho',
 'Dengue',
 'Difteria',
 'Esquistossomose',
 'Febre Amarela',
 'Febre Maculosa',
 'Febre Tifoide',
 'Hanseniase',
 'Hantavirose',
 'Hepatites Virais',
 'Intoxicação Exógena',
 'Leishmaniose Visceral',
 'Leptospirose',
 'Leishmaniose Tegumentar',
 'Malaria',
 'Meningite',
 'Peste',
 'Poliomielite',
 'Raiva Humana',
 'Sífilis Adquirida',
 'Sífilis Congênita',
 'Sífilis em Gestante',
 'Tétano Acidental',
 'Tétano Neonatal',
 'Tuberculose',
 'Violência Domestica',
 'Zika']

These diseases are available in countrywide tables, so if we want to see the cases of `Chagas` disease in the state of Minas Gerais, first we can check which years are available:

In [3]:
# Years for which a Chagas table can be requested (returned as strings)
SINAN.get_available_years('Chagas')

['2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020']

We can also check when the data for each disease was last updated, and whether the table is preliminary or final.

In [4]:
# Build the table of SINAN files (folder, date, file size and file name per row)
sinan_inspector = FTP_Inspect('SINAN')
lu = sinan_inspector.last_update_df()
lu

Unnamed: 0,folder,date,file_size,file_name
0,/dissemin/publicos/SINAN/DADOS/FINAIS,2023-01-16 14:15:00,28326,ACBIBR06.dbc
1,/dissemin/publicos/SINAN/DADOS/FINAIS,2023-01-16 14:15:00,673314,ACBIBR07.dbc
2,/dissemin/publicos/SINAN/DADOS/FINAIS,2023-01-16 14:15:00,1048406,ACBIBR08.dbc
3,/dissemin/publicos/SINAN/DADOS/FINAIS,2023-01-16 14:15:00,1493392,ACBIBR09.dbc
4,/dissemin/publicos/SINAN/DADOS/FINAIS,2023-01-16 14:15:00,1632311,ACBIBR10.dbc
...,...,...,...,...
728,/dissemin/publicos/SINAN/DADOS/PRELIM,2023-03-09 16:37:00,169214,VARCBR21.dbc
729,/dissemin/publicos/SINAN/DADOS/PRELIM,2023-03-09 16:37:00,169214,VARCBR22.dbc
730,/dissemin/publicos/SINAN/DADOS/PRELIM,2021-10-15 11:37:00,24793234,VIOLBR20.dbc
731,/dissemin/publicos/SINAN/DADOS/PRELIM,2021-10-15 11:37:00,16021135,VIOLBR21.dbc


In [5]:
# Keep only the rows whose file name starts with the 'CHAG' prefix
lu.loc[lu['file_name'].str.startswith('CHAG')]

Unnamed: 0,folder,date,file_size,file_name
77,/dissemin/publicos/SINAN/DADOS/FINAIS,2022-03-28 11:18:00,41075,CHAGBR00.dbc
78,/dissemin/publicos/SINAN/DADOS/FINAIS,2022-03-28 11:18:00,47675,CHAGBR01.dbc
79,/dissemin/publicos/SINAN/DADOS/FINAIS,2022-03-28 11:18:00,69415,CHAGBR02.dbc
80,/dissemin/publicos/SINAN/DADOS/FINAIS,2022-03-28 11:18:00,90539,CHAGBR03.dbc
81,/dissemin/publicos/SINAN/DADOS/FINAIS,2022-03-28 11:18:00,86820,CHAGBR04.dbc
82,/dissemin/publicos/SINAN/DADOS/FINAIS,2022-03-28 11:18:00,223289,CHAGBR05.dbc
83,/dissemin/publicos/SINAN/DADOS/FINAIS,2022-03-28 11:18:00,135953,CHAGBR06.dbc
84,/dissemin/publicos/SINAN/DADOS/FINAIS,2021-11-23 12:21:00,11660,CHAGBR07.dbc
85,/dissemin/publicos/SINAN/DADOS/FINAIS,2021-11-23 12:21:00,11004,CHAGBR08.dbc
86,/dissemin/publicos/SINAN/DADOS/FINAIS,2021-11-23 12:21:00,17913,CHAGBR09.dbc


We can see that we have data in final form from 2000 until 2019, and preliminary data for 2020. Now we can download it:

In [9]:
# Download the 2019 Chagas table, then load the result into a single dataframe
chagas_2019_parquets = SINAN.download('Chagas', 2019)
df = parquets_to_dataframe(chagas_2019_parquets)
df

Unnamed: 0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,...,DT_OBITO,CON_PROVAV,CON_OUTRA,CON_LOCAL,TPAUTOCTO,COUFINF,COPAISINF,COMUNINF,DOENCA_TRA,DT_ENCERRA
0,2,B571,2019-04-10,201915,2019,16,160030,,2019639,2019-03-01,...,,5,,2,1,16,1,160030,2,20190513
1,2,B571,2019-09-16,201938,2019,16,160030,,2022192,2019-08-18,...,,5,,2,2,16,1,160060,2,20191002
2,2,B571,2019-03-07,201910,2019,16,160030,,2022192,2019-02-28,...,,5,,2,1,16,1,160030,,20190325
3,2,B571,2019-10-22,201943,2019,16,160030,,2020653,2019-09-09,...,,5,,2,1,16,1,160030,2,20191107
4,2,B571,2019-09-10,201937,2019,16,160060,,2020971,2019-08-28,...,,5,,2,1,16,1,160060,2,20191108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4477,2,B571,2019-09-05,201936,2019,26,260120,1501,5740592,2019-09-04,...,,,,,,,0,,,20191022
4478,2,B571,2019-09-26,201939,2019,26,261390,1506,2348489,2019-08-26,...,,2,,2,1,26,1,261390,2,20191008
4479,2,B571,2019-01-17,201903,2019,26,260120,1501,2711443,2019-01-05,...,,,,,,,0,,,20190212
4480,2,B571,2019-07-03,201927,2019,26,260820,1498,5276403,2019-07-03,...,,,,,,,0,,,20190903


## Downloading large files
Some SINAN files can be quite large and can take a bit longer to download and convert. Because the default behavior is to download the data in chunks, some folders may contain many parquet chunks.

In [13]:
# Download the 2020 dengue table; `fn` is later used as the directory that holds the downloaded chunks
fn = SINAN.download('Dengue', 2020)

The dengue cases were downloaded as multiple chunks into the directory whose path is stored in `fn`.

In [15]:
# Count the entries in the download directory (`os` is imported in the first cell)
len(os.listdir(fn))

50

In [18]:
# Load every chunk found under `fn` into one dataframe and report its row count
df2 = parquets_to_dataframe(fn)
df2.shape[0]

1495117

### Reading the chunks manually
The parquet chunks can also be read one by one with pandas and concatenated into a single dataframe:

In [None]:
# Read every parquet chunk under `fn` with pandas and combine them in one
# `pd.concat` call; concatenating inside a loop re-copies the accumulated
# frame on every iteration.
# The paths are sorted because glob returns them in no guaranteed order,
# and a fixed order keeps the resulting row order reproducible.
# If no chunk is found, `pd.concat` raises ValueError instead of silently
# leaving a stale `df2` from an earlier cell.
chunk_paths = sorted(glob(f"{fn}/*.parquet"))
df2 = pd.concat(
    (pd.read_parquet(chunk_path) for chunk_path in chunk_paths),
    ignore_index=True,
)
df2

Unnamed: 0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,...,LACO_N,PLASMATICO,EVIDENCIA,PLAQ_MENOR,CON_FHD,COMPLICA,TP_SISTEMA,NDUPLIC_N,CS_FLXRET,FLXRECEBI
0,2,A90,2020-06-03,,2020,50,500660,1972,5870178,2020-06-01,...,,,,,,,2,,0,
1,2,A90,2020-04-02,,2020,50,500660,1972,2651610,2020-03-31,...,,,,,,,2,,0,
2,2,A90,2020-05-31,,2020,50,500660,1972,2651610,2020-05-30,...,,,,,,,2,,0,
3,2,A90,2020-09-05,,2020,50,500660,1972,2651610,2020-08-29,...,,,,,,,2,,0,
4,2,A90,2020-04-25,,2020,50,500660,1972,5870178,2020-04-24,...,,,,,,,2,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495112,2,A90,2020-01-28,,2020,32,320530,1510,2675110,2020-01-27,...,,,,,,,2,,0,
1495113,2,A90,2020-02-20,,2020,32,320530,1510,0012173,2020-02-18,...,,,,,,,2,,0,
1495114,2,A90,2020-03-02,,2020,32,320530,1510,0012173,2020-02-23,...,,,,,,,2,,0,
1495115,2,A90,2020-02-17,,2020,32,320530,1510,0028177,2020-02-05,...,,,,,,,2,,0,


## Decoding the age in SINAN tables
In SINAN the age comes encoded. PySUS can decode the age column `NU_IDADE_N` into any of these units: years, months, days, or hours.

In [19]:
from pysus.preprocessing.decoders import decodifica_idade_SINAN
# The trailing `?` is IPython help syntax: it displays the decoder's signature and docstring
decodifica_idade_SINAN?

[0;31mSignature:[0m       [0mdecodifica_idade_SINAN[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            vectorize
[0;31mString form:[0m     <numpy.vectorize object at 0x7fe908e44b20>
[0;31mFile:[0m            ~/micromamba/envs/pysus/lib/python3.9/site-packages/numpy/__init__.py
[0;31mDocstring:[0m      
Em tabelas do SINAN frequentemente a idade é representada como um inteiro que precisa ser parseado
para retornar a idade em uma unidade cronológica padrão.
:param unidade: unidade da idade: 'Y': anos, 'M' meses, 'D': dias, 'H': horas
:param idade: inteiro ou sequencia de inteiros codificados.
:return:
[0;31mClass docstring:[0m
vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False,
          signature=None)

Generalized function class.

Define a vectorized function which takes a nested sequence of objects or
numpy arrays as inputs and returns a single numpy array or a tu

We can easily convert dates and numerical fields in the dataframe:

In [20]:
# Columns are selected by prefix: 'DT_' columns are parsed as dates, 'ID_' columns as numbers.
date_columns = [name for name in df.columns if name.startswith('DT_')]
id_columns = [name for name in df.columns if name.startswith('ID_')]

# Values that cannot be parsed as dates become NaT instead of raising.
for name in date_columns:
    df[name] = pd.to_datetime(df[name], errors='coerce')

# An 'ID_' column that fails numeric conversion is left unchanged.
for name in id_columns:
    try:
        df[name] = pd.to_numeric(df[name])
    except ValueError:
        pass

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4482 entries, 0 to 4481
Data columns (total 99 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   TP_NOT      4482 non-null   string        
 1   ID_AGRAVO   4482 non-null   string        
 2   DT_NOTIFIC  4482 non-null   datetime64[ns]
 3   SEM_NOT     4482 non-null   string        
 4   NU_ANO      4482 non-null   string        
 5   SG_UF_NOT   4482 non-null   string        
 6   ID_MUNICIP  4482 non-null   int64         
 7   ID_REGIONA  3633 non-null   float64       
 8   ID_UNIDADE  4482 non-null   int64         
 9   DT_SIN_PRI  4482 non-null   datetime64[ns]
 10  SEM_PRI     4482 non-null   string        
 11  DT_NASC     4447 non-null   datetime64[ns]
 12  NU_IDADE_N  4482 non-null   string        
 13  CS_SEXO     4482 non-null   string        
 14  CS_GESTANT  4482 non-null   string        
 15  CS_RACA     4482 non-null   string        
 16  CS_ESCOL_N  4482 non-nul

Let's convert the age to years and save it on a different column.

In [21]:
# Decode the encoded age into years ('Y') and store it in a new column
df['idade_anos'] = decodifica_idade_SINAN(df['NU_IDADE_N'], 'Y')
# Show the encoded and decoded ages side by side
df.loc[:, ['NU_IDADE_N', 'idade_anos']]

Unnamed: 0,NU_IDADE_N,idade_anos
0,4013,13.0
1,4054,54.0
2,4031,31.0
3,4063,63.0
4,4036,36.0
...,...,...
4477,4079,79.0
4478,4063,63.0
4479,4039,39.0
4480,4036,36.0


## Saving the modified data
We can save our dataframe in any format we wish, so that we do not have to redo this analysis next time. If we want to keep only the data from the state of Minas Gerais, we need to filter the table using the UF code `31`.

In [22]:
# SG_UF_NOT is loaded as text; cast it so it can be compared with the integer UF code.
df['SG_UF_NOT'] = df.SG_UF_NOT.astype(int)
# Keep only UF 31 (Minas Gerais) and write a ';'-separated CSV inside a zip archive.
# The file name now matches the content (Chagas, MG, 2019; it previously said SP/2018)
# and carries a .zip extension because the output is a zip archive, not a plain CSV.
df[df.SG_UF_NOT == 31].to_csv(
    'chagas_MG_2019_mod.csv.zip',
    sep=';',
    compression={'method': 'zip', 'archive_name': 'chagas_MG_2019_mod.csv'},
)