In [1]:
import os
from ftplib import FTP

import pandas as pd
from pysus.online_data import CACHEPATH
from pysus.online_data.SIA import download, show_datatypes
from pysus.utilities.readdbc import dbf_to_csvgz



In [2]:
# IPython magic selecting the inline matplotlib backend for this notebook.
# NOTE(review): no plotting call is visible in this part of the notebook — confirm it is still needed.
%matplotlib inline

## Dataset types
The SIA information system contains several types of datasets that can be downloaded with PySUS. These are:


In [3]:
# Display the SIA dataset group codes. The output maps each code (e.g. 'PA') to a
# (description, number, year) tuple.
# NOTE(review): the two numbers look like the first month/year available — confirm against PySUS docs.
show_datatypes()

{'AB': ('APAC de Cirurgia Bariátrica', 1, 2008),
 'ACF': ('APAC de Confecção de Fístula', 1, 2008),
 'AD': ('APAC de Laudos Diversos', 1, 2008),
 'AM': ('APAC de Medicamentos', 1, 2008),
 'AMP': ('APAC de Acompanhamento Multiprofissional', 1, 2008),
 'AN': ('APAC de Nefrologia', 1, 2008),
 'AQ': ('APAC de Quimioterapia', 1, 2008),
 'AR': ('APAC de Radioterapia', 1, 2008),
 'ATD': ('APAC de Tratamento Dialítico', 1, 2008),
 'BI': ('Boletim de Produção Ambulatorial individualizado', 1, 2008),
 'PA': ('Produção Ambulatorial', 7, 1994),
 'PS': ('RAAS Psicossocial', 1, 2008),
 'SAD': ('RAAS de Atenção Domiciliar', 1, 2008)}


In [None]:
# Request the 'PA' group with the positional arguments 'SP', 2020, 12
# (presumably state, year, month — confirm against the PySUS `download` signature).
# The log below shows the request being served as two files (PASP2012a / PASP2012b).
# NOTE(review): this cell has no execution count (In [None]); it may not have finished in the saved session.
dfSP = download('SP', 2020, 12, group=['PA'])

Downloading PASP2012a.dbc...
Downloading PASP2012b.dbc...


In [5]:
# Preview the first rows of the download result.
# NOTE(review): the recorded output is a NameError — the download cell above shows In [None],
# so `dfSP` was apparently never bound in this kernel session. Re-run the download cell before this one.
dfSP.head()

NameError: name 'dfSP' is not defined

When a download is split into multiple files, as in the case above, PySUS downloads the dbfs directly to the cache path without loading the dataframe into memory.

In [2]:
# Convert the first downloaded part ('PASP2012a.dbf' in the PySUS cache directory) to CSV.
# The result is read back later from 'PASP2012a.csv.gz' in the same directory.
# `os`, `CACHEPATH` and `dbf_to_csvgz` are all imported in the imports cell at the top of the
# notebook, so nothing is re-imported here.
# The recorded run took about 7 minutes for ~2.6 million rows, so expect this cell to be slow.
dbf_to_csvgz(os.path.join(CACHEPATH, 'PASP2012a.dbf'))

Converting: 2641548it [07:10, 6138.62it/s]


The number of lines in this CSV is still very large, and loading it entirely into memory is not a good idea.

But now that the data is in a CSV file, you can load just a limited number of lines from it, as shown below:

In [8]:
# Path of the gzipped CSV inside the PySUS cache directory; `fn` is reused by the chunked read below.
fn = os.path.join(CACHEPATH, 'PASP2012a.csv.gz')
# nrows=10 limits the read to the first 10 data rows, so the full file is never loaded into memory.
df = pd.read_csv(fn, nrows=10)
df

Unnamed: 0,PA_CODUNI,PA_GESTAO,PA_CONDIC,PA_UFMUN,PA_REGCT,PA_INCOUT,PA_INCURG,PA_TPUPS,PA_TIPPRE,PA_MN_IND,...,PA_CODOCO,PA_FLQT,PA_FLER,PA_ETNIA,PA_VL_CF,PA_VL_CL,PA_VL_INC,PA_SRV_C,PA_INE,PA_NAT_JUR
0,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
1,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
2,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
3,2080338,350000,EP,351880,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,121001.0,,1023
4,2080338,350000,EP,351880,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,121001.0,,1023
5,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
6,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
7,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
8,2090236,350000,EP,350550,7101,0,0,7,0,I,...,1,K,0,,0.0,0.0,0.0,120002.0,,3069
9,2090236,350000,EP,350550,7101,0,0,7,0,I,...,1,K,0,,0.0,0.0,0.0,120002.0,,3069


Another approach, if you need to analyze the entire dataset, is to process it in chunks.

In [9]:
# With `chunksize` set, read_csv returns an iterable reader instead of a DataFrame;
# each iteration yields a DataFrame of up to 1000 rows.
chunks = pd.read_csv(fn, iterator=True, chunksize=1000)

In [10]:
# Pull a single chunk from the reader for inspection.
# next() raises StopIteration if the reader is exhausted, instead of silently
# leaving `df` bound to whatever an earlier cell assigned (as a `for ... break` loop would).
df = next(iter(chunks))

df


Unnamed: 0,PA_CODUNI,PA_GESTAO,PA_CONDIC,PA_UFMUN,PA_REGCT,PA_INCOUT,PA_INCURG,PA_TPUPS,PA_TIPPRE,PA_MN_IND,...,PA_CODOCO,PA_FLQT,PA_FLER,PA_ETNIA,PA_VL_CF,PA_VL_CL,PA_VL_INC,PA_SRV_C,PA_INE,PA_NAT_JUR
0,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
1,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
2,2080273,350000,EP,354780,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,,,1023
3,2080338,350000,EP,351880,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,121001.0,,1023
4,2080338,350000,EP,351880,7101,0,0,5,0,M,...,1,K,0,,0.0,0.0,0.0,121001.0,,1023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2705982,350000,EP,351620,7101,0,0,5,0,I,...,1,R,0,,0.0,0.0,0.0,,,3069
996,2716305,350000,EP,354980,7103,0,0,80,0,M,...,1,K,0,,0.0,0.0,0.0,145003.0,,1023
997,2705982,350000,EP,351620,7101,0,0,5,0,I,...,1,K,0,,0.0,0.0,0.0,,,3069
998,2705982,350000,EP,351620,7101,0,0,5,0,I,...,1,K,0,,0.0,0.0,0.0,,,3069


Now that you have converted the `.dbf` to a `.csv.gz` file, you can safely delete the dbf file to save storage space on your computer.
