# Tratamento dos Dados

In [46]:
import pandas as pd
from unidecode import unidecode

In [27]:
# Official source of the microdata, kept for reference (a local archive is read instead):
#   https://bi.static.es.gov.br/covid19/MICRODADOS.csv
url = 'MICRODADOS.zip'

# The file is parsed as ';'-separated text encoded in cp1252.
df = pd.read_csv(
    url,
    sep=';',
    encoding='cp1252',
)
df.head()

Unnamed: 0,DataNotificacao,DataCadastro,DataDiagnostico,DataColeta_RT_PCR,DataColetaTesteRapido,DataColetaSorologia,DataColetaSorologiaIGG,DataEncerramento,DataObito,Classificacao,...,FicouInternado,ViagemBrasil,ViagemInternacional,ProfissionalSaude,PossuiDeficiencia,MoradorDeRua,ResultadoRT_PCR,ResultadoTesteRapido,ResultadoSorologia,ResultadoSorologia_IGG
0,2021-04-18,2021-04-18,2021-04-14,,,,,,,Suspeito,...,Não Informado,Não,Não,Não,Não,Não,Não Informado,Não Informado,Não Informado,Não Informado
1,2021-04-18,2021-04-13,2021-04-17,,2021-04-18,,,2021-04-16,,Confirmados,...,Não Informado,Não Informado,Não Informado,Não,Não,Não,Não Informado,Positivo,Não Informado,Não Informado
2,2021-04-18,2021-04-18,2021-04-15,,,,,,,Suspeito,...,Não,Sim,Não Informado,Não,Não,Não,Não Informado,Não Informado,Não Informado,Não Informado
3,2021-04-18,2021-04-18,2021-04-13,,,,,,,Suspeito,...,Não Informado,Não Informado,Não Informado,Não,Não,Não,Não Informado,Não Informado,Não Informado,Não Informado
4,2021-04-18,2021-04-18,2021-04-13,2021-04-18,,,,,,Suspeito,...,Não,Não,Não,Ignorado,Não,Não,Não Informado,Não Informado,Não Informado,Não Informado


In [29]:
# Per-column summary statistics, transposed so that each source column is a row.
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
DataNotificacao,1276154,418,2021-03-22,9054
DataCadastro,1276154,388,2021-03-24,8566
DataDiagnostico,1276154,458,2021-03-15,7767
DataColeta_RT_PCR,642069,505,2021-03-22,5089
DataColetaTesteRapido,388160,463,2020-10-21,3977
DataColetaSorologia,77000,395,2020-07-20,963
DataColetaSorologiaIGG,119492,411,2020-07-28,2728
DataEncerramento,1074020,464,2021-04-05,8027
DataObito,11337,383,2021-04-05,85
Classificacao,1276154,3,Descartados,616524


In [30]:
# List the distinct labels of the two categorical columns used for the totals below.
classificacoes = df['Classificacao'].unique()
evolucoes = df['Evolucao'].unique()

print('Classificacão: {}'.format(classificacoes))
print('Evolução: {}'.format(evolucoes))

Classificacão: ['Suspeito' 'Confirmados' 'Descartados']
Evolução: ['-' 'Ignorado' 'Cura' 'Óbito pelo COVID-19' 'Óbito por outras causas']


In [31]:
def _percentual(parte, total):
    """Return `parte` as a percentage of `total`, rounded to one decimal place."""
    return round((parte * 100) / total, 1)


# Row counts over the full dataset: confirmed cases, COVID-19 deaths and recoveries.
total_casos = len(df[df['Classificacao'] == 'Confirmados'])
total_obitos = len(df[df['Evolucao'] == 'Óbito pelo COVID-19'])
total_curas = len(df[df['Evolucao'] == 'Cura'])

# Deaths and recoveries are reported as a share of the confirmed cases.
print('TOTAIS')
print('=================')
print('  - CASOS: {}'.format(total_casos))
print('  - ÓBITOS: {} ({}%)'.format(total_obitos, _percentual(total_obitos, total_casos)))
print('  - CURA: {} ({}%)'.format(total_curas, _percentual(total_curas, total_casos)))

TOTAIS
  - CASOS: 416932
  - ÓBITOS: 8673 (2.1%)
  - CURA: 390552 (93.7%)


In [32]:
# Parse the notification date, put the records in chronological order and keep
# only the confirmed cases. The index is rebuilt so it runs 0..n-1 over the
# remaining rows; the cumulative counts computed later rely on this ordering.
df['DataNotificacao'] = pd.to_datetime(df['DataNotificacao'])
df = (
    df.sort_values('DataNotificacao')
    .query('Classificacao == "Confirmados"')
    .reset_index(drop=True)
)

## Padroniza nome de municípios e bairros

In [33]:
def _padroniza_nome(valor):
    """Return `valor` as an upper-case string with accents stripped.

    The value is passed through `str()` first, so a missing value (NaN)
    ends up as the literal text 'NAN'.
    """
    return unidecode(str(valor)).upper()


# Normalise the two location columns so that spelling variants collapse together.
for coluna in ('Municipio', 'Bairro'):
    df[coluna] = df[coluna].map(_padroniza_nome)

## Cálculo de Confirmados

In [34]:
# Rows are already in chronological order (sorted when the confirmed cases were
# selected), so a running row count within a group is a running case count.
chaves_bairro = ['Municipio', 'Bairro']
chaves_dia = chaves_bairro + ['DataNotificacao']

# Running total of confirmed cases per (municipality, neighbourhood).
df['ConfirmadosAcumulado'] = df.groupby(chaves_bairro).cumcount() + 1

# Running count within a single day; the last row of each
# (municipality, neighbourhood, day) therefore holds that day's total.
df['Confirmados'] = df.groupby(chaves_dia).cumcount() + 1

## Cálculo de Óbitos

In [35]:
# Flag each record that ended in a COVID-19 death (1) or not (0).
df['Obitos'] = (df['Evolucao'] == 'Óbito pelo COVID-19').astype('int64')

# Running total of deaths per (municipality, neighbourhood).
# The 'Obitos' column is selected *before* cumsum so that the datetime column
# 'DataNotificacao' never takes part in the aggregation: cumsum is not defined
# for datetimes, and recent pandas versions raise instead of silently dropping
# such columns. Selecting the column also yields a Series, which is what a
# single-column assignment expects.
df['ObitosAcumulado'] = df.groupby(['Municipio', 'Bairro'])['Obitos'].cumsum()

# Running count within a single day; the last row of each
# (municipality, neighbourhood, day) holds that day's total.
df['Obitos'] = df.groupby(['Municipio', 'Bairro', 'DataNotificacao'])['Obitos'].cumsum()

## Cálculo de Curas

In [36]:
# Flag each record whose outcome is a recovery (1) or not (0).
df['Curas'] = (df['Evolucao'] == 'Cura').astype('int64')

# Running total of recoveries per (municipality, neighbourhood).
# The 'Curas' column is selected *before* cumsum so that the datetime column
# 'DataNotificacao' never takes part in the aggregation: cumsum is not defined
# for datetimes, and recent pandas versions raise instead of silently dropping
# such columns. Selecting the column also yields a Series, which is what a
# single-column assignment expects.
df['CurasAcumulado'] = df.groupby(['Municipio', 'Bairro'])['Curas'].cumsum()

# Running count within a single day; the last row of each
# (municipality, neighbourhood, day) holds that day's total.
df['Curas'] = df.groupby(['Municipio', 'Bairro', 'DataNotificacao'])['Curas'].cumsum()

## Contagem de Casos, Óbitos e Curas

In [37]:
# Column order used for the (date, municipality, neighbourhood) grid.
grupo_base = ['DataNotificacao', 'Municipio', 'Bairro']

# Distinct notification dates and distinct (municipality, neighbourhood) pairs.
# Both receive the same constant 'key' column so that merging them on it
# produces every date x location combination.
datas = (
    df[['DataNotificacao']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(key=0)
)
municipios_bairros = (
    df[['Municipio', 'Bairro']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(key=0)
)

In [38]:
# Full grid: one row per (date, municipality, neighbourhood). Both frames share
# only the constant 'key' column, so this merge is a cross join.
df_counts = pd.merge(datas, municipios_bairros, how='outer')[grupo_base]

# The per-day and cumulative counters were computed as running values, so the
# last record of each (date, municipality, neighbourhood) carries the totals.
colunas_contagem = [
    'Confirmados',
    'Obitos',
    'Curas',
    'ConfirmadosAcumulado',
    'ObitosAcumulado',
    'CurasAcumulado',
]
totais_por_dia = (
    df[['Municipio', 'Bairro', 'DataNotificacao'] + colunas_contagem]
    .drop_duplicates(grupo_base, keep='last')
    .dropna()
)

df_counts = df_counts.merge(totais_por_dia, on=grupo_base, how='left')

# Days without any record for a location have zero new cases/deaths/recoveries.
df_counts = df_counts.fillna(
    {
        'Confirmados': 0,
        'Obitos': 0,
        'Curas': 0,
    }
)

# Cumulative columns carry the previous known value forward within each
# location (rows are in date order inside every group); dates before the first
# record of a location have nothing to carry and become 0.
# GroupBy.ffill() replaces the deprecated GroupBy.fillna(method='ffill') and
# only touches the columns that are actually needed.
columns_ffill = ['ConfirmadosAcumulado', 'ObitosAcumulado', 'CurasAcumulado']

df_counts[columns_ffill] = (
    df_counts.groupby(['Municipio', 'Bairro'])[columns_ffill]
    .ffill()
    .fillna(0)
)

In [39]:
# Display the daily grid (one row per date x municipality x neighbourhood).
df_counts

Unnamed: 0,DataNotificacao,Municipio,Bairro,Confirmados,Obitos,Curas,ConfirmadosAcumulado,ObitosAcumulado,CurasAcumulado
0,2020-02-29,VILA VELHA,ITAPUA,1.0,0.0,1.0,1.0,0.0,1.0
1,2020-02-29,VILA VELHA,PRAIA DE ITAPARICA,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-02-29,VILA VELHA,PRAIA DA COSTA,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-02-29,LINHARES,INTERLAGOS,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-02-29,VITORIA,JARDIM DA PENHA,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1949295,2021-04-18,CAPIXABA,NAN,0.0,0.0,0.0,1.0,0.0,0.0
1949296,2021-04-18,JATI,NAN,0.0,0.0,0.0,1.0,0.0,0.0
1949297,2021-04-18,LARANJA DA TERRA,CORREGO DO MACHADINHO,0.0,0.0,0.0,1.0,0.0,0.0
1949298,2021-04-18,VILA VALERIO,FAZENDA CEOLIN,0.0,0.0,0.0,1.0,0.0,0.0


In [40]:
# Aggregate the daily grid into weekly bins ('W-MON' frequency) per
# (municipality, neighbourhood), then order the result by week.
columns_sum = ['Confirmados', 'Obitos', 'Curas']
df_counts_by_week = (
    df_counts
    .groupby(['Municipio', 'Bairro', pd.Grouper(key='DataNotificacao', freq='W-MON')])[columns_sum]
    .sum()
    .reset_index()
    .sort_values('DataNotificacao')
)

# Running totals per location over the weeks. Each counter column is selected
# *before* cumsum so that the datetime column 'DataNotificacao' never takes part
# in the aggregation (cumsum is not defined for datetimes and recent pandas
# versions raise instead of silently dropping such columns).
for coluna in columns_sum:
    df_counts_by_week[f'{coluna}Acumulado'] = (
        df_counts_by_week.groupby(['Municipio', 'Bairro'])[coluna].cumsum()
    )

In [41]:
# Sanity check: total of daily confirmed cases across the whole grid.
df_counts['Confirmados'].sum()

416932.0

In [42]:
# NOTE(review): this cell recomputes exactly the same weekly aggregation as the
# earlier weekly cell (In [40]); one of the two can be removed.
columns_sum = ['Confirmados', 'Obitos', 'Curas']
chaves_local = ['Municipio', 'Bairro']

# Weekly bins ('W-MON' frequency) per (municipality, neighbourhood), ordered by week.
semanal = df_counts.groupby(
    chaves_local + [pd.Grouper(key='DataNotificacao', freq='W-MON')]
)[columns_sum].sum()
df_counts_by_week = semanal.reset_index().sort_values('DataNotificacao')

# Running totals per location over the weeks. The counter column is selected
# before cumsum so the datetime column 'DataNotificacao' is never aggregated:
# cumsum is not defined for datetimes and recent pandas versions raise on it.
por_local = df_counts_by_week.groupby(chaves_local)
df_counts_by_week['ConfirmadosAcumulado'] = por_local['Confirmados'].cumsum()
df_counts_by_week['ObitosAcumulado'] = por_local['Obitos'].cumsum()
df_counts_by_week['CurasAcumulado'] = por_local['Curas'].cumsum()

In [43]:
# Display the weekly aggregation (one row per week x municipality x neighbourhood).
df_counts_by_week

Unnamed: 0,Municipio,Bairro,DataNotificacao,Confirmados,Obitos,Curas,ConfirmadosAcumulado,ObitosAcumulado,CurasAcumulado
0,ABADIA DOS DOURADOS,NAN,2020-03-02,0.0,0.0,0.0,0.0,0.0,0.0
50460,BREJETUBA,VARGEM ALTA,2020-03-02,0.0,0.0,0.0,0.0,0.0,0.0
237300,SAO LUIS,NAN,2020-03-02,0.0,0.0,0.0,0.0,0.0,0.0
50520,BREJETUBA,VARGEM GRANDE,2020-03-02,0.0,0.0,0.0,0.0,0.0,0.0
237240,SAO LEOPOLDO,NAN,2020-03-02,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
219719,SANTA MARIA DE JETIBA,ALTO JETIBA,2021-04-19,0.0,0.0,0.0,4.0,0.0,4.0
67499,CARIACICA,SOTEMA,2021-04-19,1.0,0.0,0.0,206.0,1.0,190.0
219779,SANTA MARIA DE JETIBA,ALTO RECREIO,2021-04-19,0.0,0.0,0.0,31.0,0.0,31.0
219839,SANTA MARIA DE JETIBA,ALTO RIO BONITO,2021-04-19,0.0,0.0,0.0,10.0,0.0,10.0


In [44]:
# Total confirmed cases in the second-to-last weekly bin, i.e. the latest bin
# that is not the most recent one.
datas_semanais = df_counts_by_week['DataNotificacao']

ultima_data = datas_semanais.max()
penultima_data = datas_semanais[datas_semanais != ultima_data].max()

df_counts_by_week.loc[datas_semanais == penultima_data, 'Confirmados'].sum()

10322.0

In [45]:
# Persist the weekly aggregation as a comma-separated UTF-8 file, without the index.
df_counts_by_week.to_csv(
    'microdados_pre-processed.csv',
    sep=',',
    index=False,
    encoding='UTF-8',
)