In [1]:
import os
import time
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict

### We have three .csv datasets, corresponding to the pandemic's three semesters: Jan-Jul 2020, Aug-Dec 2020, and Jan-Jun 2021.

In [2]:
# Loading from /data/original.
load_path = 'data' + os.sep + 'original' + os.sep
title = 'HIST_PAINEL_COVIDBR'
download_date = '08jun2021'

start = time.time()
JanJul2020_original = pd.read_csv(f'{load_path}{title}_2020_Parte1_{download_date}.csv', sep=';', low_memory=False)
AugDec2020_original = pd.read_csv(f'{load_path}{title}_2020_Parte2_{download_date}.csv', sep=',', low_memory=False)
JanJul2021_original = pd.read_csv(f'{load_path}{title}_Parte3_{download_date}.csv', sep=',', low_memory=False)
end = time.time()

print(f'Original csv import time: {end - start} secs')

Original csv import time: 6.118001699447632 secs


### Let us look at a few rows.

In [3]:
JanJul2020_original.loc[4929:4934, : ]

Unnamed: 0,regiao,estado,municipio,coduf,codmun,codRegiaoSaude,nomeRegiaoSaude,data,semanaEpi,populacaoTCU2019,casosAcumulado,casosNovos,obitosAcumulado,obitosNovos,Recuperadosnovos,emAcompanhamentoNovos,interior/metropolitana
4929,Norte,RO,Cabixi,11,110003.0,11006.0,CONE SUL,2020-07-29,31,5312.0,44,3,3,0,,,0.0
4930,Norte,RO,Cabixi,11,110003.0,11006.0,CONE SUL,2020-07-30,31,5312.0,44,0,3,0,,,0.0
4931,Norte,RO,Cabixi,11,110003.0,11006.0,CONE SUL,2020-07-31,31,5312.0,44,0,3,0,,,0.0
4932,Norte,RO,Cacoal,11,110004.0,11002.0,CAFE,2020-03-27,13,85359.0,0,0,0,0,,,0.0
4933,Norte,RO,Cacoal,11,110004.0,11002.0,CAFE,2020-03-28,13,85359.0,0,0,0,0,,,0.0
4934,Norte,RO,Cacoal,11,110004.0,11002.0,CAFE,2020-03-29,14,85359.0,0,0,0,0,,,0.0


### There are over four thousand rows corresponding to aggregate municipaliy data, corresponding to states, regions, and the country as a whole.
### We will process our dataframe in three steps. First, as we will look only at individual municipalities, we will drop aggregate data. Second, as we will look only at mortality rates, we can drop columns pertaining to other topics. Finally, we can combine total deaths ('obitosAcumulado') with total population ('populacaoTCU2019') to obtain mortality rate.
### The cell below executes the first two steps.

In [5]:
def drop_non_municipalities(df):
    return df[df['municipio'].notnull()]

def drop_columns(df):
    new_df = df.loc[ : , ['municipio', 'data', 'populacaoTCU2019', 'obitosAcumulado']]
    new_df.columns = ['City', 'Date', 'Population', 'Total deaths']
    return new_df

JanJul2020_processed = drop_columns(drop_non_municipalities(JanJul2020_original))
AugDec2020_processed = drop_columns(drop_non_municipalities(AugDec2020_original))
JanJul2021_processed = drop_columns(drop_non_municipalities(JanJul2021_original))

JanJul2020_processed.loc[4929:4934, : ]

Unnamed: 0,City,Date,Population,Total deaths
4929,Cabixi,2020-07-29,5312.0,3
4930,Cabixi,2020-07-30,5312.0,3
4931,Cabixi,2020-07-31,5312.0,3
4932,Cacoal,2020-03-27,85359.0,0
4933,Cacoal,2020-03-28,85359.0,0
4934,Cacoal,2020-03-29,85359.0,0


### The following cells execute the third step. As they are very time-consuming, they have been commented out and their result has been pickled and is accessible in /data/intermediary.

In [7]:
def get_death_rate(df):
    death_rate_series = pd.Series([0] * len(df.index))
    for index, row in df.iterrows():
        death_rate = (int(row['Total deaths']) / int(row['Population'])) * 1000000 
        death_rate_series[index] = death_rate
    df.loc[ : , 'Death rate'] = death_rate_series
    return df.loc[:, ['City', 'Date', 'Death rate']]


def process_and_save(df, filename):
    start = time.time()
    df = get_death_rate(df)
    end = time.time()
    print(f'Death rate obtainment time: {end - start} secs')
    save_path = 'data' + os.sep + 'intermediary' + os.sep
    df.to_pickle(f'{save_path}{filename}')

In [8]:
# process_and_save(JanJul2020_processed, 'JanJul2020')
# 441 secs

Death rate obtainment time: 441.23489713668823 secs


In [9]:
# process_and_save(AugDec2020_processed, 'AugDec2020')
# 1087 secs

Death rate obtainment time: 1086.5604689121246 secs


In [10]:
# process_and_save(JanJul2021_processed, 'JanJul2021')
# 868 secs

Death rate obtainment time: 867.9928705692291 secs


In [11]:
save_path = 'data' + os.sep + 'intermediary' + os.sep
JanJul2020_pickled = pd.read_pickle(f'{save_path}JanJul2020')
AugDec2020_pickled = pd.read_pickle(f'{save_path}AugDec2020')
JanJul2021_pickled = pd.read_pickle(f'{save_path}JanJul2021')
JanJul2020_pickled

Unnamed: 0,City,Date,Death rate
4551,Alta Floresta D'Oeste,2020-03-27,0.000000
4552,Alta Floresta D'Oeste,2020-03-28,0.000000
4553,Alta Floresta D'Oeste,2020-03-29,0.000000
4554,Alta Floresta D'Oeste,2020-03-30,0.000000
4555,Alta Floresta D'Oeste,2020-03-31,0.000000
...,...,...,...
714476,Brasília,2020-07-27,444.073296
714477,Brasília,2020-07-28,461.318861
714478,Brasília,2020-07-29,470.604935
714479,Brasília,2020-07-30,478.896072


In [13]:
def get_multi_index(df):
    cities = df['City'].unique()
    dates = df['Date'].unique()
    tuples = list(zip(df['City'], df['Date']))
    return pd.MultiIndex.from_tuples(tuples, names=['City', 'Date'])

start = time.time()
JanJul2020_multi_index = get_multi_index(JanJul2020_pickled)
AugDec2020_multi_index = get_multi_index(AugDec2020_pickled)
JanJul2021_multi_index = get_multi_index(JanJul2021_pickled)
end = time.time()
print(f'MultiIndex obtainment time: {end - start} secs')

MultiIndex obtainment time: 2.113995313644409 secs


In [14]:
def df_constructor(df, multi_index):
    return pd.DataFrame(data=np.array(df['Death rate']), index=multi_index, columns=['Death rate'])

start = time.time()
JanJul2020 = df_constructor(JanJul2020_pickled, JanJul2020_multi_index)
AugDec2020 = df_constructor(AugDec2020_pickled, AugDec2020_multi_index)
JanJul2021 = df_constructor(JanJul2021_pickled, JanJul2021_multi_index)
end = time.time()
print(f'MultiIndex DataFrame obtainment time: {end - start} secs')

MultiIndex DataFrame obtainment time: 0.014006853103637695 secs


In [15]:
JanJul2020

Unnamed: 0_level_0,Unnamed: 1_level_0,Death rate
City,Date,Unnamed: 2_level_1
Alta Floresta D'Oeste,2020-03-27,0.000000
Alta Floresta D'Oeste,2020-03-28,0.000000
Alta Floresta D'Oeste,2020-03-29,0.000000
Alta Floresta D'Oeste,2020-03-30,0.000000
Alta Floresta D'Oeste,2020-03-31,0.000000
...,...,...
Brasília,2020-07-27,444.073296
Brasília,2020-07-28,461.318861
Brasília,2020-07-29,470.604935
Brasília,2020-07-30,478.896072


In [17]:
def get_value(df, place, date):
    return df.loc[place].loc[date]['Death rate']

get_value(JanJul2020, 'Brasília', '2020-07-30')

478.8960715929729

In [19]:
save_path = 'data' + os.sep + 'final' + os.sep
JanJul2020.to_pickle(f'{save_path}JanJul2020')
AugDec2020.to_pickle(f'{save_path}AugDec2020')
JanJul2021.to_pickle(f'{save_path}JanJul2021')