In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

#### - Read raw CSVs

In [2]:
## Read all the Brazilian CSVs and store them in a dictionary

YEAR = "2020", "2021", "2022"
SEMESTER = "1", "2"

def get_br_data(years=None, semesters=None, data_dir='data/raw/brazil_raw'):
    """Load the per-semester Brazilian CSV files into a dict of DataFrames.

    Parameters
    ----------
    years : iterable of str, optional
        Years to load. Defaults to the module-level ``YEAR``.
    semesters : iterable of str, optional
        Semesters to load for every year. Defaults to the module-level ``SEMESTER``.
    data_dir : str, optional
        Directory holding the ``COVIDBR_{semester}_SEM_{year}.csv`` files.

    Returns
    -------
    dict
        Maps ``"{year}_{semester}"`` (e.g. ``"2020_1"``) to the DataFrame read
        from the matching file, in year-then-semester order.
    """
    if years is None:
        years = YEAR
    if semesters is None:
        semesters = SEMESTER

    data_br = {}
    for year in years:
        for semester in semesters:
            csv_path = f"{data_dir}/COVIDBR_{semester}_SEM_{year}.csv"
            data_br[f"{year}_{semester}"] = pd.read_csv(
                csv_path,
                sep=';',
                parse_dates=["data"],
                dayfirst=True,
                # Infer each column's dtype from the whole file instead of chunk by
                # chunk, so a column cannot end up holding mixed types.
                low_memory=False,
            )
    return data_br

In [3]:
# Load every semester file; the returned dict is keyed "<year>_<semester>" (e.g. "2020_1").
data_br = get_br_data()

  data_br = get_br_data()
  data_br = get_br_data()


####  - Population data treatment

In [4]:
# Brazil population data treatment: stack the six semester frames in chronological order
df_pop_teste = pd.concat(
    [
        data_br[period]
        for period in ('2020_1', '2020_2', '2021_1', '2021_2', '2022_1', '2022_2')
    ]
)

In [5]:
# Removing unwanted columns, then every row that still contains a NaN.
# NOTE(review): the municipality identifiers ('municipio', 'codmun') are dropped
# before dropna(), so missing values in them cannot filter rows here — confirm the
# raw files only carry the rows this population table is meant to keep.
df_pop_teste = (
    df_pop_teste
    .drop(
        columns=[
            'regiao', 'municipio', 'coduf', 'codmun', 'codRegiaoSaude',
            'nomeRegiaoSaude', 'semanaEpi', 'Recuperadosnovos',
            'emAcompanhamentoNovos', 'interior/metropolitana', 'data',
            'casosNovos', 'casosAcumulado', 'obitosNovos', 'obitosAcumulado',
        ]
    )
    .dropna()
)

In [6]:
# Remove duplicated rows, keeping the first occurrence of each
df_pop_teste = df_pop_teste.drop_duplicates(keep='first')

In [7]:
# Create the 'area' column as a copy of the state code column
df_pop_teste = df_pop_teste.assign(area=df_pop_teste['estado'])

In [8]:
# Lookup table: two-letter Brazilian state code -> full state name.
# Used with DataFrame.replace to turn codes into names; values not listed here are left as they are.
REPLACE_AREA_BR = {
    'AC': 'Acre',
    'AL': 'Alagoas',
    'AP': 'Amapá',
    'AM': 'Amazonas',
    'BA': 'Bahia',
    'CE': 'Ceará',
    'DF': 'Distrito Federal',
    'ES': 'Espírito Santo',
    'GO': 'Goiás',
    'MA': 'Maranhão',
    'MT': 'Mato Grosso',
    'MS': 'Mato Grosso do Sul',
    'MG': 'Minas Gerais',
    'PA': 'Pará',
    'PB': 'Paraíba',
    'PR': 'Paraná',
    'PE': 'Pernambuco',
    'PI': 'Piauí',
    'RJ': 'Rio de Janeiro',
    'RN': 'Rio Grande do Norte',
    'RS': 'Rio Grande do Sul',
    'RO': 'Rondônia',
    'RR': 'Roraima',
    'SC': 'Santa Catarina',
    'SP': 'São Paulo',
    'SE': 'Sergipe',
    'TO': 'Tocantins'
}

In [9]:
# Replace the state codes in 'area' with the full state names
df_pop_teste = df_pop_teste.assign(
    area=df_pop_teste['area'].replace(REPLACE_AREA_BR)
)

In [10]:
# Tag every row with the constant group label 'brazil'
df_pop_teste = df_pop_teste.assign(group='brazil')

In [11]:
# Reshaping dataframe: pick the four output columns and give them their final names
df_popu_bra = {
    target: df_pop_teste[source]
    for target, source in (
        ('group', 'group'),
        ('area', 'area'),
        ('code', 'estado'),
        ('population', 'populacaoTCU2019'),
    )
}
df_br_popu = pd.DataFrame(df_popu_bra)

In [55]:
# Load the existing population table (USA and world data)
all_pop = pd.read_csv(filepath_or_buffer='data/population.csv')

In [57]:
# Merge with brazil population data.
# all_pop is read from data/population.csv and the merged frame is written back to
# that same file, so running the notebook again would append the Brazilian rows a
# second time. Dropping exact duplicate rows keeps the merge repeatable.
all_pop_df = pd.concat([all_pop, df_br_popu], ignore_index=True).drop_duplicates()

In [62]:
# Write the combined population table (now including Brazil) back to disk, without the index
all_pop_df.to_csv(path_or_buf='data/population.csv', index=False)

#### Creating brazil_cases csv

In [36]:
## Creating dataframe
# Calls get_br_data() again, so this section works on its own dict of raw semester frames.

data_brasil = get_br_data()

  data_brasil = get_br_data()
  data_brasil = get_br_data()


In [37]:
# remove unwanted columns
def remove_na_cases(df):
    """Drop the columns the cumulative-cases table does not use, then incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw semester frame; must contain every column listed in ``unwanted``
        (``DataFrame.drop`` raises ``KeyError`` otherwise).

    Returns
    -------
    pandas.DataFrame
        New frame without the unwanted columns and without any row that has a
        missing value in the remaining ones. The input is not modified.
    """
    # Each label appears once; the original list repeated 'populacaoTCU2019'.
    unwanted = [
        'regiao', 'municipio', 'coduf', 'codmun', 'codRegiaoSaude',
        'nomeRegiaoSaude', 'semanaEpi', 'Recuperadosnovos',
        'emAcompanhamentoNovos', 'interior/metropolitana',
        'populacaoTCU2019', 'casosNovos', 'obitosAcumulado', 'obitosNovos',
    ]
    return df.drop(columns=unwanted).dropna()

In [38]:
# Expand state codes, drop the 'Brasil' rows and parse the dates
def get_columns(df, area_map=None):
    """Replace codes in the first column, filter out 'Brasil' rows, parse 'data'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the area codes and which has a 'data' column.
    area_map : dict, optional
        Mapping applied to the first column with ``DataFrame.replace``.
        Defaults to the module-level ``REPLACE_AREA_BR``.

    Returns
    -------
    pandas.DataFrame
        New frame with the first column mapped, rows whose first column equals
        'Brasil' removed, and 'data' converted with ``pd.to_datetime``
        (format ``'%d-%m-%Y'``). The input frame is left untouched.
    """
    if area_map is None:
        area_map = REPLACE_AREA_BR

    first_col = df.columns[0]
    renamed = df.replace({first_col: area_map})

    # Take an explicit copy of the filtered rows: assigning a column into a
    # boolean-filtered frame triggers SettingWithCopyWarning on pandas < 3.
    not_country_total = renamed[first_col] != 'Brasil'
    states_only = renamed.loc[not_country_total].copy()

    # NOTE(review): get_br_data() already passes parse_dates=["data"]; if the column
    # arrives as datetime this conversion presumably leaves it unchanged — confirm.
    states_only['data'] = pd.to_datetime(states_only['data'], format='%d-%m-%Y')
    return states_only

In [39]:
## Pivoting and index reset

def df_cleaning_cases(df):
    """Reshape long-format rows into one row per date with one column per state.

    Pivots 'casosAcumulado' by 'estado' x 'data', transposes so the dates become
    rows, moves the dates into a regular column named 'Province_State' and
    converts that column to strings.
    """
    by_state = df.pivot(index='estado', columns='data', values='casosAcumulado')
    by_date = by_state.transpose().reset_index()
    by_date = by_date.rename(columns={'data': 'Province_State'})
    return by_date.astype({'Province_State': str})

In [40]:
# Creating clean brazil dataframe: run every semester frame through the cases pipeline

databr_clean_c = {
    period: (
        raw_df
        .pipe(remove_na_cases)
        .pipe(get_columns)
        .pipe(df_cleaning_cases)
    )
    for period, raw_df in data_brasil.items()
}

In [41]:
## Concat clean dataframes in chronological order
df_clean_c = pd.concat(
    [
        databr_clean_c[period]
        for period in ('2020_1', '2020_2', '2021_1', '2021_2', '2022_1', '2022_2')
    ]
)

In [42]:
## Transposing: swap rows and columns
df_clean_c = df_clean_c.transpose()

In [43]:
# Index reset: move the current index into a regular column
df_clean_c = df_clean_c.reset_index(drop=False)

In [44]:
# header / index reshaping: promote the first row to column labels

new_header = df_clean_c.iloc[0]  # first row carries the header values
df_clean_c = df_clean_c.iloc[1:].set_axis(new_header, axis=1)  # remaining rows, relabelled

In [45]:
# Rename column 'estado' to 'Province_State' to match the usa/world data

df_clean_c = df_clean_c.rename({'estado': 'Province_State'}, axis='columns')

In [46]:
## Remove the name attached to the column index

df_clean_c.columns.name = None

In [None]:
## Save to csv alongside the USA and World data (index not written)

df_clean_c.to_csv(path_or_buf='data/raw/brazil_cases.csv', index=False)

#### Creating brazil_deaths csv

In [None]:
# remove unwanted columns
def remove_na_deaths(df):
    """Drop the columns the cumulative-deaths table does not use, then incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw semester frame; must contain every column listed in ``unwanted``
        (``DataFrame.drop`` raises ``KeyError`` otherwise).

    Returns
    -------
    pandas.DataFrame
        New frame without the unwanted columns and without any row that has a
        missing value in the remaining ones. The input is not modified.
    """
    # Each label appears once; the original list repeated 'populacaoTCU2019'.
    unwanted = [
        'regiao', 'municipio', 'coduf', 'codmun', 'codRegiaoSaude',
        'nomeRegiaoSaude', 'semanaEpi', 'Recuperadosnovos',
        'emAcompanhamentoNovos', 'interior/metropolitana',
        'populacaoTCU2019', 'casosNovos', 'casosAcumulado', 'obitosNovos',
    ]
    return df.drop(columns=unwanted).dropna()

In [None]:
## Pivoting and index reset

def df_cleaning_deaths(df):
    """Reshape long-format rows into one row per date with one column per state.

    Pivots 'obitosAcumulado' by 'estado' x 'data', transposes so the dates become
    rows, moves the dates into a regular column named 'Province_State' and
    converts that column to strings.
    """
    by_state = df.pivot(index='estado', columns='data', values='obitosAcumulado')
    by_date = by_state.transpose().reset_index()
    by_date = by_date.rename(columns={'data': 'Province_State'})
    return by_date.astype({'Province_State': str})

In [None]:
# Creating clean brazil dataframe: run every semester frame through the deaths pipeline

databr_clean_d = {
    period: (
        raw_df
        .pipe(remove_na_deaths)
        .pipe(get_columns)
        .pipe(df_cleaning_deaths)
    )
    for period, raw_df in data_brasil.items()
}

In [None]:
## Concat clean dataframes in chronological order
df_clean_d = pd.concat(
    [
        databr_clean_d[period]
        for period in ('2020_1', '2020_2', '2021_1', '2021_2', '2022_1', '2022_2')
    ]
)

In [None]:
## Transposing: swap rows and columns
df_clean_d = df_clean_d.transpose()

In [None]:
# Index reset: move the current index into a regular column
df_clean_d = df_clean_d.reset_index(drop=False)

In [None]:
# header / index reshaping: promote the first row to column labels

new_header = df_clean_d.iloc[0]  # first row carries the header values
df_clean_d = df_clean_d.iloc[1:].set_axis(new_header, axis=1)  # remaining rows, relabelled

In [None]:
# Rename column 'estado' to 'Province_State' to match the usa/world data

df_clean_d = df_clean_d.rename({'estado': 'Province_State'}, axis='columns')

In [None]:
## Remove the name attached to the column index

df_clean_d.columns.name = None

In [None]:
## Save to csv alongside the USA and World data (index not written)

df_clean_d.to_csv(path_or_buf='data/raw/brazil_deaths.csv', index=False)