# Uploading population density data

Estimated population density data is distributed by municipality by the IBGE:
- Link to data: https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=o-que-e

In this notebook we onboard 2021 data by municipality, which we can later map to census tracts - being `census_tract:municipality` a `n:1` relationship

In [1]:
# Imports
import pandas as pd
import numpy as np
import psycopg2, os
from tqdm import tqdm
import matplotlib.pyplot as plt

#!pip install pyshp
import shapefile

# Establish connection and create its cursor
try: 
    conn = psycopg2.connect(f"host={os.environ['AURORA_POSTGRES_HOST']} dbname={os.environ['AURORA_POSTGRES_DATABASE']} user={os.environ['AURORA_POSTGRES_USERNAME']} password={os.environ['AURORA_POSTGRES_PWD']}")
    cur = conn.cursor()
except psycopg2.Error as e: 
    print("Error: Could not make connection to the Postgres database")
    print(e)
    

---

## 1. Read input data


In [2]:
data = pd.read_excel('data/estimated_population_density_ibge/POP2021_20211029.xls', 
                     sheet_name='Municípios', header=None)
data.columns = data.loc[1]
data = data.drop([0,1]).dropna().reset_index().drop(columns='index').rename(columns={'UF': 'uf',
                                                                                     'COD. UF': 'cod_uf',
                                                                                     'COD. MUNIC': 'cod_munic',
                                                                                     'NOME DO MUNICÍPIO': 'nome_do_municipio',
                                                                                     'POPULAÇÃO ESTIMADA': 'populacao_estimada_2021'})
data


1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
0,RO,11,00015,Alta Floresta D'Oeste,22516
1,RO,11,00023,Ariquemes,111148
2,RO,11,00031,Cabixi,5067
3,RO,11,00049,Cacoal,86416
4,RO,11,00056,Cerejeiras,16088
...,...,...,...,...,...
5565,GO,52,22005,Vianópolis,14088
5566,GO,52,22054,Vicentinópolis,9002
5567,GO,52,22203,Vila Boa,6451
5568,GO,52,22302,Vila Propício,5941


Check some specific example, consistent with https://cidades.ibge.gov.br/brasil/sp/sao-paulo/panorama

In [3]:
data[(data['cod_uf']==35) & (data["cod_munic"]=="50308")]

1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
3829,SP,35,50308,São Paulo,12396372


Fix some footnoted values presented as strings

In [4]:
data[data.populacao_estimada_2021.apply(lambda x: type(x)==str)]

1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
16,RO,11,205,Porto Velho,548.952(1)
83,AM,13,607,Benjamin Constant,44.873(2)
88,AM,13,839,Caapiranga,13.482(3)
98,AM,13,1654,Guajará,17.193(4)
108,AM,13,2405,Lábrea,47.685(5)
110,AM,13,2553,Manaquiri,33.981(6)
119,AM,13,3403,Parintins,116.439(7)
123,AM,13,3601,Santa Isabel do Rio Negro,26.566(8)
129,AM,13,4062,Tabatinga,68.502(9)
134,AM,13,4302,Urucará,16.007(10)


In [5]:
data['populacao_estimada_2021'] = data.populacao_estimada_2021.apply(lambda x: int(x.split("(")[0].replace(".", "")) if type(x)==str else x).astype(int)
data.loc[[16,83,88]]

1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
16,RO,11,205,Porto Velho,548952
83,AM,13,607,Benjamin Constant,44873
88,AM,13,839,Caapiranga,13482


---
## 2. Postgres table and populate

Create the table

In [8]:
# Create schema if it doesn't exist
cur.execute("CREATE SCHEMA IF NOT EXISTS ibge")
conn.commit()

# Now create the table if it doesn't exist
create_statement = f"CREATE TABLE IF NOT EXISTS ibge.population_by_municipality_2021 ({', '.join([col+' VARCHAR' if col!='populacao_estimada_2021' else col+' INTEGER' for col in data.columns])})"
print(create_statement)
cur.execute(create_statement)
conn.commit()

# Truncate table
cur.execute("TRUNCATE TABLE ibge.population_by_municipality_2021")
conn.commit()



CREATE TABLE IF NOT EXISTS ibge.population_by_municipality_2021 (uf VARCHAR, cod_uf VARCHAR, cod_munic VARCHAR, nome_do_municipio VARCHAR, populacao_estimada_2021 INTEGER)


Insert the records

In [9]:
# Insert the records
for i, record in tqdm(data.iterrows()):
    insert_statement = f"""INSERT INTO ibge.population_by_municipality_2021 ({', '.join(record.index)}) VALUES ({', '.join(['%s']*len(record))})"""
    cur.execute(insert_statement,
                tuple([v for k,v in record.items()])
               )
    
conn.commit()


5570it [11:05,  8.37it/s]
