# Uploading population density data

We create a table containing the information from IBGE on:
- **Estimated population by municipio**: https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=o-que-e
- **Area by municipio**: https://www.ibge.gov.br/geociencias/organizacao-do-territorio/estrutura-territorial/15761-areas-dos-municipios.html?=&t=o-que-e

With this information we can also calculate the population density by municipio. This density can be attributed to each census tract based on the municipio they are on, where the relationship `census_tract:municipality` is `n:1`.

In [1]:
# Imports
import pandas as pd
import numpy as np
import psycopg2, os
from tqdm import tqdm
import matplotlib.pyplot as plt

#!pip install pyshp
import shapefile

# Establish connection and create its cursor
try: 
    conn = psycopg2.connect(f"host={os.environ['AURORA_POSTGRES_HOST']} dbname={os.environ['AURORA_POSTGRES_DATABASE']} user={os.environ['AURORA_POSTGRES_USERNAME']} password={os.environ['AURORA_POSTGRES_PWD']}")
    cur = conn.cursor()
except psycopg2.Error as e: 
    print("Error: Could not make connection to the Postgres database")
    print(e)
    

---

## 1. Read population data


In [2]:
population_data = pd.read_excel('data/estimated_population_density_ibge/POP2021_20211029.xls', 
                                sheet_name='Municípios', header=None)
population_data.columns = population_data.loc[1]
population_data = population_data.drop([0,1]).dropna().reset_index().drop(columns='index').rename(columns={'UF': 'uf',
                                                                                                           'COD. UF': 'cod_uf',
                                                                                                           'COD. MUNIC': 'cod_munic',
                                                                                                           'NOME DO MUNICÍPIO': 'nome_do_municipio',
                                                                                                           'POPULAÇÃO ESTIMADA': 'populacao_estimada_2021'})
population_data['cod_uf'] = population_data['cod_uf'].astype(str)
population_data


1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
0,RO,11,00015,Alta Floresta D'Oeste,22516
1,RO,11,00023,Ariquemes,111148
2,RO,11,00031,Cabixi,5067
3,RO,11,00049,Cacoal,86416
4,RO,11,00056,Cerejeiras,16088
...,...,...,...,...,...
5565,GO,52,22005,Vianópolis,14088
5566,GO,52,22054,Vicentinópolis,9002
5567,GO,52,22203,Vila Boa,6451
5568,GO,52,22302,Vila Propício,5941


Check some specific example, consistent with https://cidades.ibge.gov.br/brasil/sp/sao-paulo/panorama

In [3]:
population_data[(population_data['cod_uf']=="35") & (population_data["cod_munic"]=="50308")]

1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
3829,SP,35,50308,São Paulo,12396372


Fix some footnoted values presented as strings

In [4]:
population_data[population_data.populacao_estimada_2021.apply(lambda x: type(x)==str)]

1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
16,RO,11,205,Porto Velho,548.952(1)
83,AM,13,607,Benjamin Constant,44.873(2)
88,AM,13,839,Caapiranga,13.482(3)
98,AM,13,1654,Guajará,17.193(4)
108,AM,13,2405,Lábrea,47.685(5)
110,AM,13,2553,Manaquiri,33.981(6)
119,AM,13,3403,Parintins,116.439(7)
123,AM,13,3601,Santa Isabel do Rio Negro,26.566(8)
129,AM,13,4062,Tabatinga,68.502(9)
134,AM,13,4302,Urucará,16.007(10)


In [5]:
population_data['populacao_estimada_2021'] = population_data.populacao_estimada_2021.apply(lambda x: int(x.split("(")[0].replace(".", "")) if type(x)==str else x).astype(int)
population_data.loc[[16,83,88]]


1,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021
16,RO,11,205,Porto Velho,548952
83,AM,13,607,Benjamin Constant,44873
88,AM,13,839,Caapiranga,13482


---

## 2. Read area by municipio data



In [6]:
area_data = pd.read_excel('data/estimated_population_density_ibge/AR_BR_RG_UF_RGINT_RGIM_MES_MIC_MUN_2020.xls', 
                          sheet_name='AR_BR_MUN_2020', header=None)
area_data.columns = area_data.loc[0]

columns = {'NM_UF_SIGLA': 'uf',
           'CD_GCUF' :'cod_uf',
           'CD_GCMUN':'cod_munic',
           'NM_MUN_2020': 'nome_do_municipio',
           'AR_MUN_2020': 'area_estimada_2020'}
area_data = area_data.drop([0]).dropna().reset_index().rename(columns=columns)[list(columns.values())]
area_data


Unnamed: 0,uf,cod_uf,cod_munic,nome_do_municipio,area_estimada_2020
0,RO,11,1100015,ALTA FLORESTA D'OESTE,7067.127
1,RO,11,1100023,ARIQUEMES,4426.571
2,RO,11,1100031,CABIXI,1314.352
3,RO,11,1100049,CACOAL,3793
4,RO,11,1100056,CEREJEIRAS,2783.3
...,...,...,...,...,...
5567,GO,52,5222005,VIANÓPOLIS,954.284
5568,GO,52,5222054,VICENTINÓPOLIS,737.255
5569,GO,52,5222203,VILA BOA,1060.172
5570,GO,52,5222302,VILA PROPÍCIO,2181.583


The two first digits of the `cod_munic` should be `cod_uf` so we can strip it

In [7]:
## print('Cases where cod_munic first two digits is not cod_uf:', (area_data.cod_uf != area_data.cod_munic.str[:2]).sum())
area_data['cod_munic'] = area_data.cod_munic.str[-5:]


Transform area data to the right format

In [8]:
area_data['area_estimada_2020'] = area_data.area_estimada_2020.astype(float)

---

## 3. Merge data and calculate population density

### 3.1. Merge the data
We merge on `['cod_uf, cod_munic`]

In [9]:
data = population_data.merge(area_data, on=['cod_uf', 'cod_munic'], how='outer')
data

Unnamed: 0,uf_x,cod_uf,cod_munic,nome_do_municipio_x,populacao_estimada_2021,uf_y,nome_do_municipio_y,area_estimada_2020
0,RO,11,00015,Alta Floresta D'Oeste,22516.0,RO,ALTA FLORESTA D'OESTE,7067.127
1,RO,11,00023,Ariquemes,111148.0,RO,ARIQUEMES,4426.571
2,RO,11,00031,Cabixi,5067.0,RO,CABIXI,1314.352
3,RO,11,00049,Cacoal,86416.0,RO,CACOAL,3793.000
4,RO,11,00056,Cerejeiras,16088.0,RO,CEREJEIRAS,2783.300
...,...,...,...,...,...,...,...,...
5567,GO,52,22203,Vila Boa,6451.0,GO,VILA BOA,1060.172
5568,GO,52,22302,Vila Propício,5941.0,GO,VILA PROPÍCIO,2181.583
5569,DF,53,00108,Brasília,3094325.0,DF,BRASÍLIA,5760.784
5570,,43,00001,,,RS,LAGOA MIRIM,2872.364


We have two municipalities with no estimated population. We drop them

In [10]:
print('Municipalities with population data but no area data:')
display(set(population_data.apply(lambda row: row['cod_uf']+row['cod_munic'], axis=1)) - set(area_data.apply(lambda row: row['cod_uf']+row['cod_munic'], axis=1)))
print('Municipalities with area data but no population data:')
display(set(area_data.apply(lambda row: row['cod_uf']+row['cod_munic'], axis=1)) - set(population_data.apply(lambda row: row['cod_uf']+row['cod_munic'], axis=1)))

display(data.shape[0])
data.dropna(inplace=True)
display(data.shape[0])


Municipalities with population data but no area data:


set()

Municipalities with area data but no population data:


{'4300001', '4300002'}

5572

5570

### 3.2. Quick sanity checks and cleanups


In [19]:
print('Any inconsistencies in UF name?:', bool((data['uf_x']!=data['uf_y']).sum()))
print('Any differences in municipality name?:')
display(data[(data['nome_do_municipio_x'].str.lower()!=data['nome_do_municipio_y'].str.lower())])

# Drop and rename
data = data.drop(columns=['uf_y', 'nome_do_municipio_y']).rename(columns={'uf_x': 'uf',
                                                                          'nome_do_municipio_x': 'nome_do_municipio'})
data


Any inconsistencies in UF name?: False
Any differences in municipality name?:


Unnamed: 0,uf_x,cod_uf,cod_munic,nome_do_municipio_x,populacao_estimada_2021,uf_y,nome_do_municipio_y,area_estimada_2020
4408,SC,42,6108,Grão Pará,6621.0,SC,GRÃO-PARÁ,334.362


Unnamed: 0,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021,area_estimada_2020
0,RO,11,00015,Alta Floresta D'Oeste,22516.0,7067.127
1,RO,11,00023,Ariquemes,111148.0,4426.571
2,RO,11,00031,Cabixi,5067.0,1314.352
3,RO,11,00049,Cacoal,86416.0,3793.000
4,RO,11,00056,Cerejeiras,16088.0,2783.300
...,...,...,...,...,...,...
5565,GO,52,22005,Vianópolis,14088.0,954.284
5566,GO,52,22054,Vicentinópolis,9002.0,737.255
5567,GO,52,22203,Vila Boa,6451.0,1060.172
5568,GO,52,22302,Vila Propício,5941.0,2181.583


### 3.3. Calculate population density


In [21]:
data['estimated_population_density'] = data['populacao_estimada_2021'].div(data['area_estimada_2020'])

### 3.4. Set data types

In [23]:
data_types = {'uf': {'pandas': str, 'postgres': 'VARCHAR(2)'},
              'cod_uf': {'pandas': str, 'postgres': 'VARCHAR(2)'},
              'cod_munic': {'pandas': str, 'postgres': 'VARCHAR(5)'},
              'nome_do_municipio': {'pandas': str, 'postgres': 'VARCHAR'},
              'populacao_estimada_2021': {'pandas': int, 'postgres': 'INTEGER'},
              'area_estimada_2020': {'pandas': float, 'postgres': 'NUMERIC'},
              'estimated_population_density': {'pandas': float, 'postgres': 'NUMERIC'}}

for col in data.columns:
    data[col] = data[col].astype(data_types[col]['pandas'])
    
data.head()

Unnamed: 0,uf,cod_uf,cod_munic,nome_do_municipio,populacao_estimada_2021,area_estimada_2020,estimated_population_density
0,RO,11,15,Alta Floresta D'Oeste,22516,7067.127,3.186019
1,RO,11,23,Ariquemes,111148,4426.571,25.109278
2,RO,11,31,Cabixi,5067,1314.352,3.855132
3,RO,11,49,Cacoal,86416,3793.0,22.783021
4,RO,11,56,Cerejeiras,16088,2783.3,5.780189


---
## 4. Postgres table and populate

Create the table

In [25]:
# Create schema if it doesn't exist
cur.execute("CREATE SCHEMA IF NOT EXISTS ibge")
conn.commit()

# Now create the table if it doesn't exist
create_statement = f"CREATE TABLE IF NOT EXISTS ibge.population_by_municipality ({', '.join([col+' '+data_types[col]['postgres'] for col in data.columns])})"
print(create_statement)
cur.execute(create_statement)
conn.commit()

# Truncate table
cur.execute("TRUNCATE TABLE ibge.population_by_municipality")
conn.commit()



CREATE TABLE IF NOT EXISTS ibge.population_by_municipality (uf VARCHAR(2), cod_uf VARCHAR(2), cod_munic VARCHAR(5), nome_do_municipio VARCHAR, populacao_estimada_2021 INTEGER, area_estimada_2020 NUMERIC, estimated_population_density NUMERIC)


Insert the records

In [None]:
# Insert the records
for i, record in tqdm(data.iterrows()):
    insert_statement = f"""INSERT INTO ibge.population_by_municipality ({', '.join(record.index)}) VALUES ({', '.join(['%s']*len(record))})"""
    cur.execute(insert_statement,
                tuple([v for k,v in record.items()])
               )
    
conn.commit()


87it [00:13,  7.42it/s]