In [1]:
# Imports
import pandas as pd
import numpy as np
import psycopg2, os

# Establish connection and create its cursor
try: 
    conn = psycopg2.connect(f"host={os.environ['AURORA_POSTGRES_HOST']} dbname={os.environ['AURORA_POSTGRES_DATABASE']} user={os.environ['AURORA_POSTGRES_USERNAME']} password={os.environ['AURORA_POSTGRES_PWD']}")
    cur = conn.cursor()
except psycopg2.Error as e: 
    print("Error: Could not make connection to the Postgres database")
    print(e)
    

---
## 1. Restoring the entire IBGE Postgres DB

Follow below process to restore the data in Postgres:</br>

1. **Download dump**: Download file [here](https://geoftp.ibge.gov.br/cartas_e_mapas/bases_cartograficas_continuas/bc250/versao2019/postgis/)</br></br>
2. **Install `postgresql`**: If you don't have it installed you can run `brew install postgresql`</br></br>
3. **Restore the dump**_ Restore the data by running `pg_restore --host $AURORA_POSTGRES_HOST --port 5432 --username $AURORA_POSTGRES_USERNAME --dbname $AURORA_POSTGRES_DATABASE --no-owner --password --verbose "bc250_2019-10-29.tar"` (assuming you have the env variables configured)</br>

**After this process you will have the schema `bc250_2019` created at the `justos` DB.** Let's check tables created in the schema

In [10]:
cur.execute("SELECT * FROM information_schema.tables WHERE table_schema = 'bc250_2019'")
pd.DataFrame(cur.fetchall(), columns=[desc[0] for desc in cur.description])


Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,justos,bc250_2019,aer_pista_ponto_pouso_l,BASE TABLE,,,,,,YES,NO,
1,justos,bc250_2019,aer_pista_ponto_pouso_p,BASE TABLE,,,,,,YES,NO,
2,justos,bc250_2019,cbge_cemiterio_p,BASE TABLE,,,,,,YES,NO,
3,justos,bc250_2019,dut_trecho_duto_l,BASE TABLE,,,,,,YES,NO,
4,justos,bc250_2019,eco_ext_mineral_a,BASE TABLE,,,,,,YES,NO,
...,...,...,...,...,...,...,...,...,...,...,...,...
67,justos,bc250_2019,tra_ponte_p,BASE TABLE,,,,,,YES,NO,
68,justos,bc250_2019,tra_travessia_l,BASE TABLE,,,,,,YES,NO,
69,justos,bc250_2019,tra_travessia_p,BASE TABLE,,,,,,YES,NO,
70,justos,bc250_2019,tra_tunel_l,BASE TABLE,,,,,,YES,NO,


The [official documentation](https://geoftp.ibge.gov.br/cartas_e_mapas/bases_cartograficas_continuas/bc250/versao2019/informacoes_tecnicas/Documentacao_bc250_v2019.pdf) of this data dump can be used to understand the schema we have just restored.

---

## 2. Onboarding census tracts data

Alternatively, we have found that the shapefiles with the geometry of each census track can be downloaded [here](https://www.ibge.gov.br/geociencias/organizacao-do-territorio/malhas-territoriais/26565-malhas-de-setores-censitarios-divisoes-intramunicipais.html?=&t=o-que-e).

We download the following data (use the menu on the left side to access it):
- **2010**: Only Sao Paulo, given that files are individual
- **2017**: All Brazil
- **2019**: All Brazil
- **2020**: All Brazil

In [6]:
#!pip install pyshp
import shapefile
import numpy as np
import pandas as pd


### 2.1. Load the data

#### CENSUS TRACKS DATA

In [3]:
# 2010 is only Sao Paulo
shapes_data_2010 = shapefile.Reader("data/sp_setores_censitarios_2010/35SEE250GC_SIR.shp", encoding='latin-1')
# The rest of years all come in one single file
shapes_data_2017 = shapefile.Reader("data/br_setores_censitarios_2017/BR_SETORES_2017_CENSOAGRO.shp", encoding='latin-1')
shapes_data_2019 = shapefile.Reader("data/BR_Setores_2019_20200807/BR_Setores_2019.shp")
shapes_data_2020 = shapefile.Reader("data/BR_Setores_2020/BR_Setores_2020.shp")


Extract the records associated with the shapefiles, from which we can get for example tract codes

In [4]:
code_tracts_2010 = [x[1] for x in shapes_data_2010.iterRecords()]
code_tracts_2017 = [x[0] for x in shapes_data_2017.iterRecords()]
code_tracts_2019 = [x[0] for x in shapes_data_2019.iterRecords()]
code_tracts_2020 = [x[0] for x in shapes_data_2020.iterRecords()]

display(len(code_tracts_2010))
display(len(code_tracts_2017))
display(len(code_tracts_2019))
display(len(code_tracts_2020))


68296

371673

444261

449880

No duplicates in any of the files

In [7]:
np.max([pd.Series(x).value_counts().max() for x in [code_tracts_2010,
                                                    code_tracts_2017,
                                                    code_tracts_2019,
                                                    code_tracts_2020]])


1

Let's get all unique code tracts

In [8]:
all_code_tracts = set(code_tracts_2020).union(set(code_tracts_2019)).union(set(code_tracts_2017)).union(set(code_tracts_2010))
len(all_code_tracts)


531059

#### INFO FROM `susep.geo_info`
Get the geo info

In [9]:
cur.execute("SELECT * FROM susep.geo_info")
geo_info = pd.DataFrame(cur.fetchall(), columns=[desc[0] for desc in cur.description])
geo_info.head()


Unnamed: 0,Id,CEP,code_tract,address_count,lat,lon,reg_susep,cep_inicial,cidade,cep_final,cod_reg,reg_decirc,cep_ini,cep_fim
0,1,76954000,110001520000004,12060,-12.04704,-62.051736,39.0,76740,FAINA,76999,48.0,DEMAIS REGI�ES DE GOI�S,76740,76999
1,777,76873868,110002305000060,11,-9.900819,-63.026831,39.0,76740,FAINA,76999,48.0,DEMAIS REGI�ES DE GOI�S,76740,76999
2,2,76955000,110007205000011,76,-12.983041,-60.952011,39.0,76740,FAINA,76999,48.0,DEMAIS REGI�ES DE GOI�S,76740,76999
3,1553,76965748,110004905000051,8,-11.436342,-61.436585,39.0,76740,FAINA,76999,48.0,DEMAIS REGI�ES DE GOI�S,76740,76999
4,3,76594000,110001505000018,9,-11.928932,-61.990097,39.0,76590,SAO MIGUEL DO ARAGUAIA,76599,48.0,DEMAIS REGI�ES DE GOI�S,76590,76599


`susep.geo_info` contains one row per CEP, and then there is a n-1 mapping CEP to code_tract - several CEPs for one code_tract

In [10]:
display(geo_info.groupby('CEP').code_tract.apply(lambda x: len(x)).sort_values())
display(geo_info.groupby('code_tract').CEP.apply(lambda x: len(x)).sort_values())
print('One row per CEP:', geo_info.shape[0] == geo_info.CEP.nunique())
print('Total rows:', geo_info.shape[0])
print('Total unique code_tract:', geo_info.code_tract.nunique())


CEP
00000001    1
60181560    1
60181550    1
60181530    1
60181520    1
           ..
19046130    1
19046140    1
19046150    1
19046090    1
99999999    1
Name: code_tract, Length: 562192, dtype: int64

code_tract
110001505000018      1
410690205010216      1
410690205010215      1
410690205010214      1
410690205010211      1
                  ... 
521250105000147     75
170210905000115     86
170210905000104     93
320500210000003    102
521250105000148    124
Name: CEP, Length: 140225, dtype: int64

One row per CEP: True
Total rows: 562192
Total unique code_tract: 140225


On the other hand, we now care about the code_tracts in Grande Sao Paulo

In [11]:
geo_info['in_grande_sao_paulo'] = geo_info['code_tract'].str.startswith('35')
geo_info.drop_duplicates(subset=['code_tract'])['in_grande_sao_paulo'].value_counts()


False    96285
True     43940
Name: in_grande_sao_paulo, dtype: int64

### 2.2. Coverage analysis
Reformat the shapefile code_tracts

In [12]:
code_tracts = pd.DataFrame(all_code_tracts, columns=['code_tract'])
code_tracts['in_shapefile']=1
code_tracts.head()

Unnamed: 0,code_tract,in_shapefile
0,510340305410222,1
1,351140905000016,1
2,330455705290159,1
3,150140245000240,1
4,355030876000477,1


Cross the info

In [13]:
dt = geo_info[['CEP', 'code_tract', 'in_grande_sao_paulo']].merge(code_tracts, on='code_tract', how='left')
dt.fillna(0,inplace=True)
dt.head()


Unnamed: 0,CEP,code_tract,in_grande_sao_paulo,in_shapefile
0,76954000,110001520000004,False,1.0
1,76873868,110002305000060,False,1.0
2,76955000,110007205000011,False,1.0
3,76965748,110004905000051,False,1.0
4,76594000,110001505000018,False,1.0


#### COVERAGE BY CEP

**In Grande Sao Paulo we have the shapefile of the census tract containing some given for all CEP's**

In [14]:
pd.DataFrame(dt.groupby(['in_grande_sao_paulo', 'in_shapefile']).CEP.apply(lambda x: len(x)))

Unnamed: 0_level_0,Unnamed: 1_level_0,CEP
in_grande_sao_paulo,in_shapefile,Unnamed: 2_level_1
False,0.0,2993
False,1.0,377642
True,1.0,181557


#### COVERAGE BY CODE TRACT

**For any of the unique code_tracts in Grande Sao Paulo in `susep.geo_info`, we have the code_tract's shapefile in 100% of the cases!** 

In [15]:
pd.DataFrame(dt.drop_duplicates(subset=['code_tract']).groupby(['in_grande_sao_paulo', 'in_shapefile']).CEP.apply(lambda x: len(x)))


Unnamed: 0_level_0,Unnamed: 1_level_0,CEP
in_grande_sao_paulo,in_shapefile,Unnamed: 2_level_1
False,0.0,809
False,1.0,95476
True,1.0,43940


### 2.3. 