In [1]:
import psycopg2, os
import pandas as pd
import geopandas
from shapely import wkt

# Establish connection and create its cursor.
# Connection settings are read from the AURORA_POSTGRES_* environment variables and
# passed as keyword arguments, so values containing spaces, quotes or backslashes
# cannot corrupt a hand-built DSN string.
try:
    conn = psycopg2.connect(
        host=os.environ['AURORA_POSTGRES_HOST'],
        dbname=os.environ['AURORA_POSTGRES_DATABASE'],
        user=os.environ['AURORA_POSTGRES_USERNAME'],
        password=os.environ['AURORA_POSTGRES_PWD'],
    )
    cur = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not make connection to the Postgres database")
    print(e)
    # Re-raise: without a connection, `conn` and `cur` stay undefined and every
    # later cell would fail with a NameError that hides the real cause.
    raise
    

In [6]:
# Look up every susep.geo_info row recorded for a single census tract code.
cur.execute("SELECT * FROM susep.geo_info WHERE code_tract='355030835000140'")
geo_rows = cur.fetchall()
# The first item of each cursor.description entry is the column name.
geo_columns = [column[0] for column in cur.description]
pd.DataFrame(geo_rows, columns=geo_columns)

Unnamed: 0,Id,CEP,code_tract,address_count,lat,lon,reg_susep,cep_inicial,cidade,cep_final,cod_reg,reg_decirc,cep_ini,cep_fim
0,30350,4705000,355030835000140,110,-22.203469,-46.261743,11,1000,S�O PAULO,5999,14,METROPOLITANA DE S�O PAULO,1000,5999
1,369339,4703000,355030835000140,27,-23.622132,-46.692758,11,1000,S�O PAULO,5999,14,METROPOLITANA DE S�O PAULO,1000,5999
2,369341,4705060,355030835000140,43,-23.622255,-46.692515,11,1000,S�O PAULO,5999,14,METROPOLITANA DE S�O PAULO,1000,5999
3,369360,4703020,355030835000140,11,-23.622312,-46.692403,11,1000,S�O PAULO,5999,14,METROPOLITANA DE S�O PAULO,1000,5999
4,369361,4703030,355030835000140,27,-23.622312,-46.692403,11,1000,S�O PAULO,5999,14,METROPOLITANA DE S�O PAULO,1000,5999
5,369362,4705040,355030835000140,4,-23.622312,-46.692403,11,1000,S�O PAULO,5999,14,METROPOLITANA DE S�O PAULO,1000,5999
6,369363,4705050,355030835000140,29,-23.622312,-46.692403,11,1000,S�O PAULO,5999,14,METROPOLITANA DE S�O PAULO,1000,5999


In [3]:
# Roll back the current transaction on `conn`.
# NOTE(review): presumably a manual recovery step run after a failed query —
# confirm whether it is still needed when the notebook is run top to bottom.
conn.rollback()

Get São Paulo census tracts with their roubo e furto (robbery and theft) info

In [2]:
# Query: census tracts whose cd_geocodi starts with '35', left-joined to the
# roubo_e_furto_density table so tracts without a density row are still returned
# (their area / incident columns come back as NULL).
statement = """
    SELECT
        c.census_tract,
        r.census_tract_area,
        r.total_incidents,
        r.geog_density_incidents,
        c.census_tract_geom_text
    FROM (
        SELECT
            cd_geocodi as census_tract,
            census_tract_geom_text
        FROM susep.census_tract_detail
        WHERE cd_geocodi LIKE '35%'
        ) as c
    LEFT JOIN (
        SELECT 
            *
        FROM dbt_albertoscf.roubo_e_furto_density
        ) as r
    ON r.census_tract = c.census_tract
    """

cur.execute(statement)
tract_rows = cur.fetchall()
# The first item of each cursor.description entry is the column name.
tract_columns = [column[0] for column in cur.description]
dt = pd.DataFrame(tract_rows, columns=tract_columns)
dt.head()


Unnamed: 0,census_tract,census_tract_area,total_incidents,geog_density_incidents,census_tract_geom_text
0,350635905000052,,,,"POLYGON((-46.119915 -23.801814999999998, -46.1..."
1,350690405000020,,,,POLYGON((-48.309927450470795 -23.2110145643571...
2,350745605000008,,,,"POLYGON((-48.99183378666601 -22.5689597156505,..."
3,350750610000012,,,,"POLYGON((-48.80256796118729 -22.8453336950107,..."
4,350750615000008,,,,"POLYGON((-48.3388833020193 -22.6240319292293, ..."


Fill NA and export

In [7]:
# Summary statistics of the incident density column (NaN rows are excluded by describe()).
dt["geog_density_incidents"].describe()

count    2.870900e+04
mean     7.548519e+05
std      1.381457e+06
min      4.335607e+01
25%      1.374274e+05
50%      3.667642e+05
75%      8.676246e+05
max      4.680630e+07
Name: geog_density_incidents, dtype: float64

In [30]:
# Fill NA conservatively: rows with no crime data are treated as zero incidents.
print('Total rows:', dt.shape[0])
print('Crime NA rows (no recorded roubo o furto):')
display(dt.isna().sum())
# Rebind instead of using inplace=True so the fill is an explicit assignment.
# NOTE(review): this also sets census_tract_area to 0 for rows without crime data,
# which is not a real area — confirm consumers of the export do not divide by it.
dt = dt.fillna(0)

# Convert to polygon shapes.
# Only values that are still WKT text are parsed; already-parsed geometries pass
# through unchanged, so re-running this cell does not raise inside wkt.loads.
dt['census_tract_geom_text'] = dt['census_tract_geom_text'].apply(
    lambda geom: wkt.loads(geom) if isinstance(geom, str) else geom
)
dt = geopandas.GeoDataFrame(dt, geometry='census_tract_geom_text', crs="EPSG:4326")
dt.head()


Total rows: 68296
Crime NA rows (no recorded roubo o furto):


census_tract                  0
census_tract_area         39587
total_incidents           39587
geog_density_incidents    39587
census_tract_geom_text        0
dtype: int64

Unnamed: 0,census_tract,census_tract_area,total_incidents,geog_density_incidents,census_tract_geom_text
0,350635905000052,0.0,0.0,0.0,"POLYGON ((-46.11991 -23.80181, -46.11992 -23.8..."
1,350690405000020,0.0,0.0,0.0,"POLYGON ((-48.30993 -23.21101, -48.31023 -23.2..."
2,350745605000008,0.0,0.0,0.0,"POLYGON ((-48.99183 -22.56896, -48.99135 -22.5..."
3,350750610000012,0.0,0.0,0.0,"POLYGON ((-48.80257 -22.84533, -48.80210 -22.8..."
4,350750615000008,0.0,0.0,0.0,"POLYGON ((-48.33888 -22.62403, -48.33835 -22.6..."


In [31]:
# Write the GeoDataFrame to "geo_data.json" in the working directory using the GeoJSON driver.
dt.to_file("geo_data.json", driver="GeoJSON") 