In [1]:
'''
In this script, we prepare the census tracts
and split the shapefile in smaller parts
for better performance when running the main
application.
'''

'\nIn this script, we prepare the census tracts\nand split the shapefile in smaller parts\nfor better performance when running the main\napplication.\n'

In [2]:
from geofeather import to_geofeather, from_geofeather
from shapely.geometry import Polygon, MultiPolygon, LineString
from shapely.ops import split
from sys import getsizeof
import geopandas as gpd
import pandas as pd
import glob, multiprocessing, os, shutil
# Ask GeoPandas to use its PyGEOS backend for geometry operations.
gpd.options.use_pygeos = True
# Show floats with five decimal places whenever pandas displays them.
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [3]:
def read_data(path_to_tracts, path_to_shp):
    '''
    Loads the census summary table and the census tract geometries.

    Parameters
    ----------
    path_to_tracts : path of the CSV file, read with pandas.read_csv
    path_to_shp : path handed to geopandas.read_file

    Returns
    -------
    tuple (tracts, shp): the CSV contents and the geometries, in that order
    '''

    # The identifier columns are requested as strings
    id_columns = ("CD_GEOCODI", "CD_GEOCODM", "CD_MUNICIP", "Cod_setor")
    dtype = {column: str for column in id_columns}

    tracts = pd.read_csv(path_to_tracts, dtype=dtype)

    # NOTE(review): `dtype` is not a documented parameter of gpd.read_file;
    # extra keywords are forwarded to the underlying reader. Confirm that it
    # has the intended effect on the shapefile columns.
    shp = gpd.read_file(path_to_shp, dtype=dtype)

    return tracts, shp

In [4]:
def merge_tracts_and_shape(tracts, shp):
    '''
    Attaches the tabular census data to the tract geometries.

    Left join that keeps every row of `shp`, matching its `CD_GEOCODI`
    column against the `Cod_setor` column of `tracts`. Rows of `shp`
    without a match get missing values in the columns coming from `tracts`.
    '''

    merged = shp.merge(
        tracts,
        how='left',
        left_on='CD_GEOCODI',
        right_on='Cod_setor',
    )

    return merged

In [5]:
def divide_bbox(rectangle, nrows, ncols): 
    '''
    Divides a rectangular bounding box in
    rows and columns

    Parameters
    ----------
    rectangle : shapely Polygon; only its bounds are used to place the cuts
    nrows : number of horizontal bands (cells stacked along y)
    ncols : number of vertical bands (cells side by side along x)

    Returns
    -------
    list with the polygons obtained after all the cuts were applied

    Raises
    ------
    ZeroDivisionError if nrows or ncols is 0

    Reference: https://stackoverflow.com/questions/58283684/how-to-divide-a-rectangle-in-specific-number-of-rows-and-columns
    '''

    minx, miny, maxx, maxy = rectangle.bounds

    # Columns are laid out along x and rows along y, so the cell width
    # depends on ncols and the cell height on nrows
    dx = (maxx - minx) / ncols  # width of a small part

    dy = (maxy - miny) / nrows  # height of a small part

    # The i = 0 lines coincide with the bottom / left edges of the bounding box
    horizontal_splitters = [LineString([(minx, miny + i*dy), (maxx, miny + i*dy)]) for i in range(nrows)]

    vertical_splitters = [LineString([(minx + i*dx, miny), (minx + i*dx, maxy)]) for i in range(ncols)]

    splitters = horizontal_splitters + vertical_splitters

    for splitter in splitters:
        # split() returns a collection; its parts are read through .geoms
        rectangle = MultiPolygon(list(split(rectangle, splitter).geoms))

    # Multi-part geometries are not iterable in Shapely 2.x, .geoms works
    # in both the 1.x and the 2.x series
    return list(rectangle.geoms)



In [18]:
def find_neighbors(row, gdf):
    '''
    Lists the polygons of the GeoDataFrame that are
    not disjoint from the geometry of the current row.

    The `id_no` values of those polygons are joined with "|"
    and returned as a one-item Series keyed by "neighbors".
    If the row itself belongs to `gdf`, its own id is part of
    the list, since a geometry is never disjoint from itself.
    '''

    touching = ~gdf.geometry.disjoint(row.geometry)

    neighbor_ids = gdf.loc[touching, 'id_no'].astype(str)

    return pd.Series({"neighbors": "|".join(neighbor_ids)})

In [19]:
def find_intersections(tracts, spatial_index, area):
    '''
    Returns the rows of `tracts` whose geometry intersects `area`.

    The search runs in two passes: the spatial index is queried with
    the bounds of the area, and only the rows it points to go through
    the exact intersection test.
    '''

    # First pass: positions suggested by the Geopandas/PyGeos rtree
    candidate_positions = list(spatial_index.intersection(area.bounds))

    candidates = tracts.iloc[candidate_positions]

    # Second pass: keeps the candidates that really intersect the area
    overlaps = candidates.geometry.intersects(area)

    return candidates[overlaps]

In [20]:
def compute_population_in_area(matches, area):
    '''
    Calculates how many people live in the intersecting polygons.
    Also returns the intersecting shapes.

    Parameters
    ----------
    matches : frame with a geometry column and a `populacao_residente` column
    area : geometry used to clip every row of `matches`

    Returns
    -------
    A new frame, with a fresh 0..n-1 index, in which:
      - `geometry` holds the intersection of each row with `area`
      - `INTERSECT` holds the share of the original area that lies inside
        `area`, rounded to two decimals
      - `POP_INTER` holds `populacao_residente` scaled by that (unrounded)
        share, rounded to the nearest whole number

    The input frame is left untouched.
    '''

    # The input is usually a filtered view of a larger table: working on a
    # copy keeps the new columns out of the caller's data and avoids
    # assigning into a slice
    result = matches.copy()

    tract_shapes = result.geometry.values

    clipped_shapes = tract_shapes.intersection(area)

    # The population is assumed to be spread evenly over each tract, so it
    # is scaled by the fraction of the tract area that falls inside `area`
    intersection_percentage = clipped_shapes.area / tract_shapes.area

    population_in_intersection = result.populacao_residente.values * intersection_percentage

    result['geometry'] = clipped_shapes

    result['INTERSECT'] = intersection_percentage.round(2)

    result['POP_INTER'] = population_in_intersection.round()

    return result.reset_index(drop=True)
    

In [28]:
def main():
    '''
    Splits the census tracts into one file per bounding box.

    Steps:
      1. Reads the census table and the tract shapes and merges them.
      2. Cuts a fixed bounding box around Brazil into a 150 x 150 grid.
      3. Records, for every grid cell, the ids of the cells that are not
         disjoint from it and how many ids that is.
      4. For every grid cell, clips the intersecting tracts, estimates the
         population inside the cell and, when at least one tract matched,
         saves the clipped tracts as a geofeather file.
      5. Saves an index of the grid cells that had at least one tract to
         "../data/index_bboxes.feather".

    Returns
    -------
    The index of grid cells that was saved in step 5.

    Side effects: every file in the output directory is deleted before the
    new files are written.
    '''

    print("Reading data")

    df, gdf = read_data("../data/censo_dados_resumidos.csv","../data/setores_censitarios_shp_reduzido/")

    gdf = merge_tracts_and_shape(df, gdf)

    # NOTE(review): buffer(0) is presumably here to repair invalid geometries - confirm
    gdf.geometry = gdf.geometry.buffer(0)

    print("Creating spatial index")

    sindex = gdf.sindex

    print("Creating bounding boxes")

    brazil_bbox = Polygon([
        [-74.3143068749,-34.2970741167],
        [-34.4119631249,-34.2970741167],
        [-34.4119631249,5.648611595],
        [-74.3143068749,5.648611595],
        [-74.3143068749,-34.2970741167]
    ])

    bboxes = divide_bbox(brazil_bbox, 150, 150)

    # The grid shares the coordinate reference system of the tracts
    bboxes = gpd.GeoDataFrame(geometry=bboxes, crs=gdf.crs).reset_index().rename(columns={'index':'id_no'})

    print("Finding neighbors")

    # apply() expands the Series returned by find_neighbors into a
    # one-column frame, from which the column itself is taken
    bboxes['neighbors'] = bboxes.apply(find_neighbors, args=(bboxes,), axis=1)['neighbors']

    # `neighbors` is a "|"-separated list of ids: counts the ids,
    # not the characters of the string
    bboxes['neighbor_count'] = bboxes.neighbors.str.split("|").str.len()

    # TO DO: isolate this in a function to increase efficiency
    # TO DO: implement this using multithreading and dataframe chunks
    # See https://stackoverflow.com/questions/40357434/pandas-df-iterrows-parallelization

    print("Preparing geofeathers")

    meaningful_bboxes = [ ]

    bboxes["fpath"] = None

    bboxes["total_population"] = None

    directory =  "../data/setores_censitarios_divididos_feather/"

    # Makes sure the output directory exists and clears it of any file
    # left by a previous run (sub-directories are left alone)
    os.makedirs(directory, exist_ok=True)

    for f in glob.glob(directory + "*"):

        if os.path.isfile(f):

            os.remove(f)

    total_bboxes = bboxes.shape[0]

    for index, row in bboxes.iterrows():

        print(f"Processing bounding box {index + 1} of {total_bboxes}".ljust(100), end='\r')

        # Takes a polygon
        bbox = row.geometry

        # Finds the intersecting tracts and does the relevant computations
        matches = find_intersections(gdf, sindex, bbox)

        matches = compute_population_in_area(matches, bbox)

        # Adds relevant information to the bboxes dataframe
        total_population = matches.POP_INTER.sum()

        fname =  f'bbox-{index}.feather'

        fpath = directory + fname

        bboxes.loc[index, "fpath"] = fpath

        bboxes.loc[index, "total_population"] = round(total_population)

        # If relevant, saves
        if matches.shape[0] != 0:

            meaningful_bboxes.append(index)

            to_geofeather(matches, fpath)

    # Remove from the data table all the bounding boxes that contain no tracts
    bboxes = bboxes.loc[meaningful_bboxes].reset_index(drop=False)

    to_geofeather(bboxes, "../data/index_bboxes.feather")

    return bboxes

In [29]:
# Runs the whole preparation pipeline when this cell is executed
# (or when the code is run as a script).
if __name__ == "__main__":
    main()

Reading data
Creating spatial index
Creating bounding boxes
Finding neighbors
Preparing geofeathers
Processing bounding box 22500 of 22500                                                              

In [30]:
%%time
# Times how long it takes to load the feather file of a single bounding box.
x = from_geofeather("../data/setores_censitarios_divididos_feather/bbox-6104.feather")

CPU times: user 229 ms, sys: 12.1 ms, total: 241 ms
Wall time: 237 ms


In [31]:
# Loads the index of bounding boxes written at the end of main().
table = from_geofeather("../data/index_bboxes.feather")

In [40]:
# Smallest `total_population` value found in the loaded table.
table.total_population.min()

0.0

In [None]:
%%time
# Times the load of a bounding box stored as a shapefile.
# NOTE(review): main() only writes .feather files (the to_file call is
# commented out), so this path may no longer exist - confirm before running.
y = gpd.read_file("../data/setores_censitarios_divididos/bbox-2669.shp")

In [None]:
# Shows the first rows of `y`.
y.head()

In [None]:
# Loads the raw census table and tract shapes at notebook scope
# (the copies read inside main() are local to that function).
df, gdf = read_data("../data/censo_dados_resumidos.csv","../data/setores_censitarios_shp_reduzido/")

In [None]:
# Shows the class of the geometry stored in the row labelled 0.
type(gdf.loc[0, 'geometry'])