In [1]:
import sys
sys.path.append("../../src/earthtext")

from osm import osm
import os
from progressbar import progressbar as pbar
import shapely as sh 
from pyproj import CRS
import numpy as np
epsg4326 = CRS.from_epsg(4326)
import geopandas as gpd
import pandas as pd
from importlib import reload
from rlxutils import Command
import rlxutils
reload(osm)

<module 'osm.osm' from '/home/ubuntu/earth-text/notebooks/naip/../../src/earthtext/osm/osm.py'>

# partition osm objects from origin grid to destination grid

osm objects have already been partition into `origin_grid`

origin grid is much coarser than destination grid to make more efficient the intersection of osm geometries for each chip

## setup origin grid



In [2]:
orig_partsdir = "/opt/data/osm/california-parts/"
dest_partsdir = "/opt/data/california-naip-chips/osm"
os.makedirs(dest_partsdir, exist_ok=True)
chip_ids_pbf = [i.split(".")[0] for i in os.listdir(orig_partsdir) if i.endswith(".pbf")]
chip_ids_geojson = [i.split(".")[0] for i in os.listdir(orig_partsdir) if i.endswith(".geojson")]

if len(set(chip_ids_pbf).intersection(set(chip_ids_geojson)))!=len(chip_ids_pbf)!=len(chip_ids_geojson):
    raise ValueError("missing chips in geojson o pbf")

In [3]:
orig_grid = []
for chip_id in chip_ids_geojson:
    with open(f"{orig_partsdir}/{chip_id}.geojson") as f:
        orig_grid.append([chip_id, sh.from_geojson(f.read())])

orig_grid = gpd.GeoDataFrame(pd.DataFrame(orig_grid, columns=['chip_id', 'geometry']), crs=epsg4326)
orig_grid.shape

(984, 2)

In [4]:
orig_grid.head()

Unnamed: 0,chip_id,geometry
0,000235e893b24,"POLYGON ((-121.87715 39.14450, -121.65850 39.1..."
1,0002c308d55fc,"POLYGON ((-114.92112 35.05498, -114.92112 35.2..."
2,0018e8bbce095,"POLYGON ((-120.25595 36.95832, -120.55074 36.9..."
3,003fa84955512,"POLYGON ((-119.13010 35.78528, -119.40375 35.7..."
4,004aaa9df44f2,"POLYGON ((-124.14191 40.08352, -124.24849 40.0..."


## setup destination grid

In [5]:
dest_grid = gpd.read_parquet("/opt/data/california-naip-chips/california-naip-chips-200k-raw.parquet")
dest_grid.head()

Unnamed: 0_level_0,date,chip_index_x,chip_index_y,cloud_cover_percentage,nodata_percentage,geometry,embeddings,original_chip_id
chip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3416f3c464df4,2022-05-06,27,5,0.0,0.0,"POLYGON ((-116.89310 33.99492, -116.89310 33.9...","[-0.13333216, 0.0399988, 0.14124717, 0.0125191...",ca_m_3311601_ne_11_060_20220506-27-5
22fdfb499b95a,2022-05-06,24,24,0.0,0.0,"POLYGON ((-116.89812 33.96860, -116.89812 33.9...","[-0.1470401, 0.026783561, 0.12910064, 0.014626...",ca_m_3311601_ne_11_060_20220506-24-24
06e39238c16ee,2022-05-06,19,6,0.0,0.0,"POLYGON ((-116.90640 33.99355, -116.90641 33.9...","[-0.14139079, 0.021682426, 0.14206946, 0.00693...",ca_m_3311601_ne_11_060_20220506-19-6
108471e507d4c,2022-05-06,30,0,0.0,0.0,"POLYGON ((-116.88810 34.00184, -116.88810 34.0...","[-0.10461623, 0.033646498, 0.115653, 0.0179855...",ca_m_3311601_ne_11_060_20220506-30-0
16dd360b58af6,2022-05-06,29,24,0.0,0.0,"POLYGON ((-116.88981 33.96860, -116.88981 33.9...","[-0.13561213, 0.03594061, 0.12552005, 0.009031...",ca_m_3311601_ne_11_060_20220506-29-24


In [6]:
orig_grid.shape, dest_grid.shape

((984, 2), (198919, 8))

## get coverage of destination grid

In [7]:
gcov = dest_grid.geometry.values[0]
for gi in pbar(dest_grid.geometry.values):
    gcov = gcov.union(gi).convex_hull

[38;2;0;255;0m100%[39m [38;2;0;255;0m(198919 of 198919)[39m |################| Elapsed Time: 0:00:08 Time:  0:00:080000


## subset origin grid only to destination grid coverage

In [8]:
orig_grid = orig_grid[[gi.intersects(gcov) for gi in orig_grid.geometry.values]]

In [None]:
orig_grid.explore()

## check distribution of destination grid chips in a couple of sample origin chips

In [None]:
_orig_grid = orig_grid.sample(2)
c = sh.geometry.GeometryCollection(_orig_grid.geometry.values[1:])
dg = dest_grid[[i.intersects(c) for i in dest_grid.geometry.values]]
#pd.concat([dg, _orig_grid])[['geometry']].explore()
dg[['geometry']].explore()


## partition osm objects from origin grid to destination grid

In [14]:
notfound = []
dest_chips_withnodata = []
dest_chips_withdata = []
# loop over coarse grained geometries (origin)
for count, (_, orig_row) in enumerate(orig_grid.iloc[46+27:].iterrows()):
    et = rlxutils.ElapsedTimes()
    orig_row_geom = orig_row.geometry
    dg = dest_grid[[i.intersects(orig_row_geom) for i in dest_grid.geometry.values]]
    print (f"{count}/{len(orig_grid)}", orig_row.chip_id, 'dest_chips', len(dg), flush=True)

    # loop over fine grained geometries (dest). 
    parquets_cache = {}
    for chip_id, dest_row in pbar(dg.iterrows(), max_value=len(dg)):
        
        dest_row_geom = dest_row.geometry
        
        dest_parquet = f"{dest_partsdir}/{chip_id}.parquet"
        if os.path.isfile(dest_parquet):
            continue

        # dest geometry might intersect several origin geometry, 
        # not only the one in the outer loop
        with et("ogintersect"):
            og = orig_grid[[i.intersects(dest_row_geom) for i in orig_grid.geometry.values]]
        
        if len(og) == 0:
            continue

        # loop over all intersecting coarse geometries and
        # get all osm objects
        dgdata = []
        for chip_id in og.chip_id.values:
            if chip_id in parquets_cache.keys():
                p = parquets_cache[chip_id]
            else:
                with et("read_parquet"):
                    fname = f"{orig_partsdir}/{chip_id}.parquet"
                    if not os.path.isfile(fname):
                        notfound.append(fname)
                        continue
                    p = gpd.read_parquet(fname)
                    parquets_cache[chip_id]=p

            # only get the parts of osm geometries within this finer grain geometry
            with et("dgintersect"):
                pi = p[p.intersects(dest_row_geom)].copy()
                pi['geometry'] = [gi.intersection(dest_row_geom) for gi in pi.geometry.values]
            dgdata.append(pi)
        dgdata = pd.concat(dgdata)
        if len(dgdata)==0:
            dest_chips_withnodata.append(chip_id)
            continue
        dest_chips_withdata.append(chip_id)
        with et("save"):
            dgdata.to_parquet(dest_parquet)


0/132 21706a5a16e06 dest_chips 158


[38;2;0;255;0m100%[39m [38;2;0;255;0m(158 of 158)[39m |######################| Elapsed Time: 0:00:04 Time:  0:00:040:00


1/132 2201d9aa51bef dest_chips 1593


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1593 of 1593)[39m |####################| Elapsed Time: 0:08:49 Time:  0:08:490547


2/132 222b87d3538c6 dest_chips 2256


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2256 of 2256)[39m |####################| Elapsed Time: 0:00:13 Time:  0:00:130001


3/132 2299a60abbc22 dest_chips 1468


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1468 of 1468)[39m |####################| Elapsed Time: 0:00:08 Time:  0:00:080000


4/132 2345e2a1647c8 dest_chips 470


[38;2;0;255;0m100%[39m [38;2;0;255;0m(470 of 470)[39m |######################| Elapsed Time: 0:00:20 Time:  0:00:200001


5/132 234da5f5a7679 dest_chips 111


[38;2;0;255;0m100%[39m [38;2;0;255;0m(111 of 111)[39m |######################| Elapsed Time: 0:00:00 Time:  0:00:000000


6/132 237386fcc1e1e dest_chips 1809


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1809 of 1809)[39m |####################| Elapsed Time: 0:00:50 Time:  0:00:500004


7/132 238609afd0621 dest_chips 1716


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1716 of 1716)[39m |####################| Elapsed Time: 0:00:07 Time:  0:00:070000


8/132 23ae42c6edd85 dest_chips 240


[38;2;0;255;0m100%[39m [38;2;0;255;0m(240 of 240)[39m |######################| Elapsed Time: 0:00:06 Time:  0:00:060000


9/132 2423bdf1ff2f0 dest_chips 777


[38;2;255;80;0m  9%[39m [38;2;255;80;0m(72 of 777)[39m |##                     | Elapsed Time: 0:04:15 ETA:   0:41:44

11/132 24a206e1f5cc5 dest_chips 2581


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1601 of 1601)[39m |####################| Elapsed Time: 0:00:08 Time:  0:00:080000


14/132 250ec841c0b3d dest_chips 546


[38;2;0;255;0m100%[39m [38;2;0;255;0m(546 of 546)[39m |######################| Elapsed Time: 0:00:05 Time:  0:00:050000


15/132 251486e119b77 dest_chips 1828


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1828 of 1828)[39m |####################| Elapsed Time: 0:06:57 Time:  0:06:570336


16/132 25bac7f5b2dff dest_chips 94


[38;2;0;255;0m100%[39m [38;2;0;255;0m(94 of 94)[39m |########################| Elapsed Time: 0:00:02 Time:  0:00:020000


17/132 25e45791b8e26 dest_chips 1852


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1852 of 1852)[39m |####################| Elapsed Time: 0:00:18 Time:  0:00:180001


18/132 26087ed2cb9f5 dest_chips 2362


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2362 of 2362)[39m |####################| Elapsed Time: 0:01:28 Time:  0:01:280007


19/132 26612d12cded9 dest_chips 2366


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2366 of 2366)[39m |####################| Elapsed Time: 0:00:10 Time:  0:00:100000


20/132 27a0d844d15db dest_chips 1759


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1759 of 1759)[39m |####################| Elapsed Time: 0:00:17 Time:  0:00:170001


21/132 27e7fa9352e4c dest_chips 1246


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1246 of 1246)[39m |####################| Elapsed Time: 0:00:04 Time:  0:00:040000


22/132 282ead58882b0 dest_chips 316


[38;2;0;255;0m100%[39m [38;2;0;255;0m(316 of 316)[39m |######################| Elapsed Time: 0:00:01 Time:  0:00:010000


23/132 2988a43937c4d dest_chips 1998


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1998 of 1998)[39m |####################| Elapsed Time: 0:00:12 Time:  0:00:120001


24/132 2a60631a1d157 dest_chips 2545


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2545 of 2545)[39m |####################| Elapsed Time: 0:00:07 Time:  0:00:070000


25/132 2aab50e3da3fa dest_chips 1422


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1649 of 1649)[39m |####################| Elapsed Time: 0:01:15 Time:  0:01:150006


29/132 2ce658c1909fb dest_chips 1999


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1999 of 1999)[39m |####################| Elapsed Time: 0:00:50 Time:  0:00:500004


30/132 2d367ece0108b dest_chips 1874


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1874 of 1874)[39m |####################| Elapsed Time: 0:01:50 Time:  0:01:500109


31/132 2d4ed6acfdc29 dest_chips 2069


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2069 of 2069)[39m |####################| Elapsed Time: 0:02:36 Time:  0:02:360113


32/132 2d55a9c35aeab dest_chips 2024


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2024 of 2024)[39m |####################| Elapsed Time: 0:00:14 Time:  0:00:140001


33/132 2da523a929f4b dest_chips 2617


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2617 of 2617)[39m |####################| Elapsed Time: 0:09:43 Time:  0:09:430449


34/132 2dd5f7980813d dest_chips 334


[38;2;0;255;0m100%[39m [38;2;0;255;0m(334 of 334)[39m |######################| Elapsed Time: 0:00:07 Time:  0:00:070000


35/132 2de8beb9de2aa dest_chips 2235


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2235 of 2235)[39m |####################| Elapsed Time: 0:05:35 Time:  0:05:350228


36/132 2de8fffde4e39 dest_chips 2057


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2057 of 2057)[39m |####################| Elapsed Time: 0:00:24 Time:  0:00:240002


37/132 2e19b51381e12 dest_chips 1542


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1542 of 1542)[39m |####################| Elapsed Time: 0:00:10 Time:  0:00:100000


38/132 2e8889c073a17 dest_chips 163


[38;2;0;255;0m100%[39m [38;2;0;255;0m(163 of 163)[39m |######################| Elapsed Time: 0:00:07 Time:  0:00:070000


39/132 2ebbd025caf17 dest_chips 1475


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1475 of 1475)[39m |####################| Elapsed Time: 0:00:10 Time:  0:00:100000


40/132 2ef0567f3cb33 dest_chips 1677


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1677 of 1677)[39m |####################| Elapsed Time: 0:00:06 Time:  0:00:060000


41/132 2f84cbb4d79af dest_chips 2382


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2382 of 2382)[39m |####################| Elapsed Time: 0:00:34 Time:  0:00:340002


42/132 2fc52b6c12bc0 dest_chips 260


[38;2;0;255;0m100%[39m [38;2;0;255;0m(260 of 260)[39m |######################| Elapsed Time: 0:00:02 Time:  0:00:020000


43/132 3010679668a7e dest_chips 2318


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2318 of 2318)[39m |####################| Elapsed Time: 0:00:40 Time:  0:00:400003


44/132 30c4f98a68dba dest_chips 2482


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2482 of 2482)[39m |####################| Elapsed Time: 0:03:30 Time:  0:03:300205


45/132 30d5e1fa877d0 dest_chips 2484


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2484 of 2484)[39m |####################| Elapsed Time: 0:00:21 Time:  0:00:210001


46/132 30ec07421dab4 dest_chips 1966


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1966 of 1966)[39m |####################| Elapsed Time: 0:00:58 Time:  0:00:580005


47/132 312fc06f0e3a9 dest_chips 1174


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1174 of 1174)[39m |####################| Elapsed Time: 0:00:10 Time:  0:00:100000


48/132 31e292b36b8bc dest_chips 2904


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2904 of 2904)[39m |####################| Elapsed Time: 0:04:26 Time:  0:04:260222


49/132 3347520de3a28 dest_chips 1227


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1227 of 1227)[39m |####################| Elapsed Time: 0:04:04 Time:  0:04:040322


50/132 33724d1ba6057 dest_chips 2192


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2192 of 2192)[39m |####################| Elapsed Time: 0:00:11 Time:  0:00:110001


51/132 3478d44602c53 dest_chips 72


[38;2;0;255;0m100%[39m [38;2;0;255;0m(72 of 72)[39m |########################| Elapsed Time: 0:00:00 Time:  0:00:000000


52/132 34a141c36fcce dest_chips 2132


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2132 of 2132)[39m |####################| Elapsed Time: 0:00:22 Time:  0:00:220001


53/132 34e2848865287 dest_chips 1290


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1290 of 1290)[39m |####################| Elapsed Time: 0:00:06 Time:  0:00:060000


54/132 36086a2c32665 dest_chips 1962


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1962 of 1962)[39m |####################| Elapsed Time: 0:00:06 Time:  0:00:060000


55/132 36ab1009f4b2f dest_chips 344


[38;2;0;255;0m100%[39m [38;2;0;255;0m(344 of 344)[39m |######################| Elapsed Time: 0:00:20 Time:  0:00:200001


56/132 36b4f1fbd10ee dest_chips 2882


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2882 of 2882)[39m |####################| Elapsed Time: 0:00:25 Time:  0:00:250001


57/132 373b0b28ea85a dest_chips 2562


[38;2;0;255;0m100%[39m [38;2;0;255;0m(2562 of 2562)[39m |####################| Elapsed Time: 0:00:47 Time:  0:00:470004


58/132 3811c01719920 dest_chips 1937


[38;2;0;255;0m100%[39m [38;2;0;255;0m(1937 of 1937)[39m |####################| Elapsed Time: 0:00:11 Time:  0:00:110001


In [None]:
len(dgdata)