In [1]:
import sys
sys.path.append("../../src/earthtext")

from osm import osm
import os
from progressbar import progressbar as pbar
import shapely as sh 
from pyproj import CRS
import numpy as np
epsg4326 = CRS.from_epsg(4326)
import geopandas as gpd
import pandas as pd
from importlib import reload
from rlxutils import Command, mParallel
from joblib import delayed, Parallel
from time import time
import folium
reload(osm)

<module 'osm.osm' from '/home/ubuntu/earth-text/notebooks/naip/../../src/earthtext/osm/osm.py'>

# post process chip level osm objects

- recomputes the area and length of the osm objects in each chip. This is necessary since the original osm geometries might have been split across several chips

- removes the tags not considered inferrable from a satellite image (such as 'county', 'fixme', 'religion', 'internet_access', etc.)

- removes the osm objects which, as a result of the previous step, are left with no osm tags.

- removes the chip osm.parquet file if, as a result of all the previous steps, the chip ends up with no osm objects.

- creates a string representation of the tags (for visualization, etc.)

In [2]:
# show the tag keys / prefixes that are dropped from every osm object
print(
    "removing the following tags",
    "---------------------------",
    osm.ignore_tags,
    sep="\n",
)

removing the following tags
---------------------------
['created_by', 'addr:', 'gnis:', 'gtfs_id', 'tiger:', 'name', 'source', 'nhd:', 'wikipedia', 'ref', 'attribution', 'note', 'protection_title', 'wikidata', 'fmmp_modified', 'fmmp_reviewed', 'ref:', 'plant:', 'access', 'contact', 'acres', 'ele', 'oldref', 'oneway', 'lanes:', 'boundary', 'border_type', 'admin_level', 'fixme', 'altname', 'lanes', 'maxspeed', 'maxspeed:', 'operator:', 'operator', 'county_nam', 'source:', 'website', 'old_ref', 'name_1', 'alt_name', 'name:', 'handicapped_accessible', 'old_railway_operator', 'caltrans:', 'aland', 'areaid', 'awater', 'mtfcc', 'latitude', 'longitude', 'military', 'source_ref', 'payment:', 'ohv', 'lot_description', 'lot_type', 'barrier', 'access:', 'horse', 'tracktype', 'layer', 'cables', 'volcano:', 'intermitent', 'foot', 'bicycle', 'motor_vehicle', 'seasonal', 'description', 'proposeinternet_accessd:', 'bakersfield:', 'official_name', 'official_name_1', 'destination', 'junction:', 'owner',

In [3]:
# load the raw chip metadata (one row per chip, indexed by chip_id)
dg = gpd.read_parquet("/opt/data/california-naip-chips/california-naip-chips-200k-raw.parquet")
dg.head()

Unnamed: 0_level_0,date,chip_index_x,chip_index_y,cloud_cover_percentage,nodata_percentage,geometry,embeddings,original_chip_id
chip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3416f3c464df4,2022-05-06,27,5,0.0,0.0,"POLYGON ((-116.89310 33.99492, -116.89310 33.9...","[-0.13333216, 0.0399988, 0.14124717, 0.0125191...",ca_m_3311601_ne_11_060_20220506-27-5
22fdfb499b95a,2022-05-06,24,24,0.0,0.0,"POLYGON ((-116.89812 33.96860, -116.89812 33.9...","[-0.1470401, 0.026783561, 0.12910064, 0.014626...",ca_m_3311601_ne_11_060_20220506-24-24
06e39238c16ee,2022-05-06,19,6,0.0,0.0,"POLYGON ((-116.90640 33.99355, -116.90641 33.9...","[-0.14139079, 0.021682426, 0.14206946, 0.00693...",ca_m_3311601_ne_11_060_20220506-19-6
108471e507d4c,2022-05-06,30,0,0.0,0.0,"POLYGON ((-116.88810 34.00184, -116.88810 34.0...","[-0.10461623, 0.033646498, 0.115653, 0.0179855...",ca_m_3311601_ne_11_060_20220506-30-0
16dd360b58af6,2022-05-06,29,24,0.0,0.0,"POLYGON ((-116.88981 33.96860, -116.88981 33.9...","[-0.13561213, 0.03594061, 0.12552005, 0.009031...",ca_m_3311601_ne_11_060_20220506-29-24


In [4]:
# take the first chip id as an example
i = dg.index[0]
i

'3416f3c464df4'

In [5]:
# path of the osm parquet file of the example chip; `ls` confirms it exists on disk
f = f"/opt/data/california-naip-chips/osm/{i}.parquet"
!ls $f

/opt/data/california-naip-chips/osm/3416f3c464df4.parquet


In [6]:
# keep only the chips that have an osm parquet file on disk
dgd = dg[[os.path.isfile(f"/opt/data/california-naip-chips/osm/{i}.parquet") for i in dg.index]]
dgd.shape

(120039, 8)

In [7]:
class mParallel(Parallel):
    """
    joblib.Parallel subclass whose progress messages overwrite each other
    on a single stdout line instead of piling up.

    Note: this definition shadows the mParallel name imported from rlxutils
    at the top of the notebook.
    """
    def _print(self, msg):
        # nothing is shown unless verbosity is above 10
        if not self.verbose > 10:
            return
        line = f"[{self}]: {msg}"
        # the leading carriage return sends the cursor back to the start of the line
        sys.stdout.write("\r " + line)
        sys.stdout.flush()


In [55]:
def process_chipid(chip_id, geometry):
    """
    Post-processes the osm objects stored for one chip, rewriting its parquet file in place.

    Steps:
      - drops tags with None values and cleans the remaining ones with osm.clean_tags
      - drops the objects left with no tags
      - clips every object to the chip geometry
      - recomputes area and length of the clipped objects in the osm.epsg_california crs
      - stores the tags as a string ('tags') plus the osm.tags2str representation ('stags')

    The chip file is deleted when no object survives the tag cleaning or the clipping.

    Parameters
    ----------
    chip_id : str
        used to build the file name; also stored in the 'chip_id' column.
    geometry : shapely geometry
        chip footprint (assumed to be in the same crs as the stored osm objects -- TODO confirm).

    Returns
    -------
    str
        "no_file"   the chip has no osm parquet file
        "no_geoms"  the file has no objects, or no tagged object intersects the chip
                    (in the latter case the file is removed)
        "no_tags"   no object kept any tag after cleaning (file removed)
        "ok"        the file was rewritten
    """
    # local import: only needed to parse tags that were serialized as strings
    import ast

    fname = f"/opt/data/california-naip-chips/osm/{chip_id}.parquet"
    if not os.path.isfile(fname):
        return "no_file"
    z = gpd.read_parquet(fname)

    if len(z) == 0:
        return "no_geoms"

    def _as_dict(t):
        # files already processed by this function store tags as str(dict);
        # literal_eval parses them back without executing arbitrary code
        return t if isinstance(t, dict) else ast.literal_eval(t)

    z['tags'] = [{k: v for k, v in _as_dict(t).items() if v is not None} for t in z.tags]
    z['tags'] = [osm.clean_tags(t) for t in z.tags]
    # .copy() so the column assignments below act on an independent frame
    z = z[[len(t) > 0 for t in z.tags]].copy()

    if len(z) == 0:
        if os.path.isfile(fname):
            os.remove(fname)
        return "no_tags"

    z['stags'] = [osm.tags2str(t) for t in z.tags]

    # dictionaries in parquet seem to be scrambled
    # when loading do z['tags'] = [ast.literal_eval(t) for t in z.tags]
    z['tags'] = [str(t) for t in z.tags]
    z['chip_id'] = chip_id

    # keep only the objects touching the chip
    zi = z[z.geometry.intersects(geometry)].copy()

    if len(zi) == 0:
        # nothing left inside the chip: remove the file rather than writing an
        # empty one, so the chip is not counted among the surviving chips
        if os.path.isfile(fname):
            os.remove(fname)
        return "no_geoms"

    # clip to the chip, then measure in the projected crs
    zi['geometry'] = zi.geometry.intersection(geometry)
    zic = zi.to_crs(osm.epsg_california)
    # .values: assign by position, as the original list-based assignment did
    zi['area'] = zic.geometry.area.values
    zi['length'] = zic.geometry.length.values
    zi.to_parquet(fname)
    return "ok"

In [56]:
# keep only the chip_ids and geometry, delete geodataframe to avoid massive memory transfer between joblib processes
try:
    chip_ids = dgd.index.values.copy()
    geometries = dgd.geometry.values.copy()
    del dgd
except NameError:
    # dgd no longer exists (presumably this cell was already run and deleted it);
    # any other error is a real problem and must not be silenced
    pass

In [None]:
# post-process every chip with joblib (60 jobs); r collects one status string per chip
r = mParallel(n_jobs=60, verbose=30)(
    delayed(process_chipid)(chip_id, geometry)
    for chip_id, geometry in zip(chip_ids, geometries)
)

 [mParallel(n_jobs=60)]: Done 84130 tasks      | elapsed:  2.3min8857422s.) Setting batch_size=4.

In [61]:
# number of chips per status string returned by process_chipid
pd.Series(r).value_counts()

ok         113160
no_file      6879
Name: count, dtype: int64

In [60]:
# NOTE(review): leftover scratch cell with no effect -- candidate for removal
1

1

In [65]:
# inspect the osm objects stored for one chip
# NOTE(review): this path is under california-worldcover-chips, while the rest of the
# notebook works on california-naip-chips -- confirm this is the intended file
z = gpd.read_parquet("/opt/data/california-worldcover-chips/osm/1110d6448e119.parquet")
z

Unnamed: 0,tags,geometry,kind,length,area,stags,chip_id
10157003169,"{'mountain_pass': 'yes', 'natural': 'saddle'}",POINT (-120.43953 39.50240),node,0.000000,0.0,mountain_pass: yes\n<br>\nnatural: saddle,1110d6448e119
41,{'highway': 'tertiary'},"LINESTRING (-120.43829 39.50052, -120.43858 39...",way,1623.485085,0.0,highway: tertiary,1110d6448e119
87,"{'highway': 'track', 'motor_vehicle': 'designa...","LINESTRING (-120.44100 39.49711, -120.44099 39...",way,1303.715079,0.0,highway: track\n<br>\nmotor_vehicle: designate...,1110d6448e119
144,"{'alt_name': 'Jackson Meadows Road', 'highway'...","LINESTRING (-120.43467 39.49938, -120.43471 39...",way,344.649955,0.0,alt_name: Jackson Meadows Road\n<br>\nhighway:...,1110d6448e119
165,"{'intermittent': 'yes', 'waterway': 'stream'}","LINESTRING (-120.43970 39.49713, -120.43971 39...",way,4.542585,0.0,intermittent: yes\n<br>\nwaterway: stream,1110d6448e119
...,...,...,...,...,...,...,...
4549,"{'highway': 'track', 'tracktype': 'grade5'}","LINESTRING (-120.45136 39.49134, -120.45155 39...",way,255.754090,0.0,highway: track\n<br>\ntracktype: grade5,1110d6448e119
4553,"{'highway': 'track', 'motor_vehicle': 'designa...","LINESTRING (-120.44447 39.49219, -120.44458 39...",way,1210.247728,0.0,highway: track\n<br>\nmotor_vehicle: designate...,1110d6448e119
4554,"{'highway': 'track', 'tracktype': 'grade4'}","LINESTRING (-120.45433 39.50136, -120.45445 39...",way,193.081996,0.0,highway: track\n<br>\ntracktype: grade4,1110d6448e119
5341,"{'highway': 'track', 'motor_vehicle': 'designa...","LINESTRING (-120.43650 39.49494, -120.43638 39...",way,160.315703,0.0,highway: track\n<br>\nmotor_vehicle: designate...,1110d6448e119


In [72]:
# creates metadata file with only surviving chips

osm_folder = '/opt/data/california-naip-chips/osm/'
# only *.parquet entries are chip files; any other entry in the folder would
# produce an id that is not in the chips index and break the lookup by chip id
chip_ids = [i.split(".")[0] for i in os.listdir(osm_folder) if i.endswith(".parquet")]

In [75]:
# reload the raw chip metadata
dg = gpd.read_parquet("/opt/data/california-naip-chips/california-naip-chips-200k-raw.parquet")
print ("raw number of chips", len(dg))

raw number of chips 198919


In [76]:
# keep only the metadata rows of the chips listed in chip_ids
dgg = dg.loc[chip_ids]
print ("surviving number of chips", len(dgg))

surviving number of chips 113160


In [77]:
# save the metadata of the surviving chips
dgg.to_parquet("/opt/data/california-naip-chips/california-naip-chips-100k.parquet")

## legacy


In [34]:
## legacy

def ___process_chipid(chip_id, geometry):
    """
    Legacy version of the chip post-processing, kept for reference.

    Cleans the tags of the osm objects of one chip, drops the objects left
    with no tags, clips the remaining ones to the chip geometry, recomputes
    their area and length in the osm.epsg_california crs and rewrites the
    chip parquet file in place. It neither checks that the file exists nor
    removes it when no object is left.

    Parameters
    ----------
    chip_id : str
        used to build the file name; also stored in the 'chip_id' column.
    geometry : shapely geometry
        chip footprint used to clip the osm objects.

    Returns
    -------
    str
        "no_geoms" if the file has no objects, "no_tags" if no object kept
        any tag after cleaning, "ok" once the file has been rewritten.
    """
    # local import: only needed to parse tags that were serialized as strings
    import ast

    fname = f"/opt/data/california-naip-chips/osm/{chip_id}.parquet"

    z = gpd.read_parquet(fname)
    if len(z) == 0:
        return "no_geoms"

    # in case it was processed before (tags stored as str(dict));
    # literal_eval parses them back without executing arbitrary code
    z['tags'] = [ast.literal_eval(t) if isinstance(t, str) else t for t in z.tags]

    z['tags'] = [{k: v for k, v in t.items() if v is not None} for t in z.tags]
    z['tags'] = [osm.clean_tags(t) for t in z.tags]
    # .copy() so the column assignments below act on an independent frame
    z = z[[len(t) > 0 for t in z.tags]].copy()

    if len(z) == 0:
        return "no_tags"

    z['stags'] = [osm.tags2str(t) for t in z.tags]

    # dictionaries in parquet seem to be scrambled
    # when loading do z['tags'] = [ast.literal_eval(t) for t in z.tags]
    z['tags'] = [str(t) for t in z.tags]
    z['chip_id'] = chip_id

    # keep only the objects touching the chip, clipped to it
    zi = z[[geom.intersects(geometry) for geom in z.geometry]].copy()
    zi['geometry'] = [geom.intersection(geometry) for geom in zi.geometry]
    # measure in the projected crs
    zic = zi.to_crs(osm.epsg_california)
    zi['area'] = [geom.area for geom in zic.geometry]
    zi['length'] = [geom.length for geom in zic.geometry]
    zi.to_parquet(fname)
    return "ok"