This notebook scrapes satellite images for each leak repair. For each location it gets a NxM rectangle around the leak before and after it was repaired. Then it collated all the data into h5 files and all the metadata into json files.

It takes days to run because of rate limiting on the google earth api. Because of limited satelite coverage you might find matches for only 10% of the leaks.

## Modifying

- make sure google earth is setup
- load leaks, so they pass the asserts
- change params
- run rest of cells

In [1]:
from path import Path
import arrow
import json
import pytz
from pprint import pprint
from tqdm import tnrange, tqdm_notebook as tqdm
import re, os, collections, itertools, uuid, logging
import tempfile
import tables
import shapely

import zipfile
import urllib

import ee
import pyproj
import numpy as np
import scipy as sp
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import seaborn as sns

plt.rcParams['figure.figsize'] = (15, 5) # bigger plots
plt.style.use('fivethirtyeight')
%matplotlib inline
%precision 4

'%.4f'

In [2]:
helper_dir = str(Path('..').abspath())
if helper_dir not in os.sys.path:
    os.sys.path.append(helper_dir)
    
from leak_helpers.earth_engine import display_ee, get_boundary, tifs2np, bands_s2, download_image

In [3]:
# %load_ext autoreload
# %autoreload 2

In [4]:
# params
crs_grid = 3857 # texas central meters
satellite = 'COPERNICUS/S2'
notebook_name='image_testing_earth_engine_s2-AUTX_v6'
ts=arrow.utcnow().format('YYYYMMDD-HH-mm-ss')
data_dir = Path('../../data/')
bands = bands_s2

# since the lowest res band is 60m and I want to capture neighbours I should get 6+ pixels
pixel_length = 25.0
resolution_min = 10.0 # m
time_bin_delta = 60*60*24*28 # how long before a leak to look (in seconds)
# TODO get closest but let me filter for time

# init
# temp_dir = Path(tempfile.mkdtemp(prefix=notebook_name+'-', suffix='-'+ts))
temp_dir = Path('/tmp/{}'.format(notebook_name))
output_dir = Path('../../data/scraped_satellite_images/downloaded_images_{}_{}'.format(notebook_name,satellite.replace('/','-')))
cache_dir = output_dir.joinpath('cache')
output_dir.makedirs_p()
temp_dir.makedirs_p()
cache_dir.makedirs_p()

logger = logging.getLogger(notebook_name)
# logger.setLevel(logging.WARN)

crs_grid_proj = pyproj.Proj('+init=epsg:%s'%crs_grid)

temp_dir, output_dir, cache_dir

(Path('/tmp/image_testing_earth_engine_s2-AUTX_v6'),
 Path('../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2'),
 Path('../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache'))

In [5]:
metadata_file = output_dir.joinpath('script_metadata.json')

# write metadata to json
metadata = dict(
    pixel_length=pixel_length,
    resolution_min=resolution_min,
    bands=bands,
    ts=ts,
    notebook_name=notebook_name,
    crs_grid=crs_grid,
    cache_dir=str(cache_dir),
    temp_dir=str(temp_dir),
    output_dir=str(output_dir),
)
json.dump(metadata, open(metadata_file,'w'))

# earth engine

Setup instructions here
- first need to apply for an account and wait ~ 1day
- https://developers.google.com/earth-engine/python_install#setting-up-authentication-credentials

Refs:
- api https://developers.google.com/earth-engine/
- code examples https://code.earthengine.google.com/
- sentinel1 https://developers.google.com/earth-engine/sentinel1
    - `ee.ImageCollection('COPERNICUS/S2_GRD');`
    - `ee.ImageCollection('COPERNICUS/S1_GRD');`
- keras and google earth https://github.com/patrick-dd/landsat-landstats

In [6]:
# test earth-engine setup
from oauth2client import crypt # should have not error
import ee
ee.Initialize() # should give no errors, if so follow instructions


# test
image = ee.Image('srtm90_v4')
assert image.getInfo()=={'type': 'Image', 'properties': {'system:time_start': 950227200000, 'system:asset_size': 18827626666, 'system:time_end': 951177600000}, 'bands': [{'data_type': {'type': 'PixelType', 'max': 32767, 'min': -32768, 'precision': 'int'}, 'crs': 'EPSG:4326', 'id': 'elevation', 'dimensions': [432000, 144000], 'crs_transform': [0.000833333333333, 0.0, -180.0, 0.0, -0.000833333333333, 60.0]}], 'id': 'srtm90_v4', 'version': 1463778555689000}
print('ok')

# ee.Geometry.Point([117.21079620254062, -30.94712385398404])

ok


# Load leaks

In [7]:
# load wa leaks
leaks_ATX = gpd.read_file(data_dir.joinpath('leak_datasets/austin_leaks/derived/austin_leaks-repairs.geojson'))


# they have to be after launch
leaks_ATX = leaks_ATX[pd.to_datetime(leaks_ATX.COMPDTTM)>=pd.Timestamp('23 June 2015')]
len(leaks_ATX)

leaks_ATX['REPO_Date']=leaks_ATX['COMPDTTM']
leaks_ATX['leak_id']=leaks_ATX.OBJECTID.apply(lambda x:'ATX-%s'%x)
leaks=leaks_ATX

In [8]:
# choose one leak for now
leak = leaks.sample()
leak

Unnamed: 0,22,ADDRKEY,CITY,COMPDTTM,DESCRIPT,FullStreetName,INITDTTM,LOC,OBJECTID,PREDIR,...,STNAME,STNO,STSUB,SUFFIX,WONO,ZIP,geometry,id,REPO_Date,leak_id
22152,510083.0,608224.0,AUSTIN,2016-07-25T21:00:00,WATER SERVICE LEAK,2711 HILLVIEW GREEN LN,2016-07-25T21:00:00,,69638,,...,HILLVIEW GREEN,2711,,LN,1767141.0,78703-,POINT (-97.76605986741285 30.30257718194047),69638,2016-07-25T21:00:00,ATX-69638


# Fetching sentinal-1 and sentinel 2 images

For a leak repair, grab the image before and after it

Note roughly 10% have results for a 1 day temporal bin

In [9]:
def get_cached_ids():
    cache_dirs = [str(f.relpath(cache_dir)).split('_')[0] for f in cache_dir.listdir()]
    return cache_dirs

def init_cache(leak_id):
    """We will cache downloads in folders like 'id_after'"""
    if leak_id:
        cache_subdir = cache_dir.joinpath(leak_id+'_after')
        cache_subdir.makedirs_p()
        cache_subdir = cache_dir.joinpath(leak_id+'_before')
        cache_subdir.makedirs_p()
    return get_cached_ids()

For each point
- find the nearest image before the repair
- and the soonest image after repair
- save a part of each with metadata

Later we can filter, interpolate, and read into numpy arrays

In [10]:
distance = resolution_min*(pixel_length/2.0-0.5)

In [11]:
# test with one image
for i in (np.random.sample(5)*len(leaks)).astype(np.int):
    leak=leaks_ATX.iloc[[i]]
    leak_id = str(leak.OBJECTID.values[0])

    repo_date_ts = arrow.get(leak.REPO_Date.values[0]).timestamp
    boundary = get_boundary(leak, distance=distance)
    sentinel2_before = ee.ImageCollection(satellite)\
        .filterBounds(boundary)\
        .filterMetadata('CLOUDY_PIXEL_PERCENTAGE','less_than',30)\
        .filterDate(933828614605,1488776737937)\
        .sort('system:time_start', opt_ascending=False) # first will be latest
    image = ee.Image(sentinel2_before.first()).clip(boundary)
    image.getInfo()
    name=leak_id+'_after'
    path,files=download_image(
        image, 
        scale=resolution_min, 
        crs=crs_grid, 
        name=name,
        cache_dir=cache_dir
    )
    data = tifs2np(path,files,bands=bands)
    print(i,[(d.shape,d.sum()) for d in data])
    for d in data:
        assert d.shape[0]==pixel_length, 'the downloaded image is the wrong size, tweak distance'
        assert d.shape[1]==pixel_length
    assert np.sum(data)!=0, 'should not be empty (make sure you are using the right bands)'

1405 [((25, 25), 928118.0), ((25, 25), 789939.0), ((25, 25), 728441.0), ((25, 25), 705401.0), ((25, 25), 788108.0), ((25, 25), 1023536.0), ((25, 25), 1130338.0), ((25, 25), 1082823.0), ((25, 25), 1213326.0), ((25, 25), 413321.0), ((25, 25), 11618.0), ((25, 25), 1051113.0), ((25, 25), 759413.0), ((25, 25), 0.0)]
663 [((25, 25), 820624.0), ((25, 25), 662623.0), ((25, 25), 588262.0), ((25, 25), 525886.0), ((25, 25), 635569.0), ((25, 25), 947381.0), ((25, 25), 1085808.0), ((25, 25), 1065084.0), ((25, 25), 1191142.0), ((25, 25), 428699.0), ((25, 25), 10894.0), ((25, 25), 935971.0), ((25, 25), 635933.0), ((25, 25), 0.0)]
139 [((25, 25), 807594.0), ((25, 25), 647622.0), ((25, 25), 578721.0), ((25, 25), 503815.0), ((25, 25), 617403.0), ((25, 25), 972375.0), ((25, 25), 1109169.0), ((25, 25), 1083035.0), ((25, 25), 1219875.0), ((25, 25), 444426.0), ((25, 25), 15954.0), ((25, 25), 957910.0), ((25, 25), 644383.0), ((25, 25), 0.0)]
73 [((25, 25), 886434.0), ((25, 25), 755142.0), ((25, 25), 714171.0

In [None]:
cached_ids = get_cached_ids()

# logger = logging.getLogger()
# logger.setLevel(logging.WARN)

# for i in tqdm(range(len(leaks))):
def get_image_for_leak(i, cached_ids=cached_ids):    
    leak = leaks_ATX.iloc[[i]]
    repo_date_ts = arrow.get(leak.COMPDTTM.values[0]).timestamp
    
    
    # crappy way or recording that we tried this one
    leak_id = str(leak.OBJECTID.values[0])
    if leak_id in cached_ids:
        logger.info('Skipping cached download for leak id %s ',leak_id)
        return
    
    boundary = get_boundary(leak, distance=distance) #, epsg=crs_grid)
    
    
    # get image day before    
    sentinel2_before = ee.ImageCollection('COPERNICUS/S2')\
        .filterBounds(boundary)\
        .filterDate((repo_date_ts-time_bin_delta)*1000,(repo_date_ts)*1000)\
        .filterMetadata('CLOUDY_PIXEL_PERCENTAGE','less_than',30)\
        .sort('system:time_start', opt_ascending=False) # first will be latest
    
    results = sentinel2_before.size().getInfo()
    if results<1:
        logger.info('Error no results for day before %s',leak_id)
        cached_ids = init_cache(leak_id) # so we know there where no results
        return
        
    # get image day after
    sentinel2_after = ee.ImageCollection('COPERNICUS/S2')\
        .filterBounds(boundary)\
        .filterDate((repo_date_ts)*1000,(repo_date_ts+time_bin_delta*6)*1000)\
        .filterMetadata('CLOUDY_PIXEL_PERCENTAGE','less_than',30)\
        .sort('system:time_start', opt_ascending=True) # first will be earliest
        
    results = sentinel2_after.size().getInfo()
    if results<1:
        logger.info('Error no results for day after, id %s',leak_id)
        cached_ids = init_cache(leak_id) # so we know there where no results
        return
        
    # download as save images    
    logger.info('results for %s', leak_id)
    image = ee.Image(sentinel2_before.first()).clip(boundary)
    name=leak_id+'_before'
    path,files=download_image(
        image, 
        scale=resolution_min, 
        crs=crs_grid, 
        name=name,
        cache_dir=cache_dir
    )
    # also save metadata so we can filter by date
    with open(path.joinpath('metadata.json'), 'w') as fo:
        metadata = dict(
            image=image.getInfo(),
            scale=resolution_min,
            crs=crs_grid,
            name=name,
            distance=distance,
            leak=json.loads(leak.to_json())
        )
        json.dump(metadata, fo)

    image = ee.Image(sentinel2_after.first()).clip(boundary)
    name=leak_id+'_after'
    path,files=download_image(
        image, 
        scale=resolution_min, 
        crs=crs_grid, 
        name=name,
        cache_dir=cache_dir
    )
    with open(path.joinpath('metadata.json'), 'w') as fo:
        metadata = dict(
            image=image.getInfo(),
            scale=resolution_min,
            crs=crs_grid,
            name=name,
            distance=distance,
            leak=json.loads(leak.to_json())
        )
        json.dump(metadata, fo)
        
    return

for i in tqdm(range(len(leaks_ATX))):
# for i in tqdm(range(3480-10,3480)):
    try:
        get_image_for_leak(i)
    except urllib.error.HTTPError as e:
        print(i,e,'sleep')
        if e.code == 429:
             time.sleep(13);
    except Exception as e:
        print(i,e)
        ee.Initialize() # should give no errors, if so follow instructions
        pass




# load tiffs to arrays

In [None]:
# This loads it as X and y for machine learning, and also time and metadata so we can filter
import shapely
X = []
y = []
t = []
m = []
discarded=[]
for path in tqdm(cache_dir.listdir()):
    files = [file.relpath(path) for file in path.listdir() if file.endswith('.tif')]
    if files:
        # check metadata
        try:
            metadata = json.load(open(path.joinpath('metadata.json')))
        except (FileNotFoundError, ValueError) as e:
            path.move(path.replace(path.basename(),'.deleteme-'+str(uuid.uuid4())))
            if '_after_' in path: # also delete the before path                
                path_after = Path(path.replace('_after_','_before_'))
                if path_after.isdir():
                    path_after.move(path.replace(path.basename(),'.deleteme-'+str(uuid.uuid4())))
            logger.error('Invalid metadata.json, deleted folder %s, please rerun scraping cell to rescrape this image', path)
            continue
        
        # e.g. lets filter it so "before" image are only 1 day before
        if '_before_' in path.basename():
            yy = True
        else:
            yy = False
        
        # work out time gap too
        t1 = arrow.get(metadata['image']['properties']['system:time_end']/1000)
        t0 = arrow.get(metadata['leak']['features'][0]['properties']['REPO_Date'])
        td=t1-t0
        tt = td.total_seconds()
        
        # load data
        data = tifs2np(path,files,bands=bands)
             
        # check we don't have empty bands 1-13
        empty_bands = np.array([d.sum() for d in data])==0
        
        # lets check we didn't get the edge of an image
        bbox = np.array(metadata['image']['properties']['system:footprint']['coordinates'][0])
        loc = metadata['leak']['features'][0]['geometry']['coordinates']
        minx=bbox[:,0].min()
        maxx=bbox[:,0].max()
        miny=bbox[:,1].min()
        maxy=bbox[:,1].max()
        bbox_shp = shapely.geometry.box(
            minx=minx,
            maxx=maxx,
            miny=miny,
            maxy=maxy
        )
        loc_shp = shapely.geometry.Point(loc[0],loc[1])
        shapely.geometry.GeometryCollection([bbox_shp, loc_shp])
        try:
            assert loc_shp.intersects(bbox_shp), 'leak location should be inside image'
            assert bbox_shp.centroid.almost_equals(loc_shp, decimal=5), 'leak should be near center of image'
            assert (np.array([d.shape for d in data])==pixel_length).all(), 'image area should be the right amount of pixels'
            assert (maxx-minx)/(maxy-miny)<1.3, 'should be roughly square'
            assert (maxx-minx)/(maxy-miny)>0.7, 'should be roughly square'
            assert not empty_bands.all(), 'should not have all bands empty'
        except Exception as exc:
            print(path, exc)
#             raise(exc)
            discarded.append(path)
        else:
            X.append(data)
            y.append(yy)
            t.append(tt)
            m.append(metadata)
        

len(X), len(discarded)

Invalid metadata.json, deleted folder ../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/.deleteme-4fe1c440-d4db-472c-a06e-82c22095ef80, please rerun scraping cell to rescrape this image
Invalid metadata.json, deleted folder ../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/.deleteme-9bbfcf72-c2a7-4806-8350-8a5eadd2b7e4, please rerun scraping cell to rescrape this image
Invalid metadata.json, deleted folder ../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/.deleteme-ba39cc06-fa6b-4c55-81c0-1d01ce78563a, please rerun scraping cell to rescrape this image
Invalid metadata.json, deleted folder ../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/.deleteme-d8f62bac-7fad-4254-ae31-72552c895208, please rerun scraping cell to rescrape this image
Invalid metadata.json, deleted folder ../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cach

../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/67111_before_3857_10.0 should not have all bands empty
../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/69704_before_3857_10.0 leak location should be inside image
../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/68023_before_3857_10.0 should not have all bands empty
../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/68292_after_3857_10.0 leak location should be inside image
../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/68292_before_3857_10.0 leak location should be inside image
../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/68539_after_3857_10.0 leak should be near center of image
../data/downloaded_images_image_testing_earth_engine_s2-AUTX_v6_COPERNICUS-S2/cache/68539_before_3857_10.0 leak should be near center of image
../dat

In [None]:
# some leaks say REPO_data some say COMPDTTM I'll unify them
for mm in m:
    props = mm['leak']['features'][0]['properties']
    if 'REPO_Date' not in props:
        props['REPO_Date']=props['COMPDTTM']
    if 'leak_id' not in props:
        props['leak_id']='AU_%s'%props['id']

In [None]:
# # if there is an error rename the dir
# 2497 2739
# newname = path.basename().replace('_','-deleteme-')
# newpath=Path('/tmp/').joinpath(newname)
# print(path.basename(),newname)
# path.rename(newname)
# path.move(newpath)

In [None]:
# shuffle
from sklearn.utils import shuffle
X,y,m,t = shuffle(X,y,m,t,random_state=1337)

In [None]:
# save using hdf5 (so keras can easily load it) and json 
import h5py
h5file = output_dir.joinpath('data.h5')
with h5py.File(h5file, 'w') as h5f:
    h5f.create_dataset('X', data=X)
    h5f.create_dataset('y', data=y)
    h5f.create_dataset('t', data=t)

json.dump(m,open(output_dir.joinpath('data_metadata.json'),'w'))

with open(output_dir.joinpath('readme.md'),'w') as fo:
    fo. write("""
Files:
- ee_ee_scraping_earth_engine_sentinel_2-austin_leaks- cached tiff files
- script_metadata.json - information on scraping script
- data.h5 contains X, y, and t.
    - X: tiff files for each band loaded into an array of shape (Leak, Bands, width, length)
    - y: True for before the leak, False for after
    - t: time before leak (can be negative) in seconds
- data_metadata: array of metadata for each leak in X. Each contain info on leak, image, and image search
    
Loading: 
```py
# load
metadatas = json.load(open('data_metadata.json'))
with h5py.File('data.h5','r') as h5f:
    X2 = h5f['X'][:]
    y2 = h5f['y'][:]
    t2 = h5f['t'][:]
y
```
    """)

In [None]:
# test load
metadatas = json.load(open(output_dir.joinpath('data_metadata.json')))
with h5py.File(output_dir.joinpath('data.h5'),'r') as h5f:
    X2 = h5f['X'][:]
    y2 = h5f['y'][:]
    t2 = h5f['t'][:]
X2.shape, y2, t2, metadatas[0].keys()

In [None]:
output_dir