This notebook scrapes satellite images for each leak repair. For each location it gets an N×M rectangle around the leak before and after it was repaired. Then it collates all the data into h5 files and all the metadata into JSON files.

It takes days to run because of rate limiting on the Google Earth Engine API. Because of limited satellite coverage, you might find matches for only 10% of the leaks.

## Modifying

- Make sure Google Earth Engine is set up.
- Load the leaks so that they pass the asserts.
- Change the params.
- Run the rest of the cells.

In [1]:
from path import Path
import arrow
import json
import pytz
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import re, os, collections, itertools, uuid, logging
import tempfile
import tables

import zipfile
import urllib

import ee
import pyproj
import numpy as np
import scipy as sp
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import seaborn as sns

plt.rcParams['figure.figsize'] = (15, 5) # bigger plots
plt.style.use('fivethirtyeight')
%matplotlib inline
%precision 4

'%.4f'

In [2]:
# %load_ext autoreload
# %autoreload 2

In [3]:
# Make the project helpers importable: the parent directory of this notebook
# is added to the module search path (only once, so re-running the cell does
# not pile up duplicates) before `leak_helpers` is imported from it.
import sys  # use the real module rather than the undocumented `os.sys` alias

helper_dir = str(Path('..').abspath())
if helper_dir not in sys.path:
    sys.path.append(helper_dir)

from leak_helpers.earth_engine import display_ee, get_boundary, tifs2np, download_image, bands_s2, bands_s1

In [4]:

# --- run parameters -----------------------------------------------------------
notebook_name = 'testing_earth_engine-s1-AUTX_v3'
ts = arrow.utcnow().format('YYYYMMDD-HH-mm-ss')  # UTC timestamp of this run
crs_grid = 3857  # EPSG code passed to the downloader and to pyproj below
data_dir = Path('../../data/')
bands = bands_s1

# since the lowest res band is 60m and I want to capture neighbours I should get 6+ pixels
pixel_length = 25.0
resolution_min = 10.0  # m
time_bin_delta = 28 * 24 * 60 * 60  # how long before a leak to look (in seconds)
# TODO get closest but let me filter for time

# --- working directories ------------------------------------------------------
temp_dir = Path(tempfile.mkdtemp(prefix='%s-' % notebook_name, suffix='-%s' % ts))
# output_dir = data_dir.joinpath('{ts:}_{notebook_name:}'.format(ts=ts,notebook_name=notebook_name))
output_dir = Path('../../data/scraped_satellite_images/20170314-05-26-52_testing_earth_engine-s1-AUTX_v3')
cache_dir = output_dir.joinpath('ee_S1_AUTX-leaks_cache_v3')

# Create each directory if it does not exist yet.
for directory in (output_dir, temp_dir, cache_dir):
    directory.makedirs_p()

# --- logging and projection ---------------------------------------------------
logger = logging.getLogger(notebook_name)
logger.setLevel(logging.WARN)

crs_grid_proj = pyproj.Proj('+init=epsg:{}'.format(crs_grid))

# Display the directories this run will use.
temp_dir, output_dir, cache_dir

(Path('/tmp/testing_earth_engine-s1-AUTX_v3-v5sl3376-20170315-09-09-15'),
 Path('../data/20170314-05-26-52_testing_earth_engine-s1-AUTX_v3'),
 Path('../data/20170314-05-26-52_testing_earth_engine-s1-AUTX_v3/ee_S1_AUTX-leaks_cache_v3'))

In [5]:
# Record the scraping parameters next to the outputs, so the settings used to
# produce the files in `output_dir` can be looked up later.
# open hdf5 data files
# X_file = output_dir.joinpath('X_train.hdf5')
# y_file = output_dir.joinpath('y_train.hdf5')
metadata_file = output_dir.joinpath('metadata.json')

# write metadata to json
metadata = dict(
    pixel_length=pixel_length,
    resolution_min=resolution_min,
    bands=bands,
    ts=ts,
    notebook_name=notebook_name,
    crs_grid=crs_grid,
    cache_dir=str(cache_dir),
    temp_dir=str(temp_dir),
    output_dir=str(output_dir),
)
# A context manager guarantees the file is flushed and closed once written.
with open(metadata_file, 'w') as fo:
    json.dump(metadata, fo)

# write headers

# with tables.open_file(X_file, 'w') as xfo:
#     atom = tables.Atom.from_dtype(np.dtype('Float32'))
#     # create an expandable array
#     data_storage = xfo.create_earray(xfo.root, 'data', atom, (0,len(bands),pixel_length,pixel_length))
                                     

# # write headers for y
# with tables.open_file(y_file, 'w') as xfo:
#     atom = tables.Atom.from_dtype(np.dtype('int'))
#     # create an expandable array
#     data_storage = xfo.create_earray(xfo.root, 'data', atom, (0,))
                                    

# earth engine

Setup instructions:
- First apply for an Earth Engine account and wait about one day.
- Then follow https://developers.google.com/earth-engine/python_install#setting-up-authentication-credentials

Refs:
- api https://developers.google.com/earth-engine/
- code examples https://code.earthengine.google.com/
- sentinel1 https://developers.google.com/earth-engine/sentinel1
    - `ee.ImageCollection('COPERNICUS/S2_GRD');`
    - `ee.ImageCollection('COPERNICUS/S1_GRD');`
- keras and google earth https://github.com/patrick-dd/landsat-landstats

In [6]:
# test earth-engine setup
from oauth2client import crypt # should have not error
import ee
ee.Initialize() # should give no errors, if so follow instructions


# Smoke test: fetch the description of a public asset and check it round-trips.
# Only stable fields are compared; values such as `version` and
# `system:asset_size` change whenever the asset is re-ingested, so asserting on
# the whole dict would fail even though the setup is fine.
image = ee.Image('srtm90_v4')
info = image.getInfo()
assert info['type'] == 'Image', info
assert info['id'] == 'srtm90_v4', info
assert [band['id'] for band in info['bands']] == ['elevation'], info
print('ok')

# ee.Geometry.Point([117.21079620254062, -30.94712385398404])

ok


# Load leaks

In [7]:
# Load the Austin (ATX) leak repairs.
leaks_ATX = gpd.read_file(data_dir.joinpath('leak_datasets/austin_leaks/derived/austin_leaks-repairs.geojson'))


# they have to be after launch: keep repairs completed on or after the cutoff
s3_launch_ts = pd.Timestamp('3 Oct 2014')
completed_after_launch = pd.to_datetime(leaks_ATX.COMPDTTM) >= s3_launch_ts
# .copy() gives an independent frame, so columns added to it later are not
# written to a slice of the original (avoids pandas' SettingWithCopyWarning).
leaks_ATX = leaks_ATX[completed_after_launch].copy()
len(leaks_ATX)

4796

In [8]:
# Draw a single random repair record and display it.
leak = leaks_ATX.sample(n=1)
leak

Unnamed: 0,22,ADDRKEY,CITY,COMPDTTM,DESCRIPT,FullStreetName,INITDTTM,LOC,OBJECTID,PREDIR,QTYCALLS,STNAME,STNO,STSUB,SUFFIX,WONO,ZIP,geometry,id
21545,489752.0,387141.0,AUSTIN,2016-05-02T23:15:00,WATER SERVICE LEAK,4701 WILD BRIAR PASS,2016-05-02T17:11:00,,69032,,2,WILD BRIAR,4701,,PASS,1746780.0,78746-,POINT (-97.82243430021848 30.27710674017273),69031


In [9]:
# Repair date under the column name the rest of the notebook reads.
leaks_ATX['REPO_Date'] = leaks_ATX['COMPDTTM']
# City-prefixed identifier built from OBJECTID.
leaks_ATX['leak_id'] = ['ATX_%s' % object_id for object_id in leaks_ATX.OBJECTID]
# # some leaks say REPO_data some say COMPDTTM I'll unify them
# for mm in m:
#     props = mm['leak']['features'][0]['properties']
#     if 'REPO_Date' not in props:
#         props['REPO_Date']=props['COMPDTTM']
#     if 'leak_id' not in props:
#         props['leak_id']='AU_%s'%props['id']
# leaks_ATX

# Fetching Sentinel-1 and Sentinel-2 images

For each leak repair, grab the image before it and the image after it.

Note: roughly 10% of leaks have results when using a 1-day temporal bin.

In [10]:
def get_cached_ids():
    """Return the id prefix of every entry currently inside ``cache_dir``.

    Each entry's path relative to ``cache_dir`` is cut at its first
    underscore, so an id is listed once per cache entry that starts with it.
    """
    leak_ids = []
    for entry in cache_dir.listdir():
        entry_name = str(entry.relpath(cache_dir))
        leak_ids.append(entry_name.split('_')[0])
    return leak_ids

def init_cache(leak_id):
    """Create the '<leak_id>_after' and '<leak_id>_before' cache folders.

    Nothing is created when ``leak_id`` is falsy. In every case the current
    list from ``get_cached_ids()`` is returned.
    """
    if not leak_id:
        return get_cached_ids()
    for suffix in ('_after', '_before'):
        cache_dir.joinpath(leak_id + suffix).makedirs_p()
    return get_cached_ids()

For each point
- find the nearest image before the repair
- and the soonest image after repair
- save a part of each with metadata

Later we can filter, interpolate, and read into numpy arrays

In [11]:
# Distance handed to get_boundary(): (pixel_length / 2 - 0.5) pixels, each
# resolution_min metres wide -- presumably the half-width of the square patch
# measured from the centre pixel (TODO confirm against get_boundary).
distance = (pixel_length / 2.0 - 0.5) * resolution_min

In [12]:


# test with one image
# Smoke-test the download pipeline on a handful of leaks before starting the
# multi-day scrape: each downloaded band must be pixel_length x pixel_length
# and the stack must not be all zeros.
for i in [10,50,1000,2000]:
    leak = leaks_ATX.iloc[[i]]
    leak_id = str(leak.OBJECTID.values[0])

    boundary = get_boundary(leak, distance=distance)
    # Any image will do for this check, so take the most recent one inside a
    # wide, fixed window (the bounds are epoch milliseconds).
    sentinel1_latest = ee.ImageCollection('COPERNICUS/S1_GRD')\
        .filterBounds(boundary)\
        .filterDate(933828614605,1488776737937)\
        .sort('system:time_start', opt_ascending=False) # first will be latest
    image = ee.Image(sentinel1_latest.first()).clip(boundary)
    # Result unused; presumably kept so that a missing image fails here rather
    # than inside the download -- TODO confirm.
    image.getInfo()
    name = leak_id+'_after'
    # Download into the scratch directory, not the real cache: a test image
    # stored as '<id>_after' in cache_dir would mark the leak as already
    # scraped and leave a folder without metadata.json for the parsing step.
    path,files = download_image(
        image,
        scale=resolution_min,
        crs=crs_grid,
        name=name,
        cache_dir=temp_dir
    )
    data = tifs2np(path,files,bands=bands_s1)
    print(i,[(d.shape,d.sum()) for d in data])
    for d in data:
        assert d.shape[0]==pixel_length, 'the downloaded image is the wrong size, tweak distance'
        assert d.shape[1]==pixel_length
    assert np.sum(data)!=0,'should not be empty (make sure you are using the right bands)'

10 [((25, 25), -4167.8394), ((25, 25), 0.0), ((25, 25), -8604.7656), ((25, 25), 0.0), ((25, 25), 25723.6)]
50 [((25, 25), -5057.6924), ((25, 25), 0.0), ((25, 25), -7674.0732), ((25, 25), 0.0), ((25, 25), 25614.271)]
1000 [((25, 25), -6142.1606), ((25, 25), 0.0), ((25, 25), -10457.014), ((25, 25), 0.0), ((25, 25), 25792.426)]
2000 [((25, 25), -5779.6787), ((25, 25), 0.0), ((25, 25), -8787.1338), ((25, 25), 0.0), ((25, 25), 25433.117)]


In [12]:
# From here on `logger` is the root logger, limited to warnings and above.
logger = logging.getLogger()
logger.setLevel(logging.WARN)

# Ids that already have an entry in the cache directory.
cached_ids = get_cached_ids()

def _sentinel1_in_window(boundary, start_ts, end_ts, ascending):
    """Sentinel-1 GRD images over ``boundary`` between two epoch-second times, sorted by start time."""
    return ee.ImageCollection('COPERNICUS/S1_GRD')\
        .filterBounds(boundary)\
        .filterDate(start_ts*1000, end_ts*1000)\
        .sort('system:time_start', opt_ascending=ascending)


def _record_attempt(leak_id, cached_ids):
    """Create the placeholder cache folders for ``leak_id`` and remember the id in ``cached_ids``."""
    init_cache(leak_id)
    if leak_id not in cached_ids:
        cached_ids.append(leak_id)


def _download_with_metadata(image, name, leak):
    """Download ``image`` into ``cache_dir`` as ``name`` and write a metadata.json next to the tiffs."""
    path, files = download_image(
        image,
        scale=resolution_min,
        crs=crs_grid,
        name=name,
        cache_dir=cache_dir
    )
    # Build the record before opening the file: image.getInfo() talks to the
    # server, and a failure there must not leave an empty metadata.json behind.
    metadata = dict(
        image=image.getInfo(),
        scale=resolution_min,
        crs=crs_grid,
        name=name,
        distance=distance,
        leak=json.loads(leak.to_json())
    )
    # also save metadata so we can filter by date
    with open(path.joinpath('metadata.json'), 'w') as fo:
        json.dump(metadata, fo)
    return path, files


def get_image_for_leak(i, cached_ids=cached_ids):
    """Download the Sentinel-1 image just before and just after one leak repair.

    Parameters
    ----------
    i : int
        Positional index of the leak in ``leaks_ATX``.
    cached_ids : list of str
        Ids that were already attempted. Leaks in this list are skipped. When
        no image is found the id is appended to the list in place, so later
        calls sharing the list see it as attempted.

    Returns
    -------
    None
        Results are written under ``cache_dir`` as '<id>_before...' and
        '<id>_after...' folders, each with a metadata.json. If either search
        window is empty, nothing is downloaded and only the placeholder
        folders from ``init_cache`` are created.

    The "before" window is the ``time_bin_delta`` seconds preceding the repair
    date (latest image wins); the "after" window is the following
    ``6 * time_bin_delta`` seconds (earliest image wins).
    """
    leak = leaks_ATX.iloc[[i]]
    repo_date_ts = arrow.get(leak.REPO_Date.values[0]).timestamp
    if callable(repo_date_ts):
        # arrow >= 1.0 turned `timestamp` into a method returning a float
        repo_date_ts = int(repo_date_ts())

    # crappy way or recording that we tried this one
    leak_id = str(leak.OBJECTID.values[0])
    if leak_id in cached_ids:
        logger.info('Skipping cached download for leak id %s ',leak_id)
        return

    boundary = get_boundary(leak.geometry, distance=distance)

    # get image day before (sorted descending: first will be latest)
    sentinel2_before = _sentinel1_in_window(
        boundary, repo_date_ts-time_bin_delta, repo_date_ts, ascending=False)
    if sentinel2_before.size().getInfo()<1:
        logger.info('Error no results for day before %s',leak_id)
        _record_attempt(leak_id, cached_ids) # so we know there where no results
        return

    # get image day after (sorted ascending: first will be earliest)
    sentinel2_after = _sentinel1_in_window(
        boundary, repo_date_ts, repo_date_ts+time_bin_delta*6, ascending=True)
    if sentinel2_after.size().getInfo()<1:
        logger.info('Error no results for day after, id %s',leak_id)
        _record_attempt(leak_id, cached_ids) # so we know there where no results
        return

    # download and save images
    logger.info('results for %s', leak_id)
    image = ee.Image(sentinel2_before.first()).clip(boundary)
    _download_with_metadata(image, leak_id+'_before', leak)

    image = ee.Image(sentinel2_after.first()).clip(boundary)
    _download_with_metadata(image, leak_id+'_after', leak)
        
# Scrape every leak in turn. A failure on one leak is printed and skipped, and
# the Earth Engine session is re-initialised before moving on to the next.
n_leaks = len(leaks_ATX)
for leak_index in tqdm(range(n_leaks)):
    try:
        get_image_for_leak(leak_index)
    except Exception as err:
        print('exception running get_image_for_leak:',err)
        ee.Initialize()




# parsing tiffs

In [37]:
# Read every cached download back into memory, label it, and drop images that
# fail basic sanity checks (leak not centred, patch not square, empty bands).
import shapely.geometry  # explicit submodule import; also binds the `shapely` name

X = []          # band arrays, one entry per kept image
y = []          # True when the cache folder name contains '_before_'
m = []          # metadata.json contents of each kept image
t = []          # seconds from the repair date to the image end time
discarded = []  # folders that failed one of the checks below
for path in tqdm(cache_dir.listdir()):
    files = [file.relpath(path) for file in path.listdir() if file.endswith('.tif')]
    if not files:
        # no tiffs in this folder, nothing to parse
        continue

    # check metadata
    try:
        with open(path.joinpath('metadata.json')) as fo:
            metadata = json.load(fo)
    except Exception as e:
        # say which folder is broken, then re-raise with the original traceback
        print(path, e)
        raise

    yy = '_before_' in path.basename()

    # system:time_end is in milliseconds
    t1 = arrow.get(metadata['image']['properties']['system:time_end']/1000)
    t0 = arrow.get(metadata['leak']['features'][0]['properties']['REPO_Date'])
    td = t1-t0
    tt = td.total_seconds()

    # load data
    data = tifs2np(path,files,bands=bands_s1)

    # number of bands whose pixels sum to zero
    empty_bands = (data.sum(-1).sum(-1)==0).sum()

    # now if the size is wrong let's interp it
#     if data.shape[-2]!=pixel_length or data.shape[-1]!=pixel_length:
#         data = np.array([sp.misc.imresize(x,size=(pixel_length,pixel_length),interp='cubic', mode='F') for x in data])

    # lets check we didn't get the edge of an image
    bbox = np.array(metadata['image']['properties']['system:footprint']['coordinates'][0])
    loc = metadata['leak']['features'][0]['geometry']['coordinates']
    minx = bbox[:,0].min()
    maxx = bbox[:,0].max()
    miny = bbox[:,1].min()
    maxy = bbox[:,1].max()
    bbox_shp = shapely.geometry.box(
        minx=minx,
        maxx=maxx,
        miny=miny,
        maxy=maxy
    )
    loc_shp = shapely.geometry.Point(loc[0],loc[1])
    try:
        assert loc_shp.intersects(bbox_shp), 'leak location should be inside image'
        assert bbox_shp.centroid.almost_equals(loc_shp, decimal=5), 'leak should be near center of image'
#         assert bbox_shp.area>4.5e-06, 'image area should be the right amount of pixels'
        assert (maxx-minx)/(maxy-miny)<1.3, 'should be roughly square'
        assert (maxx-minx)/(maxy-miny)>0.7, 'should be roughly square'
        assert empty_bands<3, 'non qc bands should not be empty'
    except Exception as exc:
        # any failed check (or a zero-height footprint) discards the image
        print(path, exc)
        discarded.append(path)
    else:
        X.append(data)
        y.append(yy)
        t.append(tt)
        m.append(metadata)


X = np.array(X)
y = np.array(y)
len(X)




3696

In [14]:
# # for broken dirs
# path.rename(output_dir.joinpath('deleteme'+str(uuid.uuid4())))

In [15]:
# shuffle
from sklearn.utils import shuffle
# `t` is shuffled in the same call so it stays aligned row-for-row with X, y
# and m (one permutation is applied to every sequence passed in).
X, y, m, t = shuffle(X, y, m, t, random_state=1337)

In [16]:
# Debug aid: display the current value of `path` -- presumably the last cache
# folder visited by the parsing loop (TODO confirm; remove once no longer needed).
path

Path('../data/20170314-05-26-52_testing_earth_engine-s1-AUTX_v3/ee_S1_AUTX-leaks_cache_v3/70617_before_3857_10.0')

In [17]:
# # which bands do we have?
# a=np.array([x.sum(-1).sum(-1)==0 for x in X])
# print('amount of each band',list(zip(bands_s1,a.sum(0))))
# print('mean amount of bands',a.sum(1).mean())

In [18]:
# what resolutions? all 10m, that's good!
# Tally the `resolution_meters` property reported for every kept image.
resolution_counts = collections.Counter(
    mm['image']['properties']['resolution_meters'] for mm in m
)
resolution_counts

Counter({10.0000: 3696})

In [19]:
# save using hdf5 (so keras can easily load it) and json 
import h5py
h5file = output_dir.joinpath('data.h5')
with h5py.File(h5file, 'w') as h5f:
    h5f.create_dataset('X', data=X)
    h5f.create_dataset('y', data=y)
#     h5f.create_dataset('t', data=t)

# Per-image metadata, in the same order as the rows of X and y.
with open(output_dir.joinpath('data_metadata.json'), 'w') as fo:
    json.dump(m, fo)

# The readme describes only what is written above: `t` is not stored in
# data.h5, so it is neither listed nor loaded in the example.
with open(output_dir.joinpath('readme.md'),'w') as fo:
    fo.write("""
Files:
- ee_S1_AUTX-leaks_cache_v3 - cached tiff files
- metadata.json - information on scraping script
- data.h5 contains X and y.
    - X: tiff files for each band loaded into an array of shape (Leak, Bands, width, length)
    - y: True for the image before the leak repair, False for after
- data_metadata.json: array of metadata for each leak in X. Each contain info on leak, image, and image search
    
Loading: 
```py
# load
import json
import h5py
metadatas = json.load(open('data_metadata.json'))
with h5py.File('data.h5','r') as h5f:
    X2 = h5f['X'][:]
    y2 = h5f['y'][:]
```
    """)

In [20]:
# test load
# Read both output files back to check they were written correctly; the JSON
# file is opened with a context manager so its handle is closed afterwards.
with open(output_dir.joinpath('data_metadata.json')) as fo:
    metadatas = json.load(fo)
with h5py.File(output_dir.joinpath('data.h5'),'r') as h5f:
    X2 = h5f['X'][:]
    y2 = h5f['y'][:]
X2.shape, y2, metadatas[0].keys()

((3696, 5, 25, 25),
 array([ True, False,  True, ..., False,  True, False], dtype=bool),
 dict_keys(['name', 'leak', 'image', 'distance', 'scale', 'crs']))

# test deleteme

In [21]:
# to disable gpu, so I can do large predictions in memory
os.environ['CUDA_VISIBLE_DEVICES'] = ""

# Put the current directory on the module search path (once) so the
# `leak_helpers` imports that follow can be resolved from it.
helper_dir = str(Path('.').abspath())
already_on_path = helper_dir in os.sys.path
if not already_on_path:
    os.sys.path.append(helper_dir)

from leak_helpers.earth_engine import display_ee, get_boundary, tifs2np, bands_s2, download_image, bands_s2
from leak_helpers.geometry import diffxy, resample_polygon
from leak_helpers.modelling import ImageDataGenerator, dice_coef_loss
from leak_helpers.visualization import imshow_bands
from leak_helpers.analysis import parse_classification_report, find_best_dummy_classification, calculate_result_class
from leak_helpers.modelling.filters import is_not_cloudy, is_not_center_cloudy, is_image_within, is_leak, filter_split_data, is_not_dup, hash_rows, normalise_bands

Using TensorFlow backend.


In [22]:
# Spot-check one record: seconds between the image's end time and the repair
# date (positive when the image was taken before the repair).
md = m[2]
image_end_ms = md['image']['properties']['system:time_end']
leak_properties = md['leak']['features'][0]['properties']
t_image = arrow.get(image_end_ms / 1000)
t_leak = arrow.get(leak_properties['REPO_Date'])
seconds_before_leak = (t_leak - t_image).total_seconds()
seconds_before_leak#<60*60*24*4
# [is_image_within(mm, 60*60*24*4) for mm in m]

687040.0200

In [46]:
# Keep only samples that (a) have data in at least three bands, (b) were
# imaged within three days of the repair and (c) are not duplicates.
# NOTE: this cell overwrites X, y and m, so re-running it filters again.

# (a) count the bands whose pixels do not sum to zero. The previous expression
# counted *empty* bands and required more than one, which contradicts the name.
non_empty_bands = np.array([x.sum(-1).sum(-1) != 0 for x in X]).sum(1)
a = has_three_bands = non_empty_bands >= 3
b = np.asarray([is_image_within(mm,60*60*24*3) for mm in m])
c = is_not_dup(X)
keep = a&b&c

X = X[keep]
y = y[keep]
m = [mm for mm, is_kept in zip(m, keep) if is_kept]
print('kept',keep.sum(),'of',len(keep))


kept 2176 of 3696


In [48]:
from sklearn.model_selection import train_test_split
# Fixed seed (the same one used for the shuffle) so the split, and therefore
# the scores reported below, can be reproduced on a re-run.
X_train, X_test, y_train, y_test, metadata_train, metadata_test = train_test_split(
        X, y, m, random_state=1337)

# Flatten each sample to one feature vector for the scikit-learn models.
X_train2 = X_train.reshape((len(X_train),-1))
X_test2 = X_test.reshape((len(X_test),-1))

In [49]:
# Baseline: score 50 dummy classifiers on the filtered data and display the
# resulting table for comparison with the real model.
target_names = ['noleak', 'leak']
dummy_results = find_best_dummy_classification(X, y, n=50, target_names=target_names)
df_dummies, best_dummy = dummy_results
df_dummies

  mcc = cov_ytyp / np.sqrt(var_yt * var_yp)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,report,strategy,score,matthews_corrcoef
207,precision recall f1-score supp...,classifier_stratified,0.675345,0.091104
15,precision recall f1-score supp...,classifier_stratified,0.690658,0.087100
25,precision recall f1-score supp...,classifier_uniform,0.493109,0.077541
157,precision recall f1-score supp...,classifier_uniform,0.486983,0.074884
81,precision recall f1-score supp...,classifier_stratified,0.675345,0.069670
255,precision recall f1-score supp...,classifier_stratified,0.683002,0.060724
235,precision recall f1-score supp...,classifier_uniform,0.502297,0.058087
175,precision recall f1-score supp...,classifier_uniform,0.502297,0.053772
199,precision recall f1-score supp...,classifier_uniform,0.520674,0.053003
133,precision recall f1-score supp...,classifier_uniform,0.491577,0.050116


In [54]:
# Fit a random forest on the flattened images and report accuracy, the
# Matthews correlation coefficient and a per-class report on the test split.
import sklearn.ensemble
import sklearn.metrics  # used below; do not rely on it being imported as a side effect
thresh=0.5
clf = sklearn.ensemble.RandomForestClassifier(
    n_estimators=200, 
    criterion='entropy',
#     max_depth=None, 
    min_samples_split=6, 
    min_samples_leaf=6,
#     max_features='auto', 
    bootstrap=True,
#         random_state=0,
#     n_jobs=4, 
)

clf.fit(X_train2, y_train) 

y_pred = clf.predict(X_test2)
score = clf.score(X_test2, y_test)

# Binarise labels and predictions at `thresh` once and reuse for both metrics.
y_test_bin = y_test > thresh
y_pred_bin = y_pred > thresh
matthews_corrcoef = sklearn.metrics.matthews_corrcoef(y_test_bin, y_pred_bin)
print(sklearn.metrics.classification_report(y_test_bin, y_pred_bin, target_names=target_names))
score,matthews_corrcoef

             precision    recall  f1-score   support

     noleak       0.79      1.00      0.88       429
       leak       0.00      0.00      0.00       115

avg / total       0.62      0.79      0.70       544



  mcc = cov_ytyp / np.sqrt(var_yt * var_yp)
  'precision', 'predicted', average, warn_for)


(0.7886, 0.0000)