Initial imports.

In [448]:
from osgeo import osr
import datacube
from datetime import datetime
from madmex.models import Object
import os
import django
import math
import json
from rasterio.features import rasterize
from django.contrib.gis.geos import Polygon, Point
from affine import Affine
import scipy.ndimage
from sklearn.ensemble import RandomForestClassifier
from datacube.storage import masking
import numpy    
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "madmex.settings")
django.setup()

Create a workspace for our test.

In [422]:
dc = datacube.Datacube(app = 'load_test')

Two points that define a bounding box.

In [425]:
min_lon = -102.925
min_lat = 20.913
max_lon = -102.92
max_lat = 20.918

#min_lon = -103.0
#min_lat = 20.9
#max_lon = -102.9
#max_lat = 21.0

In [462]:
sr = dc.load(product='ls8_espa_mexico', 
             longitude=(min_lon, max_lon), 
             latitude=(min_lat, max_lat),
             time=(datetime(2017, 4, 1), datetime(2017, 5, 1)), 
             group_by='solar_day')
sr.geobox

GeoBox(18, 19, Affine(30.0, 0.0, 2404230.0,
       0.0, -30.0, 996080.0), PROJCS["unnamed",GEOGCS["WGS 84",DATUM["unknown",SPHEROID["WGS84",6378137,6556752.3141]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["standard_parallel_1",17.5],PARAMETER["standard_parallel_2",29.5],PARAMETER["latitude_of_origin",12],PARAMETER["central_meridian",-102],PARAMETER["false_easting",2500000],PARAMETER["false_northing",0]])

We use our bounding box to create a polygon that will be used to perform a query and retrieve objects from the database that fall in it (the bounding box is small so its easier to inspect what is happening). 

In [426]:
query_polygon = Polygon(((min_lon, max_lat), 
                         (max_lon, max_lat), 
                         (max_lon, min_lat), 
                         (min_lon, min_lat), 
                         (min_lon, max_lat)))

We define two points that will be used to create the affine transform. To create the points a srid must be specified.

In [427]:
ul_point = Point(min_lon, max_lat, srid=4326)
br_point = Point(max_lon, min_lat, srid=4326)

print(query_polygon.touches(ul_point))
print(query_polygon.touches(br_point))

print(ul_point.transform(sr.crs.wkt,clone=True))
print(br_point.transform(sr.crs.wkt,clone=True))




True
True
POINT (2404243.676082415 996068.6591963242)
POINT (2404757.716297808 995511.2139618103)


Load the objects from the database filtering with the bounding box that we defined earlier. Then the objects are ennumerated and tuples (geojson, index) are added to an empty array called shapes. The indexes are shifted by one because the default fill of the rasterize function is 0 so we don't want to have an object indexed with the 0 tag. In another array we save the tags relative to each object, in this case we are using the "level_1" tag.

In [430]:
tag_key = "level_1"
shapes = []
tags = []
for p, obj in enumerate(Object.objects.filter(the_geom__contained=query_polygon)):
    shapes.append((json.loads(obj.the_geom.transform(sr.crs.wkt,clone=True).geojson), p + 1))
    for tag in obj.tags.all():
        if tag.key == tag_key:
            tags.append(tag.value)

print(len(shapes)) 

(0.003) SELECT "madmex_object"."id", "madmex_object"."the_geom"::bytea, "madmex_object"."added" FROM "madmex_object" WHERE "madmex_object"."the_geom" @ ST_GeomFromEWKB('\x0103000020e610000001000000050000003333333333bb59c05eba490c02eb34407b14ae47e1ba59c05eba490c02eb34407b14ae47e1ba59c07d3f355ebae934403333333333bb59c07d3f355ebae934403333333333bb59c05eba490c02eb3440'::bytea); args=(<django.contrib.gis.db.backends.postgis.adapter.PostGISAdapter object at 0x7f8262c76d68>,)
(0.001) SELECT "madmex_tag"."id", "madmex_tag"."key", "madmex_tag"."value" FROM "madmex_tag" INNER JOIN "madmex_object_tags" ON ("madmex_tag"."id" = "madmex_object_tags"."tag_id") WHERE "madmex_object_tags"."object_id" = 85840; args=(85840,)
(0.001) SELECT "madmex_tag"."id", "madmex_tag"."key", "madmex_tag"."value" FROM "madmex_tag" INNER JOIN "madmex_object_tags" ON ("madmex_tag"."id" = "madmex_object_tags"."tag_id") WHERE "madmex_object_tags"."object_id" = 85841; args=(85841,)
(0.001) SELECT "madmex_tag"."id", "madmex_t

(0.001) SELECT "madmex_tag"."id", "madmex_tag"."key", "madmex_tag"."value" FROM "madmex_tag" INNER JOIN "madmex_object_tags" ON ("madmex_tag"."id" = "madmex_object_tags"."tag_id") WHERE "madmex_object_tags"."object_id" = 85915; args=(85915,)
(0.001) SELECT "madmex_tag"."id", "madmex_tag"."key", "madmex_tag"."value" FROM "madmex_tag" INNER JOIN "madmex_object_tags" ON ("madmex_tag"."id" = "madmex_object_tags"."tag_id") WHERE "madmex_object_tags"."object_id" = 85916; args=(85916,)
(0.001) SELECT "madmex_tag"."id", "madmex_tag"."key", "madmex_tag"."value" FROM "madmex_tag" INNER JOIN "madmex_object_tags" ON ("madmex_tag"."id" = "madmex_object_tags"."tag_id") WHERE "madmex_object_tags"."object_id" = 85926; args=(85926,)
(0.001) SELECT "madmex_tag"."id", "madmex_tag"."key", "madmex_tag"."value" FROM "madmex_tag" INNER JOIN "madmex_object_tags" ON ("madmex_tag"."id" = "madmex_object_tags"."tag_id") WHERE "madmex_object_tags"."object_id" = 85927; args=(85927,)
(0.001) SELECT "madmex_tag"."id"

40


An affine transform is defined to feed the rasterize function. It is needed to provide with dimension and resolution to the output mask. The coordinates that define our transformation must be in the same projection as our data, in this case I take the crs from our datacube data, and get it's well known text representation to transform our points.

In [432]:
ul_point = Point(min_lon, max_lat, srid=4326).transform(sr.crs.wkt,clone=True)
br_point = Point(max_lon, min_lat, srid=4326).transform(sr.crs.wkt,clone=True)

size_x = sr.dims['x']
size_y = sr.dims['y']

ulx = ul_point[0]
uly = ul_point[1]
brx = br_point[0]
bry = br_point[1]

shifted_affine = Affine((brx - ulx) / size_x, 0, ulx, 0, (bry-uly) / size_y, uly)

The objects are rasterized into a numpy array. Should be noticed that we have less objects after the rasterizing. I am not sure what is happening, I tested several things and my strongest hipotesis is that at this resolution, some objects overlap and they are overwritten depending on the moment in which the values are burnt.

In [445]:
print('number of objects from the database: %s' % len(shapes))
mask = rasterize(shapes, out_shape=(size_x,size_y), transform = shifted_affine, all_touched=False)
print('number of objects from the mask: %s' % len(numpy.unique(mask)))
print(mask)

number of objects from the database: 40
number of objects from the mask: 34
[[ 0  0  0  6  0  0  0  0  0 14  0  0  0 12  0  0  0  0  0]
 [ 0  0  6  6  0  0  0  0 14  0  0 12 12 12 12  0  0  0  0]
 [ 0  0  1  1  0  0  0  3 14  2  0  0  0 12 15  0 15  0  0]
 [ 5 13 29  0  0  0  0  3 14  0  0  0 12 15 12  0  0  0  0]
 [ 5  5 29  0 22 22 22  0  0  0 21  0  0 15 15  0  0  0  0]
 [ 5  0  0 29 22 22 23  0  0  0  0 21 21  0 15 15  0  0  0]
 [ 0  0  0 29 24 24 23  0  0  0 21 21  0  0  0  0  0  0  0]
 [ 0  0  0  0 29 24 24 24 24 39 39  7 11 11  8  8  0  0  0]
 [ 0  0  0 29 25 25 29 24 10 10 39  9 11 32 31 31 39  0  0]
 [ 0  0  0 29 25  0 19 39 24 10 39 32 32 32 31 39 39  0  0]
 [ 0  0  0 29  0  0 39 28 28 28 39 39 32 31 39 39 39  0  0]
 [ 0  0  0  0 18  0 19 39 28 39 32 32 32 31 39 39 39  0  0]
 [ 0  0  0 18 18  0 19 19 39 39 39 32 32 30 39  0  0  0  0]
 [ 0  0  0  0 35 35 35 35 19 39 39 32 32 32 39 33  0  0  0]
 [ 0  0  0  0  0 35  0  0 35 39 34 32 39 39 33 33  0  0  0]
 [ 0 38 38  0 35 35  0  

Remember that we shifted the object indexes by one? When we retrieve the tags from our array we need to shift back to get the values. (This is a bit hacky, there should be another way)

In [None]:
new_tags = [tags[index-1] for index in numpy.unique(mask)]

We build our X and y objects by masking the surface reflectance object using the pixel_qa and taking the mean of the pixels using the time dimension. We then apply the zonal statistics for the mean using the mask that we got from the objects.

In [438]:
clear = masking.make_mask(sr.pixel_qa, clear=True)
sr_clear = sr.where(clear)
sr_clear_mean = sr_clear.mean('time')


index = numpy.unique(mask)
labels = numpy.transpose(mask)

X = numpy.transpose(numpy.array([scipy.ndimage.measurements.mean(sr_clear_mean.red.values, labels, index), 
                                 scipy.ndimage.measurements.mean(sr_clear_mean.blue.values, labels, index),
                                 scipy.ndimage.measurements.mean(sr_clear_mean.green.values, labels, index),
                                 scipy.ndimage.measurements.mean(sr_clear_mean.nir.values, labels, index),
                                 scipy.ndimage.measurements.mean(sr_clear_mean.swir1.values, labels, index),
                                 scipy.ndimage.measurements.mean(sr_clear_mean.swir2.values, labels, index)]))

mapping = numpy.unique(new_tags, return_inverse=True)
y = mapping[1]

  if np.issubdtype(dtype, float):
  elif np.issubdtype(dtype, int):


A random forest classifier is trained with that data.

In [439]:
clf = RandomForestClassifier(random_state=0,oob_score=True)
clf.fit(numpy.nan_to_num(X),numpy.nan_to_num(y))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

A testing array is produced from the very same raster that we retrieved from the datacube. We will predict on this raster at a pixel level. It is just a test as it is kind of sneaky to use the same set to predict.

In [440]:
X_test = numpy.transpose(numpy.array([sr_clear_mean.red.values, 
                                      sr_clear_mean.blue.values,
                                      sr_clear_mean.green.values,
                                      sr_clear_mean.nir.values,
                                      sr_clear_mean.swir1.values,
                                      sr_clear_mean.swir2.values,]))

test_shape = X_test.shape

This is the array that we obtain after the prediction.

In [464]:
prediction = clf.predict(numpy.nan_to_num(numpy.reshape(X_test, (test_shape[0]*test_shape[1],test_shape[2])))).reshape(test_shape[0],test_shape[1])
print(prediction)

mapping[0]

[[2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 3 2]
 [2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2]
 [2 2 2 2 2 2 3 2 2 3 2 2 2 2 2 2 2 2 1]
 [2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2]
 [2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
 [2 2 1 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2]
 [2 2 2 1 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2]
 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 2]
 [2 2 2 2 2 2 2 2 2 1 3 3 2 2 2 2 2 3 2]
 [2 2 2 2 2 2 2 2 2 0 2 2 2 2 3 3 2 2 2]
 [2 2 2 2 2 2 2 3 2 1 2 2 2 3 2 2 2 2 2]
 [2 2 2 1 2 2 2 3 2 1 2 3 2 3 2 2 2 2 2]
 [3 2 2 1 1 1 2 2 1 2 3 2 2 2 3 2 2 2 2]
 [3 3 2 1 2 2 2 2 2 3 0 2 1 3 2 2 2 2 2]
 [2 3 2 2 2 3 3 3 1 3 0 2 3 2 2 2 2 2 2]
 [2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2]]


array(['agua', 'otros', 'praderas', 'tierras forestales'], dtype='<U18')

We use the training data to create a raster with the tags to compare with our result. The no data value is -9.

In [463]:
final = numpy.array([[numpy.where(mapping[0] == tags[e-1])[0][0] if e > 0 else -9 for e in row ] for row in mask])
print(final)
        


[[-9 -9 -9  2 -9 -9 -9 -9 -9  2 -9 -9 -9  2 -9 -9 -9 -9 -9]
 [-9 -9  2  2 -9 -9 -9 -9  2 -9 -9  2  2  2  2 -9 -9 -9 -9]
 [-9 -9  2  2 -9 -9 -9  2  2  2 -9 -9 -9  2  2 -9  2 -9 -9]
 [ 2  2  3 -9 -9 -9 -9  2  2 -9 -9 -9  2  2  2 -9 -9 -9 -9]
 [ 2  2  3 -9  3  3  3 -9 -9 -9  3 -9 -9  2  2 -9 -9 -9 -9]
 [ 2 -9 -9  3  3  3  2 -9 -9 -9 -9  3  3 -9  2  2 -9 -9 -9]
 [-9 -9 -9  3  2  2  2 -9 -9 -9  3  3 -9 -9 -9 -9 -9 -9 -9]
 [-9 -9 -9 -9  3  2  2  2  2  2  2  2  3  3  2  2 -9 -9 -9]
 [-9 -9 -9  3  3  3  3  2  2  2  2  3  3  2  2  2  2 -9 -9]
 [-9 -9 -9  3  3 -9  2  2  2  2  2  2  2  2  2  2  2 -9 -9]
 [-9 -9 -9  3 -9 -9  2  3  3  3  2  2  2  2  2  2  2 -9 -9]
 [-9 -9 -9 -9  2 -9  2  2  3  2  2  2  2  2  2  2  2 -9 -9]
 [-9 -9 -9  2  2 -9  2  2  2  2  2  2  2  3  2 -9 -9 -9 -9]
 [-9 -9 -9 -9  1  1  1  1  2  2  2  2  2  2  2  2 -9 -9 -9]
 [-9 -9 -9 -9 -9  1 -9 -9  1  2  3  2  2  2  2  2 -9 -9 -9]
 [-9  2  2 -9  1  1 -9 -9  1  2  3  2  2  1 -9 -9 -9 -9 -9]
 [-9 -9  2 -9  1  1 -9 -9 -9 -9  0  2 -9

(array(['agua', 'otros', 'praderas', 'tierras forestales'], dtype='<U18'),
 array([1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 3,
        3, 3, 3, 2, 2, 2, 3, 1, 0, 2, 2, 1]))

In [459]:
counts = numpy.unique((prediction.reshape(test_shape[0],test_shape[1]) - final)[final != 9],return_counts=True)
print(counts)

(array([-3, -2, -1,  0,  1,  2,  9, 10, 11, 12]), array([  1,   3,  30, 104,  22,   3,   1,  12, 155,  11]))


In [444]:
clf.score(numpy.nan_to_num(X),numpy.nan_to_num(y))

0.9705882352941176