In [1]:
import os
import datacube
import numpy as np
import xarray as xr
import subprocess as sp
import geopandas as gpd
from odc.io.cgroups import get_cpu_quota
from datacube.utils.geometry import assign_crs
from multiprocessing import cpu_count

import sys
sys.path.insert(1, '../../Tools/')
# from dea_tools.bandindices import calculate_indices
# from dea_tools.classification import collect_training_data
from deafrica_tools.plotting import map_shapefile, rgb
from deafrica_tools.classification import predict_xr, collect_training_data
from deafrica_tools.bandindices import calculate_indices


import warnings
warnings.filterwarnings("ignore")

In [34]:
# Name of the attribute column handed to collect_training_data as `field`
field = 'PXLVAL'

# Vector file (GeoJSON) containing the training points
path = '../../clipped shape file/output5.geojson'

In [35]:
# Number of CPUs the host reports
available_cpus = cpu_count()

# Use at most 12 workers, but never more than the machine actually has.
# The value is printed *after* the final assignment so the message always
# matches what is passed on to collect_training_data.
ncpus = max(1, min(12, available_cpus))
print('ncpus = ' + str(ncpus))


In [36]:
# Load the training points from the vector file at `path`
input_data = gpd.read_file(path)

# Stop here with a clear message if the label column is absent, rather than
# failing later inside the parallel training-data collection
if field not in input_data.columns:
    raise KeyError(f"Column '{field}' not found in '{path}'")

# Display the first five rows
input_data.head()

Unnamed: 0,PXLVAL,geometry
0,0,POINT (80.95101 15.84262)
1,0,POINT (80.95657 15.83250)
2,0,POINT (80.95317 15.83991)
3,0,POINT (80.95931 15.84118)
4,0,POINT (80.96203 15.83354)


In [37]:
# Show the training data on an interactive map, coloured by the `field` column
input_data.explore(column=field)


In [38]:
# Settings for the ODC query: projection, pixel size (y, x) in the units of
# `output_crs`, and the date range to load.
# (A whole-year alternative would be: time = ("2023"))
output_crs = 'epsg:6933'
resolution = (-30, 30)
time = ('2023-03-15', '2023-05-01')

# Input for collect_training_data: no zonal statistic is requested
zonal_stats = None

In [39]:
# Generate a new datacube query object

# Latitude / longitude bounds of the area of interest.
# Alternative (larger) extent, currently disabled:
# lat_range = (15.80418332, 15.85828652)
# lon_range = (80.78694696, 81.02203692)
lat_range = (15.832223663515174, 15.843536077393118)
lon_range = (80.95018386840822, 80.9685516357422)

# Bands to load
measurements = ['blue', 'green', 'red', 'nir', 'swir_1', 'swir_2']

# `time`, `resolution` and `output_crs` come from the settings cell above, so
# every query parameter has a single place where it is defined.
query = {
    'x': lon_range,
    'y': lat_range,
    'time': time,
    'measurements': measurements,
    'resolution': resolution,
    'output_crs': output_crs,
}

In [40]:
def feature_layers(query):
    """Load Sentinel-2 bands for ``query`` and append spectral indices.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded unchanged to ``dc.load`` (extent, time,
        measurements, resolution, output CRS).

    Returns
    -------
    The dataset returned by ``calculate_indices``: the loaded bands plus the
    NDVI, BUI and MNDWI indices (``drop=False`` keeps the original bands).

    NOTE(review): no temporal reduction is applied here; confirm that
    ``collect_training_data`` accepts the number of time steps this query
    returns.
    """
    # Connect to the datacube
    dc = datacube.Datacube(app='feature_layers')

    # Load the requested bands from the Sentinel-2 (sen2cor) product
    bands = dc.load(product="s2a_sen2cor_granule", **query)

    # Add the band indices next to the original bands and hand the result back
    return calculate_indices(
        bands,
        index=['NDVI', 'BUI', 'MNDWI'],
        drop=False,
        satellite_mission='s2',
    )

In [41]:
# Collect the training data: the points in `input_data`, the datacube query
# and the `feature_layers` function are handed to collect_training_data,
# which returns the column names and the matching feature array.
# (Put %%time on the first line of this cell to time the step.)
column_names, model_input = collect_training_data(
    gdf=input_data,
    dc_query=query,
    ncpus=ncpus,
    field=field,
    zonal_stats=zonal_stats,
    feature_func=feature_layers,
)

Collecting training data in parallel mode


  0%|          | 0/70 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (70, 10)


In [42]:
# Show the column order, an empty line, then the feature array with two
# decimals and near-zero values suppressed
print(
    column_names,
    '',
    np.array_str(model_input, precision=2, suppress_small=True),
    sep='\n',
)

['PXLVAL', 'blue', 'green', 'red', 'nir', 'swir_1', 'swir_2', 'NDVI', 'BUI', 'MNDWI']

[[   1.   1252.   1616.   1508.   3154.   2384.   1587.      0.35   -0.49
    -0.19]
 [   0.   1141.   1745.   1262.   4710.   2236.   1431.      0.58   -0.93
    -0.12]
 [   0.   1150.   1681.   1235.   6041.   2468.   1470.      0.66   -1.08
    -0.19]
 [   0.   1278.   1671.   1500.   3638.   2398.   1733.      0.42   -0.62
    -0.18]
 [   1.   1450.   1981.   1835.   4902.   2493.   1668.      0.46   -0.78
    -0.11]
 [   0.   1199.   1500.   1350.   2292.   1316.   1141.      0.26   -0.53
     0.07]
 [   0.   1113.   1620.   1233.   4880.   1981.   1340.      0.6    -1.02
    -0.1 ]
 [   0.   1231.   1704.   1417.   3799.   1983.   1419.      0.46   -0.77
    -0.08]
 [   0.   1342.   1560.   1688.   1414.   1591.   1280.     -0.09    0.15
    -0.01]
 [   0.   1193.   1701.   1340.   4740.   2407.   1529.      0.56   -0.89
    -0.17]
 [   0.   1287.   1671.   1454.   3585.   2027.   1372.      0.

In [43]:
# Set the name and location of the output file
# (relative path: resolved against the notebook's working directory)
output_file = "results/test_training_data2.txt"

In [44]:
# Export files to disk
# np.savetxt does not create missing folders, so make sure the target
# directory exists before writing (skipped when the path has no folder part)
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# One row per sample; the header line lists the column names.
# NOTE(review): "%4f" means minimum width 4 with the default 6 decimals —
# confirm whether "%.4f" (4 decimals) was intended.
np.savetxt(output_file, model_input, header=" ".join(column_names), fmt="%4f")