In [None]:
import os
import datacube
import numpy as np
import xarray as xr
import subprocess as sp
import geopandas as gpd
from odc.io.cgroups import get_cpu_quota
from datacube.utils.geometry import assign_crs
from multiprocessing import cpu_count

import sys
sys.path.insert(1, '../../Tools/')
# from dea_tools.bandindices import calculate_indices
# from dea_tools.classification import collect_training_data
from deafrica_tools.plotting import map_shapefile, rgb
from deafrica_tools.classification import predict_xr, collect_training_data
from deafrica_tools.bandindices import calculate_indices


import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the vector file read with geopandas below, and the name of the
# attribute column handed to collect_training_data as `field`.
path, field = "../../clipped shape file/mod_final.geojson", "PXLVAL"

In [None]:
# Number of worker processes handed to collect_training_data.
# Use at most 12 workers, but never more than the CPUs reported for this
# machine, and print the value that is actually used (previously the detected
# count was printed and then silently replaced by a hard-coded 12).
ncpus = min(12, cpu_count())
print('ncpus = ' + str(ncpus))


In [None]:
# Read the vector file at `path` with geopandas
input_data = gpd.read_file(path)

# Preview the first five rows (displayed as the cell output)
input_data.head(5)

Unnamed: 0,PXLVAL,geometry
0,0,"POLYGON ((80.94555 15.80580, 81.00185 15.80580..."
1,1,"MULTIPOLYGON (((80.99578 15.85489, 80.99600 15..."
2,1,"MULTIPOLYGON (((80.97289 15.85111, 80.97311 15..."
3,1,"MULTIPOLYGON (((80.97133 15.85067, 80.97178 15..."
4,1,"MULTIPOLYGON (((80.99200 15.85178, 80.99222 15..."


In [None]:
# Show the training data on an interactive map, using the values in the
# `field` column as the `column` argument of explore().
input_data.explore(column=field)


In [None]:
# Settings consumed by the cells below: the ODC query parameters and the
# zonal_stats argument of collect_training_data.

# ODC query parameters: date range, resolution and output CRS.
# Alternative time setting (commented out): time = ("2023")
output_crs = "epsg:6933"
resolution = (-30, 30)
time = ("2023-03-15", "2023-05-01")

# No zonal statistic is passed to collect_training_data
zonal_stats = None

In [None]:
# Build the datacube query dictionary that is passed to
# collect_training_data as `dc_query`.
# Alternative extent (commented out):
# lat_range = (15.80418332, 15.85828652)
# lon_range = (80.78694696, 81.02203692)
lat_range = (15.70501873999827, 15.855673509998681)
lon_range = (80.73028564453126, 80.89920043945314)
measurements = ['blue', 'green', 'red', 'nir', 'swir_1', 'swir_2']
query = {
    'x': lon_range,
    'y': lat_range,
    'time': time,
    'measurements': measurements,
    'resolution': resolution,
    # Reuse the `output_crs` setting defined with `time` and `resolution`
    # instead of repeating the literal, so the CRS is configured in one place.
    'output_crs': output_crs,
}

In [None]:
def feature_layers(query):
    """Load the `s2a_sen2cor_granule` product for a query and add band indices.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded unchanged to ``Datacube.load``.

    Returns
    -------
    The object returned by ``calculate_indices``: the loaded data with the
    NDVI, BUI and MNDWI indices added. ``drop=False`` is passed so the
    loaded bands are kept alongside the indices.
    """
    # Open a datacube connection for this call
    cube = datacube.Datacube(app='feature_layers')

    # Load the requested measurements from the product
    bands = cube.load(product="s2a_sen2cor_granule", **query)

    # Compute the band indices and hand the result straight back
    return calculate_indices(
        bands,
        index=['NDVI', 'BUI', 'MNDWI'],
        drop=False,
        satellite_mission='s2',
    )

In [None]:
# Collect the training data for the features in `input_data`, using
# `feature_layers` to build the feature set for each query.
# (Add %%time as the first line of the cell to time the extraction.)
column_names, model_input = collect_training_data(
    gdf=input_data,
    dc_query=query,
    ncpus=ncpus,
    field=field,
    zonal_stats=zonal_stats,
    feature_func=feature_layers,
)

Collecting training data in parallel mode


  0%|          | 0/53 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (52252, 10)


In [None]:
# Show the column names, a blank line, then a compact preview of the array
print(column_names, end='\n\n')
print(np.array_str(model_input, suppress_small=True, precision=2))

['PXLVAL', 'blue', 'green', 'red', 'nir', 'swir_1', 'swir_2', 'NDVI', 'BUI', 'MNDWI']

[[   1.   1161.   1669.   ...    0.59   -0.96   -0.15]
 [   1.   1133.   1772.   ...    0.57   -0.93   -0.1 ]
 [   1.   1213.   1539.   ...    0.38   -0.59   -0.13]
 ...
 [   1.   1190.   1638.   ...    0.54   -0.87   -0.15]
 [   1.   1164.   1649.   ...    0.54   -0.86   -0.15]
 [   1.   1213.   1623.   ...    0.49   -0.78   -0.15]]


In [None]:
# Destination path of the text file written by np.savetxt in the next cell
output_file = 'results/test_training_data.txt'

In [None]:
# Export the training data to disk as a text file whose header line lists
# the column names.
# np.savetxt does not create missing folders, so make sure the destination
# directory exists first (falls back to "." when the path has no directory).
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
# NOTE(review): "%4f" sets a minimum field width of 4 (default 6 decimals),
# not 4 decimal places - confirm "%.4f" was not intended.
np.savetxt(output_file, model_input, header=" ".join(column_names), fmt="%4f")