In [48]:
import rioxarray 
import xarray
import rasterio
import nasa_hls
import os
import urllib.request as urlreq
import pandas as pd
import fiona
import numpy as np
from glob import glob
from rasterio.enums import Resampling
from rasterio import Affine
from rasterio.crs import CRS
import matplotlib.pyplot as plt
from subprocess import Popen, PIPE

%matplotlib inline

In [50]:
##### START OPTIONS #####
yoi = [2020]
toi = ['15STT']
root_path = "/data/"
spath = root_path + f"CDL_HLS_dataframe{yoi[0]}.csv"
image_index_file = root_path + f"image_index{yoi[0]}.csv"
hdf_dir = root_path + "hdf/"
chip_geojson_dir = root_path + 'chip_geojson/'
chip_output_dir = root_path + 'chip_output/'
#####  END OPTIONS  #####

make folders if needed

In [34]:
try:
    os.makedirs(hdf_dir)
except FileExistsError:
    # directory already exists
    print('pass')
    pass


pass


query and download hdf files

In [35]:
HLSdf = nasa_hls.get_available_datasets(
        years = yoi,
        products = ["S30"],
        tiles = toi,
        return_list = False)
        
#HLSdf.to_csv(spath, mode='w')

100%|██████████| 1/1 [00:01<00:00,  1.00s/it]


In [21]:
for k in range(20):
    url = HLSdf.at[k, "url"]
    local_name = url.split('/')[-1].replace("\n", "").replace('.hdf', '')
    HLSdf.at[k, "image_id"] = local_name
    try:
        urlreq.urlretrieve(url, filename = hdf_dir+local_name + '.hdf')
    except:
        print(local_name + " failed")
        continue
               
HLSdf.to_csv(spath, mode='w')

In [39]:
HLSdf['url'].tolist()

['https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020002.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020005.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020007.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020010.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020012.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020015.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020017.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020020.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020022.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020025.v1.4.hdf',
 'https://hls.gsfc.nasa.gov/data/v1.4/S30/2020/15/S/T/T/HLS.S30.T15STT.2020027.v1.4.hdf',
 'https://

extract hdf metadata, filter to 3 scenes per tile

In [4]:
def get_metadata_from_hdf_mine(src, fields=["cloud_cover", "spatial_coverage"]):
    """Get metadata from a nasa-hls hdf file. See HLS user guide for valid fields.
    
    HLS User Guide - see Section 6.6: 
    
    https://hls.gsfc.nasa.gov/wp-content/uploads/2019/01/HLS.v1.4.UserGuide_draft_ver3.1.pdf
    """
    band="QA"
    cmd = f'gdalinfo HDF4_EOS:EOS_GRID:"{src}":Grid:{band}'
    print('cmd')
    print(cmd)
    p = Popen(cmd, stdout=PIPE, shell=True)
    output, err = p.communicate()
    output = str(output)[2:-1].replace("\\n", "\n")
    rc = p.returncode
    metadata = {}
    for line in output.split("\n"):
        for field in fields:
            if field in line:
                metadata[field] = line.split("=")[1].strip()
                try:
                    metadata[field] = float(metadata[field])
                except:
                    pass
    for field in fields:
        if field not in metadata.keys():
            warnings.warn(f"Could not find metadata for field '{field}'.")
    return metadata

In [5]:
image_index = pd.DataFrame(columns = ['image_id', 'tile', 'date', 'month', 'cloud_coverage', 'spatial_coverage'])

candidate_hdf = sorted(glob(hdf_dir + '*.hdf'))

for img in candidate_hdf:
    print(img)
    local_name = img.split('/')[-1]
    try:
        print(hdf_dir+local_name)
        md = get_metadata_from_hdf_mine(hdf_dir+local_name)
    except:
        print(img + ' skipped')
        continue
    print(md)
    cloud_cover = int(md['cloud_cover'])
    spatial_coverage = int(md['spatial_coverage'])
    image_id = local_name.replace('.hdf', '')
    tname = local_name.split('.')[2]
    date = local_name.split('.')[3]
    image_date_string = image_id.split('.')[3]
    image_date = pd.to_datetime(image_date_string, format="%Y%j").date()
    image_month = image_date.month
    
    new_row = pd.DataFrame({'image_id':  [image_id],
               'tile': [tname],
               'date': [image_date],
               'month': [image_month],
               'cloud_coverage': [cloud_cover],
               'spatial_coverage': [spatial_coverage]})
    image_index = pd.concat([image_index, new_row], ignore_index = True)


image_index.to_csv(image_index_file)

/data/hdf/HLS.S30.T15STT.2020002.v1.4.hdf
/data/hdf/HLS.S30.T15STT.2020002.v1.4.hdf
cmd
gdalinfo HDF4_EOS:EOS_GRID:"/data/hdf/HLS.S30.T15STT.2020002.v1.4.hdf":Grid:QA
{'cloud_cover': 100.0, 'spatial_coverage': 100.0}
/data/hdf/HLS.S30.T15STT.2020005.v1.4.hdf
/data/hdf/HLS.S30.T15STT.2020005.v1.4.hdf
cmd
gdalinfo HDF4_EOS:EOS_GRID:"/data/hdf/HLS.S30.T15STT.2020005.v1.4.hdf":Grid:QA
{'cloud_cover': 1.0, 'spatial_coverage': 6.0}
/data/hdf/HLS.S30.T15STT.2020007.v1.4.hdf
/data/hdf/HLS.S30.T15STT.2020007.v1.4.hdf
cmd
gdalinfo HDF4_EOS:EOS_GRID:"/data/hdf/HLS.S30.T15STT.2020007.v1.4.hdf":Grid:QA
{'cloud_cover': 2.0, 'spatial_coverage': 100.0}
/data/hdf/HLS.S30.T15STT.2020010.v1.4.hdf
/data/hdf/HLS.S30.T15STT.2020010.v1.4.hdf
cmd
gdalinfo HDF4_EOS:EOS_GRID:"/data/hdf/HLS.S30.T15STT.2020010.v1.4.hdf":Grid:QA
{'cloud_cover': 94.0, 'spatial_coverage': 6.0}
/data/hdf/HLS.S30.T15STT.2020012.v1.4.hdf
/data/hdf/HLS.S30.T15STT.2020012.v1.4.hdf
cmd
gdalinfo HDF4_EOS:EOS_GRID:"/data/hdf/HLS.S30.T15STT.

In [6]:
image_index.image_id.tolist()

['HLS.S30.T15STT.2020002.v1.4',
 'HLS.S30.T15STT.2020005.v1.4',
 'HLS.S30.T15STT.2020007.v1.4',
 'HLS.S30.T15STT.2020010.v1.4',
 'HLS.S30.T15STT.2020012.v1.4',
 'HLS.S30.T15STT.2020015.v1.4',
 'HLS.S30.T15STT.2020017.v1.4',
 'HLS.S30.T15STT.2020020.v1.4',
 'HLS.S30.T15STT.2020022.v1.4',
 'HLS.S30.T15STT.2020025.v1.4',
 'HLS.S30.T15STT.2020027.v1.4',
 'HLS.S30.T15STT.2020030.v1.4',
 'HLS.S30.T15STT.2020032.v1.4',
 'HLS.S30.T15STT.2020035.v1.4',
 'HLS.S30.T15STT.2020040.v1.4',
 'HLS.S30.T15STT.2020042.v1.4',
 'HLS.S30.T15STT.2020045.v1.4',
 'HLS.S30.T15STT.2020047.v1.4',
 'HLS.S30.T15STT.2020050.v1.4',
 'HLS.S30.T15STT.2020052.v1.4']

Select 3 best images (need to loop this over tiles)

In [7]:
cloud_thres = 80
cand_images = image_index[(image_index.spatial_coverage == 100) & (image_index.cloud_coverage <= cloud_thres)]
cand_image_count = len(cand_images)
first_image = cand_images.head(1)
last_image = cand_images.tail(1)
middle_image = cand_images.head(cand_image_count // 2).tail(1)

selected_images = pd.concat([first_image, middle_image, last_image], ignore_index = True)

In [8]:
selected_images

Unnamed: 0,image_id,tile,date,month,cloud_coverage,spatial_coverage
0,HLS.S30.T15STT.2020007.v1.4,T15STT,2020-01-07,1,2,100
1,HLS.S30.T15STT.2020032.v1.4,T15STT,2020-02-01,2,3,100
2,HLS.S30.T15STT.2020052.v1.4,T15STT,2020-02-21,2,70,100


convert selected hdf to cog

In [28]:
def convert_hdf_to_cog(scene_id, product = "S30"):
    
    """
    This function receives the scene_id of an HLS scene (in a format similar to "HLS.S30.T14RNS.2020005.v1.4"
    and converts the scene from HDF format to COG. 
    
    Assumptions:
    - The corresponding HDF file for the scene is located at `/data/hdf/scene_id.hdf`
    - The output will be written to `/data/tif/scene_id/*.tif` and contains all the bands. 
    
    Inputs:
    - scene_id: The scene ID of the HLS scene
    - product: the HLS product ID. Default is S30, but it can be S30, L30, S30_ANGLES, L30_ANGLES
    
    """
    
    import os
    cmd = f"python3 /hls-hdf_to_cog/hls_hdf_to_cog/hls_hdf_to_cog.py --product {product} /data/hdf/{scene_id}.hdf --output-dir /data/tif/{scene_id}/"
    os.system(cmd)

In [31]:
for k in range(len(selected_images)):
    print(selected_images.at[k, 'image_id'])
    convert_hdf_to_cog(selected_images.at[k, 'image_id'])

HLS.S30.T15STT.2020007.v1.4
HLS.S30.T15STT.2020032.v1.4
HLS.S30.T15STT.2020052.v1.4


In [30]:
convert_hdf_to_cog('HLS.S30.T15STT.2020032.v1.4')

reproject selected cog to cdl crs

In [11]:
def reproject_hls_to_cdl(scene_folder,
                         bands = ["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B10", "B11", "B12"],
                         cdl_file = "/data/2022_30m_cdls_clipped.tif"):
    
    """
    This function receives the path to a folder that contains all GeoTIFF files (for various bands)
    of a HLS scene, and reprojects those to the target CDL CRS and grid. 
    
    Assumptions:
    - scene_folder has a file structure like: ".../<scene_id>/<scene_id>.<band_id>.tiff
    - scene_folder should not have a "/" at the end
    
    Inputs:
    - scene_folder: is the path to the folder that contains HLS GeoTIFF files for all bands of HLS
    - bands: list of bands of HLS that should be reprojected (default is all bands)
    - cdl_file: contains the path to the clipped CDL GeoTIFF file
    
    """
    
    for band in bands:
        xds = xarray.open_rasterio(f"{scene_folder}/{scene_folder.split('/')[-1]}.{band}.tif")
        cdl = xarray.open_rasterio(cdl_file)
        xds_new = xds.rio.reproject_match(cdl, resampling = Resampling.bilinear)
        xds_new.rio.to_raster(raster_path = f"{scene_folder}/{scene_folder.split('/')[-1]}.{band}.5070.tif")

In [13]:
for k in range(1,3):
    image_id = selected_images.at[k, 'image_id']
    print(image_id)
    reproject_hls_to_cdl("/data/tif/" + image_id,
                         bands = ["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B10", "B11", "B12"])

HLS.S30.T15STT.2020032.v1.4
HLS.S30.T15STT.2020052.v1.4


chipping

In [45]:
# this cell is a sample clipping of an HLS scene using a sample file `chip.geojson`

chips_list = sorted(glob(chip_geojson_dir + '*.geojson'))
print(chips_list)


['/data/chip_geojson/test_chip.geojson']


In [52]:


for k in range(len(chips_list)):
    chip = chips_list[k]
    ## get chip id
    chip_id = k
    
    ## get chip tile
    chip_tile = '15STT'
    
    ## get reprojected image paths
    selected_image_folders = sorted(glob(f'/data/tif/*{chip_tile}*'))
    print(selected_image_folders)
    
    assert len(selected_image_folders) == 3
    
    with fiona.open(chip, "r") as shapefile:
        shapes = [feature["geometry"] for feature in shapefile]
    
    bands = ["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B10", "B11", "B12"]
    first_date_images = sorted(glob(selected_image_folders[1] + '/*.5070.tif'))
    second_date_images = sorted(glob(selected_image_folders[1] + '/*.5070.tif'))
    third_date_images = sorted(glob(selected_image_folders[2] + '/*.5070.tif'))
    all_date_images = first_date_images + second_date_images + third_date_images
    
    assert len(all_date_images) == 3 * len(bands)

    ## do we want to scale/clip reflectances?
    bands = []
    
    ## band order?   
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})


    with rasterio.open(chip_output_dir + "chip_hls_id_" + str(chip_id) + ".tif", "w", **out_meta) as dest:
        dest.write(out_image)

 




['/data/tif/HLS.S30.T15STT.2020007.v1.4', '/data/tif/HLS.S30.T15STT.2020032.v1.4', '/data/tif/HLS.S30.T15STT.2020052.v1.4']
