Use the CDL GeoTIFF to reproject each HLS scene onto the CDL projection. Run `cdl_generate.ipynb` before this notebook to generate the CDL tif file.

In [16]:
import xarray
import rioxarray
import pandas as pd
import numpy as np
import pyproj
import multiprocessing as mp
from rasterio.enums import Resampling
import json
from pathlib import Path
import os

# Show every column and every row when a DataFrame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Path to the CDL tif; it is stored on every row of track_df and later opened
# by the worker function as the reprojection target grid.
cdl_file = "/data/requirements/cdl_raw/2022_30m_cdls.tif"

In [3]:
# Load the scene-tracking table. Later cells rely on its `tile`, `save_path`
# and `filename` columns.
track_df = pd.read_csv("/home/data/track_df.csv")

In [4]:
def point_transform(coor, src_crs, target_crs=5070):
    """
    Project a single coordinate pair from `src_crs` into `target_crs`.

    Inputs:
    - coor: indexable whose first two items are the x and y values (in that order)
    - src_crs: CRS of the input coordinate
    - target_crs: CRS to project into, default is EPSG 5070

    Returns:
    - A two-item list [x, y] holding the projected coordinate.
    """
    # always_xy=True keeps the (x, y) axis order regardless of the CRS definition.
    transformer = pyproj.Transformer.from_crs(src_crs, target_crs, always_xy=True)
    x_projected, y_projected = transformer.transform(coor[0], coor[1])
    return [x_projected, y_projected]

def find_nearest(array, value):
    """
    Return the element of `array` that is closest to `value`.

    Inputs:
    - array: numpy array of candidate values
    - value: the value to match against

    On ties the first closest element is returned (argmin semantics).
    """
    distances = np.abs(array - value)
    nearest_index = distances.argmin()
    return array[nearest_index]

In [11]:
def reproject_hls(tile_path,
                  cdl_ds,
                  target_crs="EPSG:5070",
                  remove_original=True,
                  resampling_method=Resampling.bilinear):

    """
    Reproject one HLS tile onto the grid of the CDL raster and save the result
    next to the original as `<name>.reproject.tif`.

    The raw tile is only deleted after the reprojected raster has been written
    successfully, so a failed write never loses the source data.

    Assumptions:
    - tile_path is a full path (string) that ends with .tif
    - cdl_ds is a rioxarray dataset that is opened with `cache=False` setting.

    Inputs:
    - tile_path: The full path to a specific HLS tile
    - cdl_ds: The opened CDL raster whose grid the tile is matched to
    - target_crs: Kept for backward compatibility, default is EPSG:5070. It is
      currently not used: the output CRS and grid come from `cdl_ds`, and the
      tile corners are projected with point_transform's default CRS (EPSG 5070).
    - remove_original: The option to remove raw HLS tile after reprojecting, default is True
    - resampling_method: The method that rioxarray use to reproject, default is bilinear
    """

    output_path = tile_path.replace(".tif", ".reproject.tif")

    xds = rioxarray.open_rasterio(tile_path)
    try:
        # Project two opposite corners of the tile into the CDL coordinate system.
        # NOTE(review): only the (min, min) and (max, max) corners are projected;
        # confirm this covers the full footprint after the change of projection.
        coor_min = point_transform([xds.x.min().data, xds.y.min().data], xds.rio.crs)
        coor_max = point_transform([xds.x.max().data, xds.y.max().data], xds.rio.crs)

        # Snap the projected corners to existing CDL pixel coordinates.
        x0 = find_nearest(cdl_ds.x.data, coor_min[0])
        y0 = find_nearest(cdl_ds.y.data, coor_min[1])
        x1 = find_nearest(cdl_ds.x.data, coor_max[0])
        y1 = find_nearest(cdl_ds.y.data, coor_max[1])

        # Window of the CDL raster that serves as the template grid.
        cdl_for_reprojection = cdl_ds.rio.slice_xy(x0, y0, x1, y1)

        xds_new = xds.rio.reproject_match(cdl_for_reprojection, resampling=resampling_method)
        xds_new.rio.to_raster(raster_path=output_path)
    finally:
        # Release the file handle on the source tile even if reprojection fails.
        xds.close()

    # Delete the raw tile only once the output exists, and never delete the
    # file that was just written (possible if tile_path contains no ".tif").
    if remove_original and output_path != tile_path and Path(tile_path).is_file():
        os.remove(tile_path)

In [34]:
# Quality control: every tile is expected to have exactly three scenes.
failed_tiles = [
    tile
    for tile in track_df.tile.unique()
    if len(track_df[track_df.tile == tile]) != 3
]
if failed_tiles:
    print(f"Tile {failed_tiles} does not pass the quality test.")
else:
    print("All tiles passed the quality test!")
    

All tiles passed the quality test!


In [7]:
# Attach the CDL path and the JSON-encoded band list to every scene row so each
# record carries everything the worker function needs.
track_df = track_df.assign(
    cdl_file=cdl_file,
    bands='["B02","B03","B04","B8A","B11","B12","Fmask"]',
)

In [8]:
# Preview the first rows to check the cdl_file and bands columns.
track_df.head()

Unnamed: 0,tile,timestep,date,save_path,filename,cdl_file,bands
0,T10SDJ,0,2022-03-08,/data/tiles/HLS.S30.T10SDJ.2022067T190231.v2.0/,HLS.S30.T10SDJ.2022067T190231.v2.0,/data/requirements/cdl_raw/2022_30m_cdls.tif,"[""B02"",""B03"",""B04"",""B8A"",""B11"",""B12"",""Fmask""]"
1,T10SDJ,1,2022-07-16,/data/tiles/HLS.S30.T10SDJ.2022197T185931.v2.0/,HLS.S30.T10SDJ.2022197T185931.v2.0,/data/requirements/cdl_raw/2022_30m_cdls.tif,"[""B02"",""B03"",""B04"",""B8A"",""B11"",""B12"",""Fmask""]"
2,T10SDJ,2,2022-09-29,/data/tiles/HLS.S30.T10SDJ.2022272T190159.v2.0/,HLS.S30.T10SDJ.2022272T190159.v2.0,/data/requirements/cdl_raw/2022_30m_cdls.tif,"[""B02"",""B03"",""B04"",""B8A"",""B11"",""B12"",""Fmask""]"
3,T10SEH,0,2022-03-10,/data/tiles/HLS.S30.T10SEH.2022069T185109.v2.0/,HLS.S30.T10SEH.2022069T185109.v2.0,/data/requirements/cdl_raw/2022_30m_cdls.tif,"[""B02"",""B03"",""B04"",""B8A"",""B11"",""B12"",""Fmask""]"
4,T10SEH,1,2022-06-23,/data/tiles/HLS.S30.T10SEH.2022174T184931.v2.0/,HLS.S30.T10SEH.2022174T184931.v2.0,/data/requirements/cdl_raw/2022_30m_cdls.tif,"[""B02"",""B03"",""B04"",""B8A"",""B11"",""B12"",""Fmask""]"


In [12]:
def hls_process(kwargs):
    """
    Reproject every band of one HLS scene onto the CDL grid.

    Inputs:
    - kwargs: a dict (one record of the tracking table) with the keys
        - "save_path": directory that holds the scene's band files
        - "filename": scene name used as the prefix of each band file
        - "bands": JSON-encoded list of band names
        - "cdl_file": path to the CDL raster used as the target grid

    Each band file is expected at `<save_path>/<filename>.<band>.tif`. The raw
    band files are removed after reprojection.
    """

    remove_original = True

    save_path = kwargs["save_path"]
    filename = kwargs["filename"]
    bands = json.loads(kwargs["bands"])
    cdl_file = kwargs["cdl_file"]

    cdl_ds = rioxarray.open_rasterio(cdl_file, cache=False)
    try:
        for band in bands:
            # os.path.join gives the same path whether or not save_path ends with "/".
            tile_path = os.path.join(save_path, f"{filename}.{band}.tif")

            # Fmask uses nearest-neighbour so its values are never interpolated
            # (presumably because it is a categorical mask); other bands use bilinear.
            if band == "Fmask":
                resampling_method = Resampling.nearest
            else:
                resampling_method = Resampling.bilinear

            # Pass by keyword: the third positional parameter of reproject_hls
            # is target_crs, not remove_original.
            reproject_hls(tile_path,
                          cdl_ds,
                          remove_original=remove_original,
                          resampling_method=resampling_method)
    finally:
        # Release the CDL file handle held by this worker.
        cdl_ds.close()

In [17]:
# One job per scene record; each worker process reprojects all bands of a scene.
scene_records = track_df.to_dict('records')
with mp.Pool(processes=mp.cpu_count()) as pool:
    pool.map(hls_process, scene_records)