In [1]:
import rioxarray 
import xarray
import rasterio
import nasa_hls
import os
import geopandas
import urllib.request as urlreq
import pandas as pd
import fiona
import numpy as np
import json
import shutil
import datetime
from glob import glob
from rasterio.enums import Resampling
from rasterio import Affine
from rasterio.crs import CRS
import matplotlib.pyplot as plt
from subprocess import Popen, PIPE

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline

In [2]:
##### START OPTIONS #####
yoi = [2021]
#toi = ['15STT']
cloud_thres = 5
valid_months = [3,4,5,6,7,8,9]
root_path = "/data/"

## file paths
spath = root_path + f"CDL_HLS_dataframe{yoi[0]}"
image_index_file = root_path + f"image_index{yoi[0]}"
chip_file =  root_path + "chip_bbox.geojson"
chip_csv = root_path + "chip_tracker.csv"
kml_file = root_path + 'sentinel_tile_grid.kml'
cdl_reclass_csv = root_path + "cdl_freq.csv"
tile_tracker_csv = root_path + "tile_tracker.csv"

## folder paths
hdf_dir = root_path + "hdf/"
chip_dir = root_path + 'chips/'
tif_dir = root_path + 'tif/'
chip_dir_binary = root_path + 'chips_binary/'
chip_dir_multi = root_path + 'chips_multi/'

chip_dir_filt = root_path + 'chips_filtered/'
chip_dir_binary_filt = root_path + 'chips_binary_filtered/'
chip_dir_multi_filt = root_path + 'chips_multi_filtered/'

chip_qa_dir = root_path + 'chips_qa/'

#####  END OPTIONS  #####

make folders if needed

In [3]:
dirs_to_make = [hdf_dir, chip_dir, chip_dir_binary, chip_qa_dir, chip_dir_binary, chip_dir_multi, chip_dir_binary_filt, chip_dir_filt, chip_dir_multi_filt]
for folder in dirs_to_make:
    try:
        os.makedirs(folder)
    except FileExistsError:
        # directory already exists
        print('pass')
        pass


pass
pass
pass
pass
pass
pass
pass
pass
pass


0 determine HLS tiles

In [4]:
with open("/cdl_training_data/data/chip_bbox.geojson", "r") as file:
    chips = json.load(file)
    
chip_ids = []
chip_x = []
chip_y = []
for item in chips['features']:
    #print(item)
    chip_ids.append(item['properties']['id'])
    chip_x.append(item['properties']['center'][0])
    chip_y.append(item['properties']['center'][1])


#chip_ids = a.fea
#print(a['features'][0]['properties']['center'])

In [5]:
# Load the HLS tiles and place there coordinates into a numpy array for processing later

fiona.drvsupport.supported_drivers['KML'] = 'rw'
tile_src = geopandas.read_file(kml_file, driver='KML')
tile_name = []
tile_x = []
tile_y = []
for tile_ind in range(tile_src.shape[0]):
    tile_name.append(tile_src.iloc[tile_ind].Name)
    tile_x.append(tile_src.iloc[tile_ind].geometry.centroid.x)
    tile_y.append(tile_src.iloc[tile_ind].geometry.centroid.y)
tile_name = np.array(tile_name)
tile_x = np.array(tile_x)
tile_y = np.array(tile_y)
tile_src = pd.concat([tile_src, tile_src.bounds], axis = 1)
#del tile_src
tile_src.head(5)

Unnamed: 0,Name,Description,geometry,minx,miny,maxx,maxy
0,01CCV,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -7...,-180.0,-73.064633,180.0,-72.012478
1,01CDH,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-83.835334,180.0,-82.79672
2,01CDJ,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-82.939452,180.0,-81.906947
3,01CDK,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-82.044055,180.0,-81.016439
4,01CDL,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-81.14807,180.0,-80.124456


In [6]:
def find_tile(x,y):
# Identify closest tile
    s = (tile_x - x)**2+(tile_y - y)**2
    tname = tile_name[np.argmin(s)]
    return(tname)

initialize chip tracker csv

In [None]:
chip_df = pd.DataFrame({"chip_id" : chip_ids, "chip_x" : chip_x, "chip_y" : chip_y})
chip_df['tile'] = chip_df.apply(lambda row : find_tile(row['chip_x'], row['chip_y']), axis = 1)

In [None]:
## write to csv
check_file = glob(chip_csv)
if len(check_file) == 0:
    chip_df.to_csv(chip_csv, index=False)
else:
    print('file exists')

In [None]:
tiles = chip_df.tile.unique().tolist()
tiles

0a. manually chack and remove "bad" tiles

In [None]:
chip_df[chip_df.tile == '01SBU'].head(5)

In [None]:
tiles.remove('01SBU')
tiles

0b. Make tile tracker

In [None]:
tile_tracker = pd.DataFrame({"tile":tiles})
tile_tracker['exclude'] = False
tile_tracker['hdf_download'] = False
tile_tracker['tif_convert'] = False
tile_tracker['tif_reproject'] = False
tile_tracker['chip'] = False
tile_tracker['filter_chips'] = False
#tile_tracker.head(50)

In [None]:
## update tracker
tiles_already_downloaded = glob(hdf_dir + '*')
tiles_already_downloaded = set([i[19:24] for i in tiles_already_downloaded])
tiles_already_downloaded
tile_tracker.loc[tile_tracker.tile.isin(tiles_already_downloaded) , 'hdf_download'] = True

tiles_already_converted = glob(tif_dir + '*')
tiles_already_converted = set([i[19:24] for i in tiles_already_converted])
tiles_already_converted
tile_tracker.loc[tile_tracker.tile.isin(tiles_already_converted) , 'tif_convert'] = True

chips_already_chipped = glob(chip_dir + '*')
chips_already_chipped = set([i[17:24] for i in chips_already_chipped])
#print(chips_already_chipped)
tiles_already_chipped = chip_df[chip_df.chip_id.isin(chips_already_chipped)].tile.unique()
#print(tiles_already_chipped)
tile_tracker.loc[tile_tracker.tile.isin(tiles_already_chipped) , 'tif_reproject'] = True
tile_tracker.loc[tile_tracker.tile.isin(tiles_already_chipped) , 'chip'] = True
                                        
chips_already_filtered = glob(chip_dir_filt + '*')
chips_already_filtered = set([i[26:33] for i in chips_already_filtered])
tiles_already_filtered = chip_df[chip_df.chip_id.isin(chips_already_filtered)].tile.unique()
tile_tracker.loc[tile_tracker.tile.isin(tiles_already_filtered), 'filter_chips'] = True

tile_tracker.head(50)

In [None]:
## write to csv
check_file = glob(tile_tracker_csv)
if len(check_file) == 0:
    tile_tracker.to_csv(tile_tracker_csv, index=False)
else:
    print('file exists')

1. query and download hdf files

In [None]:
toi = tile_tracker[(tile_tracker.exclude == False) & (tile_tracker.hdf_download == False)].tile.unique()
toi

1a. get URLs of hdf to download

In [None]:
HLSdf = nasa_hls.get_available_datasets(
        years = yoi,
        products = ["S30"],
        tiles = toi,
        return_list = False)

#HLSdf.to_csv(spath, mode='w')

In [None]:
HLSdf['month'] = pd.DatetimeIndex(HLSdf['date']).month

In [None]:
## filter by month
HLSdf = HLSdf[HLSdf.month.isin(valid_months)].reset_index(drop = True)
HLSdf.shape

In [None]:
## download hdf
for k in range(len(HLSdf)):
    url = HLSdf.at[k, "url"]
    local_name = url.split('/')[-1].replace("\n", "").replace('.hdf', '')
    HLSdf.at[k, "image_id"] = local_name
    try:
        urlreq.urlretrieve(url, filename = hdf_dir+local_name + '.hdf')
    except:
        print(local_name + " failed")
        continue

tile_tracker = pd.read_csv(tile_tracker_csv)
tile_tracker.loc[tile_tracker.tile.isin(toi) , 'hdf_download'] = True
tile_tracker.to_csv(tile_tracker_csv, index=False)
#ct = datetime.datetime.now()
#HLSdf.to_csv(spath + "_" + str(ct) + ".csv", mode='w')

In [None]:
tile_tracker.head(50)

2. extract hdf metadata, filter to 3 scenes per tile, convert to tif

In [None]:
def get_metadata_from_hdf_mine(src, fields=["cloud_cover", "spatial_coverage"]):
    """Get metadata from a nasa-hls hdf file. See HLS user guide for valid fields.
    
    HLS User Guide - see Section 6.6: 
    
    https://hls.gsfc.nasa.gov/wp-content/uploads/2019/01/HLS.v1.4.UserGuide_draft_ver3.1.pdf
    """
    band="QA"
    cmd = f'gdalinfo HDF4_EOS:EOS_GRID:"{src}":Grid:{band}'
#    print(cmd)
    p = Popen(cmd, stdout=PIPE, shell=True)
    output, err = p.communicate()
    output = str(output)[2:-1].replace("\\n", "\n")
    rc = p.returncode
    metadata = {}
    for line in output.split("\n"):
        for field in fields:
            if field in line:
                metadata[field] = line.split("=")[1].strip()
                try:
                    metadata[field] = float(metadata[field])
                except:
                    pass
    for field in fields:
        if field not in metadata.keys():
            warnings.warn(f"Could not find metadata for field '{field}'.")
    return metadata

In [None]:
image_index = pd.DataFrame(columns = ['image_id', 'tile', 'date', 'month', 'cloud_coverage', 'spatial_coverage'])

candidate_hdf = sorted(glob(hdf_dir + '*.hdf'))

for img in candidate_hdf:
   # print(img)
    local_name = img.split('/')[-1]
    try:
      #  print(hdf_dir+local_name)
        md = get_metadata_from_hdf_mine(hdf_dir+local_name)
    except:
        print(img + ' skipped')
        continue
   # print(md)
    cloud_cover = int(md['cloud_cover'])
    spatial_coverage = int(md['spatial_coverage'])
    image_id = local_name.replace('.hdf', '')
    tname = local_name.split('.')[2][1:]
    date = local_name.split('.')[3]
    image_date_string = image_id.split('.')[3]
    image_date = pd.to_datetime(image_date_string, format="%Y%j").date()
    image_month = image_date.month
    
    new_row = pd.DataFrame({'image_id':  [image_id],
               'tile': [tname],
               'date': [image_date],
               'month': [image_month],
               'cloud_coverage': [cloud_cover],
               'spatial_coverage': [spatial_coverage]})
    image_index = pd.concat([image_index, new_row], ignore_index = True)

ct = datetime.datetime.now()
image_index.to_csv(image_index_file + "_" + str(ct) + '.csv', index=False)

In [None]:
#image_index.sort_values(['cloud_coverage']).head(300)

Select 3 best images (need to loop this over tiles)

In [None]:
tile_tracker = pd.read_csv(tile_tracker_csv)
#tiles_already_converted = set([i[19:24] for i in tiles_already_converted])
tiles_to_process = tile_tracker[(tile_tracker.exclude == False) & (tile_tracker.tif_convert == False) & (tile_tracker.hdf_download == True)].tile.unique()
tiles_to_process
#tile_tracker.to_csv(tile_tracker_csv)

In [None]:
def convert_hdf_to_cog(scene_id, product = "S30"):
    
    """
    This function receives the scene_id of an HLS scene (in a format similar to "HLS.S30.T14RNS.2020005.v1.4"
    and converts the scene from HDF format to COG. 
    
    Assumptions:
    - The corresponding HDF file for the scene is located at `/data/hdf/scene_id.hdf`
    - The output will be written to `/data/tif/scene_id/*.tif` and contains all the bands. 
    
    Inputs:
    - scene_id: The scene ID of the HLS scene
    - product: the HLS product ID. Default is S30, but it can be S30, L30, S30_ANGLES, L30_ANGLES
    
    """
    
    import os
    cmd = f"python3 /hls-hdf_to_cog/hls_hdf_to_cog/hls_hdf_to_cog.py --product {product} /data/hdf/{scene_id}.hdf --output-dir /data/tif/{scene_id}/"
    os.system(cmd)
    image_folder = '/data/tif/' + scene_id + '/'
    tif_count = len(glob(image_folder + '*.tif'))
    if(tif_count == 14):
        return(True)
    else:
        shutil.rmtree(image_folder)
        return(False)


convert selected hdf to cog

In [None]:
def convert_first_date(cand_images):
    """
    Converts first date image from data frame. 
    If conversion fails, the image is removed and the next "first" image is tried.
    Returns the converted image row, the image id, and the data frame with any failed images removed.
    """
    process_first = False
    while process_first == False:
        first_image = cand_images.head(1)
        first_image_id = (first_image.image_id.tolist())[0]
        print(first_image_id)
        process_first = convert_hdf_to_cog(first_image_id)
        print(process_first)
        if(process_first == False):
            cand_images = cand_images[cand_images.image_id != first_image_id]
    return(first_image, first_image_id, cand_images)

def convert_last_date(cand_images):
    """
    Converts last date image from data frame. 
    If conversion fails, the image is removed and the next "last" image is tried.
    Returns the converted image row, the image id, and the data frame with any failed images removed.
    """
    process_last = False
    while process_last == False:
        last_image = cand_images.tail(1)
        last_image_id = (last_image.image_id.tolist())[0]
        print(last_image_id)
        process_last = convert_hdf_to_cog(last_image_id)
        print(process_last)
        if(process_last == False):
            cand_images = cand_images[cand_images.image_id != last_image_id]
    return(last_image, last_image_id, cand_images)

def convert_middle_date(cand_images):
    """
    Converts middle date image from data frame. 
    If conversion fails, the image is removed and the next "middle" image is tried.
    Returns the converted image row, the image id, and the data frame with any failed images removed.
    """
    process_middle = False
    cand_image_count = len(cand_images)
    while process_middle == False:
        middle_image = cand_images.head((cand_image_count // 2)+1).tail(1)
        middle_image_id = (middle_image.image_id.tolist())[0]
        print(middle_image_id)
        process_middle = convert_hdf_to_cog(middle_image_id)
        print(process_middle)
        if(process_middle == False):
            cand_images = cand_images[cand_images.image_id != middle_image_id]
    return(middle_image, middle_image_id, cand_images)




In [None]:
def delete_hdf(image_id):
    hdf = glob(hdf_dir + '*' + image_id + '*')
    for h in hdf:
        os.remove(h)

In [None]:
image_index['converted'] = False

for tile in tiles_to_process:
    print(tile)
    if tile == "15SVR": ## remove edge case
        continue
        
    ## set initial spatial threshold
    temp_thres = 100

    cand_images = image_index[(image_index.tile == tile) &(image_index.spatial_coverage == 100) & (image_index.cloud_coverage <= cloud_thres)]
    print(cand_images)
    print(len(cand_images))
    if len(cand_images) < 3:
        temp_thres = 90
        cand_images = image_index[(image_index.tile == tile) &(image_index.spatial_coverage >= temp_thres) & (image_index.cloud_coverage <= cloud_thres)]
        print(len(cand_images))
    if len(cand_images) < 3:
        temp_thres = 80
        cand_images = image_index[(image_index.tile == tile) &(image_index.spatial_coverage >= temp_thres) & (image_index.cloud_coverage <= cloud_thres)]
        print(len(cand_images))
    if len(cand_images) < 3:
        temp_thres = 70
        cand_images = image_index[(image_index.tile == tile) &(image_index.spatial_coverage >= temp_thres) & (image_index.cloud_coverage <= cloud_thres)]
        print(len(cand_images))
    if len(cand_images) < 3:
        temp_thres = 60
        cand_images = image_index[(image_index.tile == tile) &(image_index.spatial_coverage >= temp_thres) & (image_index.cloud_coverage <= cloud_thres)]
        print(len(cand_images))
    if len(cand_images) < 3:
        temp_thres = 50
        cand_images = image_index[(image_index.tile == tile) &(image_index.spatial_coverage >= temp_thres) & (image_index.cloud_coverage <= cloud_thres)]
        print(len(cand_images))
    if len(cand_images) < 3:
        print(tile + ' skipped')
        continue
    print('final spatial threshold ' + str(temp_thres))

    # if len(cand_images) < 4:
    #     print(tile + ' skipped')
    #     continue
    first_image, first_image_id, cand_images = convert_first_date(cand_images)
    last_image, last_image_id, last_images = convert_last_date(cand_images)
    middle_image, middle_image_id, middle_images = convert_middle_date(cand_images)

    selected_images = pd.concat([first_image, middle_image, last_image], ignore_index = True)
    #print(selected_images)
    
    image_index.loc[image_index.image_id == first_image_id  , 'converted'] = True
    image_index.loc[image_index.image_id == middle_image_id  , 'converted'] = True
    image_index.loc[image_index.image_id == last_image_id  , 'converted'] = True


    assert len(selected_images) == 3
    assert len(selected_images.image_id.unique()) == 3
    
    ## update tile tracker
    tile_tracker = pd.read_csv(tile_tracker_csv)
    tile_tracker.loc[tile_tracker.tile == tile , 'tif_convert'] = True
    tile_tracker.loc[tile_tracker.tile == tile , 'spatial_cov'] = temp_thres

    tile_tracker.to_csv(tile_tracker_csv, index=False)
    
    # images_to_delete = image_index[(image_index.tile == tile) & (~image_index.image_id.isin([first_image_id, middle_image_id, last_image_id]))]
    # a = images_to_delete.image_id.tolist()    
    # for x in a:
    #     delete_hdf(x)

3. reproject selected cog to cdl crs

In [None]:
def reproject_hls_to_cdl(scene_folder,
                         bands = ["B02", "B03", "B04", "B8A", "B11", "B12", "QA"],
                         cdl_file = "/data/2021_30m_cdls_clipped.tif"):
    
    """
    This function receives the path to a folder that contains all GeoTIFF files (for various bands)
    of a HLS scene, and reprojects those to the target CDL CRS and grid. 
    
    Assumptions:
    - scene_folder has a file structure like: ".../<scene_id>/<scene_id>.<band_id>.tiff
    - scene_folder should not have a "/" at the end
    
    Inputs:
    - scene_folder: is the path to the folder that contains HLS GeoTIFF files for all bands of HLS
    - bands: list of bands of HLS that should be reprojected (default is all bands)
    - cdl_file: contains the path to the clipped CDL GeoTIFF file
    
    """
    
    for band in bands:
        xds = xarray.open_rasterio(f"{scene_folder}/{scene_folder.split('/')[-1]}.{band}.tif")
        cdl = xarray.open_rasterio(cdl_file)
        xds_new = xds.rio.reproject_match(cdl, resampling = Resampling.bilinear)
        xds_new.rio.to_raster(raster_path = f"{scene_folder}/{scene_folder.split('/')[-1]}.{band}.5070.tif")

In [None]:
tile_tracker = pd.read_csv(tile_tracker_csv)
#tiles_already_converted = set([i[19:24] for i in tiles_already_converted])
tiles_to_reproject = tile_tracker[(tile_tracker.exclude == False) & (tile_tracker.tif_convert == True) & (tile_tracker.tif_reproject == False) ].tile.unique()
#tiles_to_reproject = tiles_to_reproject[0:3]
tiles_to_reproject

# (image_index[image_index.converted == True]).reset_index(drop = True)
# print(selected_images)

In [None]:
# bad_columns = tile_tracker.columns.tolist()
# bad_columns = [column for column in bad_columns if 'Unnamed' in column]
# print(bad_columns)
# tile_tracker = tile_tracker.drop(bad_columns, axis=1)
# tile_tracker.to_csv(tile_tracker_csv, index=False)
# tile_tracker

In [None]:
# chip_df = pd.read_csv(chip_csv)
# bad_columns = chip_df.columns.tolist()
# bad_columns = [column for column in bad_columns if 'Unnamed' in column]
# print(bad_columns)
# chip_df = chip_df.drop(bad_columns, axis=1)
# chip_df.to_csv(chip_df_csv, index=False)
# chip_df

In [None]:
tile_tracker = pd.read_csv(tile_tracker_csv)
tile_tracker

In [None]:
for tile in tiles_to_reproject:
    selected_images = glob(tif_dir + '*')
    #print(selected_images)
    selected_images = [image for image in selected_images if image[19:24] == tile]
    print(selected_images)
        ## reproject to cdl
    for k in range(len(selected_images)):
        image_id = selected_images[k]
        print(image_id)
        reproject_hls_to_cdl(image_id)
    tile_tracker = pd.read_csv(tile_tracker_csv)
    tile_tracker.loc[tile_tracker.tile == tile , 'tif_reproject'] = True
    tile_tracker.to_csv(tile_tracker_csv, index=False)

4. chipping

In [8]:
tile_tracker = pd.read_csv(tile_tracker_csv)
tiles_to_chip = tile_tracker[(tile_tracker.exclude == False) & (tile_tracker.tif_reproject == True) & (tile_tracker.chip == False) ].tile.unique()
tiles_to_chip

array(['14SMC', '14SMB', '14SMA', '14SNC', '14SNB', '14SNA', '14SNE',
       '14SND', '14SNF', '14SPA', '14SPC', '14SPE', '14SPD', '14SPF',
       '14SQA'], dtype=object)

In [9]:
def check_qa(qa_path, shape,  valid_qa = [0, 4, 32, 36, 64, 68, 96, 100, 128, 132, 160, 164, 192, 196, 224, 228]):
    
    """
    This function receives a path to a qa file, and a geometry. It clips the QA file to the geometry. 
    It returns the number of valid QA pixels in the geometry, and the clipped values.
    
    Assumptions: The valid_qa values are taken from Ben Mack's post:
    https://benmack.github.io/nasa_hls/build/html/tutorials/Working_with_HLS_datasets_and_nasa_hls.html
    
    Inputs:
    - qa_path: full path to reprojected QA tif file
    - shape: 'geometry' property of single polygon feature read by fiona
    - valid_qa: list of integer values that are 'valid' for QA band.
    

    
    """
    with rasterio.open(qa_path) as src:
        out_image, out_transform = rasterio.mask.mask(src, shape, crop=True)
      #  print(out_image.shape)
        vals = out_image.flatten()
        unique, counts = np.unique(vals, return_counts=True)
        qa_df = pd.DataFrame({"qa_val" : unique, "counts" : counts})
        qa_df
        qa_df[~ qa_df.qa_val.isin(valid_qa)].sort_values(['counts'], ascending = False)
        qa_df['pct'] = (100 *qa_df['counts'])/(224.0 * 224.0)
        
        bad_qa = qa_df[~ qa_df.qa_val.isin(valid_qa)].sort_values(['counts'], ascending = False)
        if len(bad_qa) > 0:
            highest_invalid_percent = bad_qa.pct.tolist()[0]
        else: 
            highest_invalid_percent = 0
        #ncell = len(vals)
        valid_count = sum(x in valid_qa for x in vals)
        return(valid_count, highest_invalid_percent, out_image[0])

    

In [10]:
## set up CDL reclass
cdl_class_df = pd.read_csv(cdl_reclass_csv)
crop_dict = dict(zip(cdl_class_df.CDL_val, cdl_class_df.new_class_value))

In [11]:
def crop_reclass(x):
    ## binary reclass
    crop_classes = [1,2,3,4,5,6,10,11,12,13,14,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,66,67,68,69,70,71,72,74,75,76,77,92,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,236,237,238,240,241,242,243,244,245,246,247,248,249,250,254]
    return(crop_classes.count(x))

c_rcl = np.vectorize(crop_reclass)


def crop_multi(x):
    return(crop_dict[x])


c_multi = np.vectorize(crop_multi)

In [12]:
def reorder_bands(file_list, band_order = ["B02", "B03", "B04", "B8A", "B11", "B12", "QA"]):
    reordered = []
    for band in band_order:
        band_dots = '.' + band + '.'
        file_name = [s for s in file_list if band_dots in s]
        print('file_name')
        print(file_name)
        assert (len(file_name) == 1)
        reordered.append(file_name[0])
    print(reordered)
    return(reordered)

In [13]:
def process_chip(chip_id, 
                 chip_tile,
                 shape,
                 bands = ["B02", "B03", "B04", "B8A", "B11", "B12", "QA"]):
    
    """
    This function receives a chip id, HLS tile, chip geometry, and a list of bands to process. 
    
    Assumptions:
    
    Inputs:
    - chip_id: string of chip id, e.g. '000_001'
    - chip_tile: string of HLS tile , e.g. '15ABC'
    - shape: 'geometry' property of single polygon feature read by fiona
    
    The function writes out a multi-date TIF containing the bands for each of the three image dates for an HLS tile. 
    The function writes out a multi-date TIF containing the QA bands of each date.
    The function writes out a chipped version of CDL. 
    The function calls check_qa(), which makes assumptions about what QA pixels are valid.
    The function returns the number of valid QA pixels at each date, as a tuple.
    
    """
    ## get reprojected image paths
    selected_image_folders = sorted(glob(f'/data/tif/*T{chip_tile}*'))
   # print(selected_image_folders)
    
    assert len(selected_image_folders) == 3
    
    first_image_date = selected_image_folders[0][25:32]
    second_image_date = selected_image_folders[1][25:32]
    third_image_date = selected_image_folders[2][25:32]
    
    first_date_images = sorted(glob(selected_image_folders[0] + '/*.5070.tif'))
    
    first_date_images = reorder_bands(first_date_images, band_order = bands)
    first_date_qa = [x for x in first_date_images if '.QA.' in x][0]
    first_date_images.remove(first_date_qa)
    
    second_date_images = sorted(glob(selected_image_folders[1] + '/*.5070.tif'))
    second_date_images = reorder_bands(second_date_images, band_order = bands)

    second_date_qa = [x for x in second_date_images if '.QA.' in x][0]
    second_date_images.remove(second_date_qa)
    
    third_date_images = sorted(glob(selected_image_folders[2] + '/*.5070.tif'))
    third_date_images = reorder_bands(third_date_images, band_order = bands)

    third_date_qa = [x for x in third_date_images if '.QA.' in x][0]
    third_date_images.remove(third_date_qa)
    all_date_images = first_date_images + second_date_images + third_date_images
    
    print('all date images')
    print(all_date_images)
  #  print(len(all_date_images))


    valid_first, bad_pct_first, qa_first = check_qa(first_date_qa, shape)
    valid_second, bad_pct_second, qa_second = check_qa(second_date_qa, shape)
    valid_third, bad_pct_third, qa_third = check_qa(third_date_qa, shape)
    
    qa_bands = []
    qa_bands.append(qa_first)
    qa_bands.append(qa_second)
    qa_bands.append(qa_third)
    qa_bands = np.array(qa_bands).astype(np.int16)
    
  #  print(qa_bands.shape)
   # print(first_date_qa)
    assert len(all_date_images) == 3 * (len(bands) - 1)
    
    out_bands = []
    print('out_bands_loop')
    for img in all_date_images:
        with rasterio.open(img) as src:
            print(img)
            out_image, out_transform = rasterio.mask.mask(src, shape, crop=True)
            out_meta = src.meta
            out_bands.append(out_image[0])
    
    out_bands = np.array(out_bands)
    # print(out_bands.shape)
    # print(out_image.shape)

    out_meta.update({"driver": "GTiff",
                     "height": out_bands.shape[1],
                     "width": out_bands.shape[2],
                     "count": out_bands.shape[0],
                     "transform": out_transform})
    
    # get NA count for HLS
    na_count = sum(out_bands.flatten() == -1000)
    
    # reclass negative HLS values to 0
    out_bands = np.clip(out_bands, 0, None)
    
    
    
    # write HLS chip to 'chips'
    with rasterio.open(chip_dir + "chip_" + str(chip_id) + "_merged.tif", "w", **out_meta) as dest:
        dest.write(out_bands)
    # write HLS chip to 'chips_binary'
    with rasterio.open(chip_dir_binary + "chip_" + str(chip_id) + "_merged.tif", "w", **out_meta) as dest:
        dest.write(out_bands)
    # write HLS chip to 'chips_multi'
    with rasterio.open(chip_dir_multi + "chip_" + str(chip_id) + "_merged.tif", "w", **out_meta) as dest:
        dest.write(out_bands)
      
    ## write QA bands
    out_meta.update({"driver": "GTiff",
                     "height": qa_bands.shape[1],
                     "width": qa_bands.shape[2],
                     "count": qa_bands.shape[0],
                     "transform": out_transform})
    
    with rasterio.open(chip_qa_dir + "chip_" + str(chip_id) + "_qa.tif", "w", **out_meta) as dest:
        dest.write(qa_bands)  
    
        
    ## clip cdl to chip
    with rasterio.open("/data/2021_30m_cdls_clipped.tif") as src:
        out_image, out_transform = rasterio.mask.mask(src, shape, crop=True)
        out_meta = src.meta
        colormap = src.colormap(1)

    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})
    # write CDL chip to 'chips'
    with rasterio.open(chip_dir + "chip_" + str(chip_id) + ".mask.tif", "w", **out_meta) as dest:
        dest.write(out_image)
        dest.write_colormap(1, colormap)
        
        
    # write binary  reclassed CDL chip to chips_binary
    out_image_binary = c_rcl(out_image).astype(np.uint8)
    with rasterio.open(chip_dir_binary + "chip_" + str(chip_id) + ".mask.tif", "w", **out_meta) as dest:
        dest.write(out_image_binary)
        dest.write_colormap(1, colormap)
        
    # write multiclass  reclassed CDL chip to chips_multi
    out_image_multi = c_multi(out_image).astype(np.uint8)
    with rasterio.open(chip_dir_multi + "chip_" + str(chip_id) + ".mask.tif", "w", **out_meta) as dest:
        dest.write(out_image_multi)
        dest.write_colormap(1, colormap)
    
    
    return(valid_first,
           valid_second,
           valid_third, 
           bad_pct_first,
           bad_pct_second,
           bad_pct_third,
           qa_first,
           qa_second,
           qa_third,
           na_count,
           first_image_date,
           second_image_date,
           third_image_date)
    

In [None]:
## process chips
chip_df = pd.read_csv(chip_csv)

for tile in tiles_to_chip:
    print(tile)
    chips_to_process = chip_df[chip_df.tile == tile].reset_index(drop = True)
    for k in range(len(chips_to_process)):
        current_id = chips_to_process.chip_id[k]
        chip_tile = chips_to_process.tile[k]
    #    print(current_id)
        chip_index = chip_ids.index(current_id)

        chip_feature = chips['features'][chip_index]

        shape = [chip_feature['geometry']]

        ## do we want to scale/clip reflectances?

        valid_first,  valid_second, valid_third, bad_pct_first, bad_pct_second, bad_pct_third, qa_first, qa_second, qa_third, na_count, first_image_date, second_image_date, third_image_date = process_chip(current_id, chip_tile, shape)

        chip_df_index = chip_df.index[chip_df['chip_id'] == current_id].tolist()[0]
        chip_df.at[chip_df_index, 'valid_first'] = valid_first
        chip_df.at[chip_df_index, 'valid_second'] = valid_second
        chip_df.at[chip_df_index, 'valid_third'] = valid_third
        chip_df.at[chip_df_index, 'bad_pct_first'] = bad_pct_first
        chip_df.at[chip_df_index, 'bad_pct_second'] = bad_pct_second
        chip_df.at[chip_df_index, 'bad_pct_third'] = bad_pct_third
        chip_df.at[chip_df_index, 'first_image_date'] = first_image_date
        chip_df.at[chip_df_index, 'second_image_date'] = second_image_date
        chip_df.at[chip_df_index, 'third_image_date'] = third_image_date
        chip_df['bad_pct_max'] = chip_df[['bad_pct_first', 'bad_pct_second', 'bad_pct_third']].max(axis=1)
        chip_df.at[chip_df_index, 'na_count'] = na_count
    tile_tracker = pd.read_csv(tile_tracker_csv)
    tile_tracker.loc[tile_tracker.tile == tile , 'chip'] = True
    tile_tracker.to_csv(tile_tracker_csv, index=False)
chip_df.to_csv(chip_csv, index=False)

5. filter chips

In [18]:
tile_tracker = pd.read_csv(tile_tracker_csv)
tiles_to_filter = tile_tracker[(tile_tracker.exclude == False) & (tile_tracker.chip == True) & (tile_tracker.filter_chips == False) ].tile.unique()
tiles_to_filter

array(['14SMF', '14SME', '14SMD', '14SMC', '14SMB', '14SMA', '14SNC',
       '14SNB', '14SNA', '14SNE', '14SND', '14SNF', '14SPA', '14SPC',
       '14SPE', '14SPD', '14SPF', '14SQA', '14SQC', '14SQF', '14SQE',
       '14SQD', '15STA', '15STV', '15STU', '15STT', '15STS', '15STR',
       '15SUA', '15SUV', '15SUU', '15SUS', '15SUR', '15SVA', '15SVV',
       '15SVU', '15SVT', '15SVS'], dtype=object)

In [19]:
chip_df = pd.read_csv(chip_csv)

for tile in tiles_to_filter:
    print(tile)
    filtered_chips = chip_df[(chip_df.tile == tile) & (chip_df.bad_pct_max < 5) & (chip_df.na_count == 0)].chip_id.tolist()
    print(len(filtered_chips))
    for chip_id in filtered_chips:
        chip_files = glob('/data/chips/*' + chip_id + '*')
        for file in chip_files:
            name = file.split('/')[-1]
            shutil.copyfile(file, '/data/chips_filtered/' + name)
        chip_files_b = glob('/data/chips_binary/*' + chip_id + '*')
        for file in chip_files_b:
            name = file.split('/')[-1]
            shutil.copyfile(file, '/data/chips_binary_filtered/' + name)
        chip_files_multi = glob('/data/chips_multi/*' + chip_id + '*')
        for file in chip_files_multi:
            name = file.split('/')[-1]
            shutil.copyfile(file, '/data/chips_multi_filtered/' + name)
    
    tile_tracker = pd.read_csv(tile_tracker_csv)
    tile_tracker.loc[tile_tracker.tile == tile , 'filter_chips'] = True
    tile_tracker.to_csv(tile_tracker_csv, index=False)


14SMF
35
14SME
36
14SMD
11
14SMC
16
14SMB
1
14SMA
8
14SNC
208
14SNB
160
14SNA
88
14SNE
181
14SND
175
14SNF
135
14SPA
88
14SPC
175
14SPE
186
14SPD
136
14SPF
144
14SQA
81
14SQC
114
14SQF
126
14SQE
144
14SQD
106
15STA
64
15STV
138
15STU
94
15STT
126
15STS
146
15STR
69
15SUA
160
15SUV
184
15SUU
158
15SUS
102
15SUR
66
15SVA
122
15SVV
93
15SVU
152
15SVT
105
15SVS
106


In [None]:
# valid_qa = [0, 4, 32, 36, 64, 68, 96, 100, 128, 132, 160, 164, 192, 196, 224, 228]
# qa_df_all['valid'] = qa_df_all.qa_val.isin(valid_qa)
# qa_df_all
# qa_df_all.to_csv(root_path + "_" + str(ct) + 'qa_vals.csv')

In [None]:
# valid_qa = [0, 4, 32, 36, 64, 68, 96, 100, 128, 132, 160, 164, 192, 196, 224, 228]

# qa_df_all = pd.DataFrame(columns = ["qa_val", "counts", "chip_id", 'date'])

# for chip in qa_chips[0:2]:
#     vals = xarray.open_rasterio(chip)
#     for k in range(3):
#         date_vals = vals.data[k, :, :]
#         unique, counts = np.unique(date_vals, return_counts=True)
#         qa_df = pd.DataFrame({"qa_val" : unique, "counts" : counts})
#         qa_df['pct'] = (100 *qa_df['counts'])/(224.0 * 224.0)
#         qa_df['chip_id'] = chip.split('/')[-1][12:19]
#         qa_df['date'] = str(k)
#         qa_df_all = pd.concat([qa_df_all, qa_df])
        
# #qa_df_all.to_csv(root_path + 'qa_vals_date.csv')
# qa_df_all[~ qa_df_all.qa_val.isin(valid_qa)].sort_values(['counts'], ascending = False).pct.tolist()[0]

