# HLS image query, download and chipping pipeline

## Importing Packages

In [2]:
import geopandas
import json
import shapely
import shapely.geometry
import xarray
import rasterio
import rioxarray
import os
import fiona
import urllib.request as urlreq
import pandas as pd
import numpy as np
import requests
import xmltodict
import shutil
import datetime
import boto3
import pyproj
import time

from shapely.ops import transform
from shapely.geometry import Point
from shapely.geometry import Polygon
from pystac_client import Client 
from collections import defaultdict
from glob import glob
from rasterio.enums import Resampling
from rasterio import Affine
from rasterio.crs import CRS
import matplotlib.pyplot as plt
from subprocess import Popen, PIPE
from tqdm import tqdm
from netrc import netrc
from subprocess import Popen
from platform import system
from getpass import getpass
from rasterio.session import AWSSession
from pathlib import Path

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline

## Setting folder paths and file paths

In [3]:
##### START OPTIONS #####

## Maximum accepted cloud cover percentage of a scene (compared against the STAC 'eo:cloud_cover' property).
## Five is recommended for raw HLS images; a lower threshold may leave no scenes passing the filter.
cloud_thresh = 5

## Root paths
root_path = "/data/" # Place to save the downloaded/reprojected/chipped image data
req_path = "/home/data/" # Place to save all the CSV and GeoJSON files kept within the GitHub repo
extra_files = "/data/requirements/" # Place to save things that should not be uploaded to GitHub

## File paths
query_file = req_path + "chip_bbox_task_3.geojson" # Path to the EPSG:4326 geojson (used for the STAC query)
chip_file =  req_path + "chip_bbox_task_3_5070.geojson" # Path to the EPSG:5070 geojson (used for chipping)
chip_csv = req_path + "chip_tracker.csv" # Path to the chip tracking csv
kml_file = extra_files + 'sentinel_tile_grid.kml' # Path to the Sentinel tile grid KML file

## Save paths
## Creating these folders manually is recommended before running the pipeline
chip_dir = root_path + 'chips/' # Place to save the chipped data
tile_dir = root_path + 'tiles/' # Place to save the downloaded/reprojected tiles
chip_fmask_dir = root_path + 'chips_fmask/' # Place to save the chipped Fmask (QA) layers
chip_dir_filt = root_path + 'chips_filtered/' # Place to save the chips that pass the final filter

#####  END OPTIONS  #####

## Read in CSVs and JSONs from saved files (only run when you need to load previously saved query results)

In [None]:
# Restore the dataframes and JSON documents written by an earlier run
chip_df = pd.read_csv(req_path + "chip_df.csv")
with open(req_path + "chip_ids.json", 'r') as ids_handle:
    chip_ids = json.load(ids_handle)
track_df = pd.read_csv(req_path + "track_df.csv")
# NOTE(review): this loads the EPSG:5070 chip file into `chips`, while the STAC query
# cell reads `chips` as the query geometry -- confirm which file is intended here.
with open(chip_file, "r") as chips_handle:
    chips = json.load(chips_handle)
selected_tiles = pd.read_csv(req_path + "selected_tiles.csv")

## Data Processing

In [4]:
# Parse the chip bounding boxes from the query geojson
with open(query_file, "r") as query_handle:
    chips = json.load(query_handle)

# Parallel lists (in feature order) of chip id and chip center coordinates,
# used later to find the tile that corresponds to each chip
chip_props = [feature['properties'] for feature in chips['features']]
chip_ids = [props['id'] for props in chip_props]
chip_x = [props['center'][0] for props in chip_props]
chip_y = [props['center'][1] for props in chip_props]

In [None]:
# Save the chip_ids for chipping uses.
# Written under req_path because the reload cells read req_path + "chip_ids.json";
# saving under extra_files left those cells without a file to load.
with open(req_path + "chip_ids.json", "w") as f:
    json.dump(chip_ids, f, indent=2)

In [5]:
# Enable fiona's KML driver so geopandas can read the sentinel grid file
fiona.drvsupport.supported_drivers['KML'] = 'rw'
tile_src = geopandas.read_file(kml_file, driver='KML')

# Parallel arrays with the name and centroid coordinates of every sentinel tile
# (these are the lookup tables used by find_tile)
tile_centroids = [geom.centroid for geom in tile_src.geometry]
tile_name = np.array(tile_src['Name'].tolist())
tile_x = np.array([centroid.x for centroid in tile_centroids])
tile_y = np.array([centroid.y for centroid in tile_centroids])

# Append the bounding box columns (minx, miny, maxx, maxy) of each tile
tile_src = pd.concat([tile_src, tile_src.bounds], axis = 1)

In [6]:
tile_src.head(5)

Unnamed: 0,Name,Description,geometry,minx,miny,maxx,maxy
0,01CCV,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -7...,-180.0,-73.064633,180.0,-72.012478
1,01CDH,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-83.835334,180.0,-82.79672
2,01CDJ,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-82.939452,180.0,-81.906947
3,01CDK,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-82.044055,180.0,-81.016439
4,01CDL,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-81.14807,180.0,-80.124456


In [7]:
# Function that help to match the chip shape to a certain tile
def find_tile(x,y):
    """
        Return the name of the tile whose centroid is closest to the point (x, y)

        Reads the module-level arrays tile_x, tile_y and tile_name.
        Closeness is the squared difference of the raw coordinates.

        Args:
            x: x coordinate of the point (same units as tile_x)
            y: y coordinate of the point (same units as tile_y)
    """
    dx = tile_x - x
    dy = tile_y - y
    nearest = np.argmin(dx**2 + dy**2)
    return tile_name[nearest]

In [8]:
# Build the chip table and attach the closest tile (see find_tile) to every chip
chip_df = pd.DataFrame({"chip_id" : chip_ids, "chip_x" : chip_x, "chip_y" : chip_y})
chip_df['tile'] = [find_tile(cx, cy) for cx, cy in zip(chip_df['chip_x'], chip_df['chip_y'])]

In [9]:
chip_df.tail(5)

Unnamed: 0,chip_id,chip_x,chip_y,tile
9995,chip_115_314,-99.060383,44.52233,14TMQ
9996,chip_148_412,-90.907704,42.470597,15TXH
9997,chip_081_204,-108.698883,46.021321,12TXS
9998,chip_237_443,-88.91072,37.022803,16SCG
9999,chip_109_315,-98.991689,44.886159,14TMQ


In [None]:
# Save dataframe to csv for later uses
chip_df.to_csv(req_path + "chip_df.csv", index=False)

In [10]:
tiles = chip_df.tile.unique().tolist()
tiles[0:5]

['13TDE', '16SDD', '13SFV', '14TNS', '14UMU']

In [11]:
len(tiles)

602

## Querying tile links based on geometry of chips

In [12]:
# NASA CMR STAC endpoint; the LPCLOUD catalog is opened once and reused by the tile query
STAC_URL = 'https://cmr.earthdata.nasa.gov/stac'
catalog = Client.open(STAC_URL + '/LPCLOUD/')

In [16]:
def get_meta_to_df(tiles, chip_df, chip_ids, cloud_thresh):
    """
        Query the HLSS30 STAC collection tile by tile and collect the metadata of low-cloud scenes

        Relies on the module-level `catalog` (STAC client) and `chips` (chip geojson).
        The geometry of the first chip assigned to a tile is used as the search area.

        Args:
            tiles: List of tile names without the leading "T", e.g. '13TDE'
            chip_df: A pandas dataframe with at least the `chip_id` and `tile` columns
            chip_ids: List of chip ids in the same order as chips['features']
            cloud_thresh: Highest accepted 'eo:cloud_cover' value of a scene

        Returns:
            A tuple (dataframe with one row per accepted scene, list of tile names that hit a failure).
            A tile is appended to the failure list once per failed request, so it may appear several times.
    """
    bands = ["B02", "B03", "B04", "B8A", "B11", "B12", "Fmask"]
    http_prefix = 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/'
    max_attempts = 5

    tile_list = []
    failed_list = []
    print(f"There are a total of {len(tiles)} tiles")
    for current_tile in tqdm(tiles):

        tile_name = "T" + current_tile

        chip_df_filt = chip_df.loc[chip_df.tile == current_tile]
        first_chip_id = chip_df_filt.chip_id.iloc[0]
        first_chip_index_in_json = chip_ids.index(first_chip_id)
        roi = chips['features'][first_chip_index_in_json]['geometry']

        search = catalog.search(
            collections = ['HLSS30.v2.0'],
            intersects = roi,
            datetime = '2022-03-01/2022-09-30',
        )

        # Probe the API first; the for/else reaches the else branch only if every attempt failed
        for attempt in range(max_attempts):
            try:
                search.matched()
            except Exception:
                time.sleep(1)
            else:
                break
        else:
            print(f"After {max_attempts} retry problem presists for tile {tile_name} with STAC-API error, check your internet connection, the geojson file and the status of NASA STAC-API and try again. Continue to next tile")
            failed_list.append(tile_name)
            continue

        item_collection = search.get_all_items()

        for item in item_collection:
            # The search area can intersect neighbouring tiles; keep only the wanted tile
            if item.id.split('.')[2] != tile_name:
                continue
            if not (item.properties['eo:cloud_cover'] <= cloud_thresh):
                continue

            metadata_url = item.assets['metadata'].href
            respond_code = 0
            response = None
            for attempt in range(max_attempts):
                try:
                    response = requests.get(metadata_url)
                except requests.RequestException:
                    # Treat a network error like a failed status code and retry
                    time.sleep(1)
                    continue
                respond_code = response.status_code
                if respond_code == 200:
                    break
                time.sleep(1)
            else:
                # Every attempt failed (the original `attempt == 5` check could never be true)
                print(f"After {max_attempts} retry problem presists for tile {tile_name}, failed to fetch XML data from {metadata_url}, error code {respond_code}, continue to next tile.")
                failed_list.append(tile_name)
                continue

            temp_xml = xmltodict.parse(response.text)
            # Every band link, including Fmask, is stored as a plain href string
            http_links = {band: item.assets[band].href for band in bands}
            s3_links = {band: href.replace(http_prefix, 's3:/') for band, href in http_links.items()}
            tile_list.append({
                "tile_id": tile_name,
                "cloud_cover": item.properties['eo:cloud_cover'],
                "date": datetime.datetime.strptime(item.properties['datetime'].split('T')[0], "%Y-%m-%d"),
                # NOTE(review): assumes the 4th AdditionalAttribute is the spatial coverage -- TODO confirm
                "spatial_cover": int(temp_xml['Granule']['AdditionalAttributes']['AdditionalAttribute'][3]['Values']['Value']),
                "http_links": http_links,
                "s3_links": s3_links,
            })

    return pd.DataFrame(tile_list), failed_list

In [17]:
tile_df, failed_list = get_meta_to_df(tiles[0:5], chip_df, chip_ids, cloud_thresh)

There are a total of 5 tiles


100%|█████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:56<00:00, 11.39s/it]


In [None]:
# Save to csv for later uses
tile_df.to_csv(req_path + "tile_df.csv", index=False)

In [15]:
tile_df.head()

Unnamed: 0,tile_id,cloud_cover,date,spatial_cover,http_links,s3_links
0,T13TDE,1,2022-03-14,25,{'B02': 'https://data.lpdaac.earthdatacloud.na...,{'B02': 's3:/HLSS30.020/HLS.S30.T13TDE.2022073...
1,T13TDE,0,2022-04-08,25,{'B02': 'https://data.lpdaac.earthdatacloud.na...,{'B02': 's3:/HLSS30.020/HLS.S30.T13TDE.2022098...
2,T13TDE,0,2022-04-18,25,{'B02': 'https://data.lpdaac.earthdatacloud.na...,{'B02': 's3:/HLSS30.020/HLS.S30.T13TDE.2022108...
3,T13TDE,3,2022-04-21,99,{'B02': 'https://data.lpdaac.earthdatacloud.na...,{'B02': 's3:/HLSS30.020/HLS.S30.T13TDE.2022111...
4,T13TDE,2,2022-05-26,99,{'B02': 'https://data.lpdaac.earthdatacloud.na...,{'B02': 's3:/HLSS30.020/HLS.S30.T13TDE.2022146...


## Filtering based on cloud and spatial coverage of the tiles we gathered earlier

In [None]:
def spatial_filtering(dataframe, min_scenes=3, cover_list=(100, 90, 80, 70, 60, 50)):
    """
        Using spatial coverage percentage to filter the scenes of each tile

        For every tile the coverage thresholds are tried from first to last; the scenes of the
        first threshold that keeps at least `min_scenes` scenes are retained. Tiles that never
        reach `min_scenes` scenes are dropped.

        Args:
            dataframe: A pandas dataframe that generated from the query above
                       (needs the `tile_id` and `spatial_cover` columns)
            min_scenes: Number of timestep wish to get for each tile
            cover_list: Spatial coverage thresholds (percent), tried in the given order

        Returns:
            A dataframe with the retained rows (original index labels kept). When nothing is
            retained, an empty dataframe that still carries the input columns.
    """
    kept_frames = []

    for tile in tqdm(dataframe.tile_id.unique().tolist()):
        temp_df = dataframe[dataframe.tile_id == tile]
        for cover_pct in cover_list:
            temp_df_filtered = temp_df[temp_df.spatial_cover >= cover_pct]
            if len(temp_df_filtered) >= min_scenes:
                kept_frames.append(temp_df_filtered)
                break

    if not kept_frames:
        # Keep the columns so that downstream column access still works on an empty result
        return dataframe.iloc[0:0].copy()
    return pd.concat(kept_frames)

In [None]:
def select_scenes(dataframe):
    """
        Pick three scenes per tile: the earliest, the middle and the latest one by date

        Args:
            dataframe: A pandas dataframe that generated by the spatial_filtering function

        Returns:
            A dataframe with three rows per tile and a fresh 0..n-1 index
    """
    picked_rows = []

    for tile in tqdm(dataframe.tile_id.unique().tolist()):
        scenes = dataframe[dataframe.tile_id == tile]
        scenes = scenes.sort_values('date').reset_index(drop=True)
        middle = len(scenes) // 2
        for position in (0, middle, -1):
            picked_rows.append(scenes.iloc[position])

    selected = pd.DataFrame(picked_rows)
    return selected.reset_index(drop=True)

In [None]:
def tile_filter_process(dataframe):
    """
        Run the spatial coverage filter followed by the per-tile scene selection

        Args:
            dataframe: A pandas dataframe that generated from the query above
    """
    return select_scenes(spatial_filtering(dataframe))

In [None]:
selected_tiles = tile_filter_process(tile_df)

In [None]:
selected_tiles.head()

In [None]:
# Save to csv for later uses
selected_tiles.to_csv(req_path + "selected_tiles.csv", index=False)

## Data downloading

### Creating netrc file on root for credentials (Run Once each docker session)

In [None]:
urs = 'urs.earthdata.nasa.gov'    # Earthdata URL endpoint for authentication
prompts = ['Enter NASA Earthdata Login Username: ',
           'Enter NASA Earthdata Login Password: ']

# Determine the OS (Windows machines usually use an '_netrc' file)
netrc_name = "_netrc" if system()=="Windows" else ".netrc"
homeDir = os.path.expanduser("~")
netrcDir = os.path.expanduser(f"~/{netrc_name}")

# Determine if netrc file exists, and if so, if it includes NASA Earthdata Login Credentials
try:
    netrc(netrcDir).authenticators(urs)[0]

# FileNotFoundError: there is no netrc file yet.
# TypeError: the file exists but has no entry for the Earthdata host (authenticators() returned None).
# In both cases prompt for the credentials and append an entry.
except (FileNotFoundError, TypeError):
    earthdata_user = getpass(prompt=prompts[0])
    earthdata_password = getpass(prompt=prompts[1])
    # Write the entry directly from Python: the previous un-awaited shell commands could run
    # out of order and passed the password through a shell command line.
    with open(netrcDir, "a") as netrc_file:
        netrc_file.write(f"machine {urs}\n")
        netrc_file.write(f"login {earthdata_user}\n")
        netrc_file.write(f"password {earthdata_password}\n")
    # Set restrictive permissions
    os.chmod(netrcDir, 0o600)

### Getting temporary credentials from NASA's S3 Bucket (Run once each docker session to make sure it works)

In [None]:
# Endpoint for NASA's s3 temp credential
s3_cred_endpoint = 'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials'

In [None]:
def get_temp_creds():
    """
        Request temporary S3 credentials from the endpoint stored in s3_cred_endpoint

        Returns:
            The decoded JSON body of the response (the callers read the keys
            'accessKeyId', 'secretAccessKey' and 'sessionToken' from it)

        Raises:
            requests.HTTPError: If the endpoint answers with an error status
    """
    response = requests.get(s3_cred_endpoint)
    # Fail with the HTTP status instead of an obscure JSON/KeyError further down
    response.raise_for_status()
    return response.json()

In [None]:
temp_creds_req = get_temp_creds()
#temp_creds_req                      # !!! BEWARE, removing the # on this line will print your temporary S3 credentials.

In [None]:
# boto3 session in us-west-2 built from the temporary credentials in temp_creds_req
session_kwargs = {
    "aws_access_key_id": temp_creds_req['accessKeyId'],
    "aws_secret_access_key": temp_creds_req['secretAccessKey'],
    "aws_session_token": temp_creds_req['sessionToken'],
    "region_name": 'us-west-2',
}
session = boto3.Session(**session_kwargs)

In [None]:
# GDAL/rasterio environment bound to the AWS session. It is entered here without a
# matching exit, so it stays active for the following cells.
cookie_file = os.path.expanduser('~/cookies.txt')
rio_env = rasterio.Env(
    AWSSession(session),
    GDAL_DISABLE_READDIR_ON_OPEN='EMPTY_DIR',
    GDAL_HTTP_COOKIEFILE=cookie_file,
    GDAL_HTTP_COOKIEJAR=cookie_file,
)
rio_env.__enter__()

### Tile downloading 

In [None]:
## The download process requests a fresh temporary credential for every tile.
## A multi-processed version is WIP.
def tile_download(table, from_csv = True):
    """
        Downloading tiles by reading from the metadata information gathered earlier

        Files are saved to <tile_dir>/<granule folder>/<file name>; a file that already exists
        is not downloaded again. The first three rows of each tile are used as timesteps.

        Args:
            table: A pandas dataframe that generated previously
                   (needs the `tile_id`, `date` and `s3_links` columns)
            from_csv: If the tile information is from a csv, the dict structured `s3_links`
                      is stored as a string and gets parsed back into a dict

        Returns:
            A dataframe with one row per tile and timestep
            (columns: tile, timestep, date, save_path, filename)
    """
    import ast  # local import: only needed to parse the csv-stored dict strings

    timesteps_per_tile = 3 # Number of timestep wish to get for each tile
    info_list = []
    bands = ["B02","B03","B04","B8A","B11","B12","Fmask"]
    accept_tiles = np.unique(table.tile_id)
    for tile in tqdm(accept_tiles):

        temp_creds_req = get_temp_creds()
        session = boto3.Session(aws_access_key_id=temp_creds_req['accessKeyId'],
                        aws_secret_access_key=temp_creds_req['secretAccessKey'],
                        aws_session_token=temp_creds_req['sessionToken'],
                        region_name='us-west-2')
        # One bucket handle per credential instead of one per downloaded file
        bucket = session.resource('s3').Bucket('lp-prod-protected')

        temp_tb = table[table.tile_id == tile]
        for i in range(timesteps_per_tile):
            scene = temp_tb.iloc[i]
            if from_csv:
                # The csv holds the repr of a dict of strings; literal_eval parses it without
                # the quote swapping that json.loads needed
                bands_dict = ast.literal_eval(scene.s3_links)
            else:
                bands_dict = scene.s3_links

            for band in bands:
                # Links look like "s3:/<collection>/<granule folder>/<file name>"
                link_parts = bands_dict[band].split('/')
                temp_key = bands_dict[band].replace("s3:/", "")
                band_dir = f"{tile_dir}{link_parts[2]}"
                temp_sav_path = f"{band_dir}/{link_parts[3]}"
                os.makedirs(band_dir, exist_ok=True)
                if not Path(temp_sav_path).is_file():
                    bucket.download_file(Key = temp_key, Filename = temp_sav_path)

            # Folder and file stem are taken from one explicit band; all bands of a scene are
            # expected to share the granule folder
            first_band_parts = bands_dict["B02"].split('/')
            temp_dict = {"tile":tile, "timestep":i, "date":scene.date,
                         "save_path":f"{tile_dir}{first_band_parts[2]}/",
                         "filename":first_band_parts[3].replace(".B02.tif","")}
            info_list.append(temp_dict)
    return pd.DataFrame(info_list)

In [None]:
# If the process is run without loading the "selected tiles" from a saved CSV, then put from_csv = False
track_df = tile_download(selected_tiles) 

In [None]:
track_df.to_csv(req_path + "track_df.csv", index=False)

## Chipping (Run hls_reprojecting.ipynb before going into following chunks)

In [None]:
## Restore the saved dataframes and JSON documents
## Only run when continuing the process from a docker restart/When not running the pipeline at once
chip_df = pd.read_csv(req_path + "chip_df.csv")
with open(req_path + "chip_ids.json", 'r') as ids_handle:
    chip_ids = json.load(ids_handle)
track_df = pd.read_csv(req_path + "track_df.csv")
with open(chip_file, "r") as chips_handle:
    chips = json.load(chips_handle)

In [None]:
# Tiles that have tracking information, and the chip geometries used for chipping
with open(chip_file, "r") as chipping_handle:
    chipping_js = json.load(chipping_handle)
tiles_to_chip = track_df.tile.unique().tolist()

### Chipping functions

In [None]:
def check_qa(qa_path, shape,  valid_qa = (0, 4, 32, 36, 64, 68, 96, 100, 128, 132, 160, 164, 192, 196, 224, 228)):

    """
    This function receives a path to a qa file, and a geometry. It clips the QA file to the geometry.
    It returns the number of valid QA pixels in the geometry, the share of the most frequent
    invalid QA value, and the clipped values.

    Assumptions: The valid_qa values are taken from Ben Mack's post:
    https://benmack.github.io/nasa_hls/build/html/tutorials/Working_with_HLS_datasets_and_nasa_hls.html
    The percentage is computed against a fixed chip size of 224 x 224 pixels.

    Inputs:
    - qa_path: full path to reprojected QA tif file
    - shape: 'geometry' property of single polygon feature read by fiona
    - valid_qa: sequence of integer values that are 'valid' for QA band.

    Returns:
    - valid_count: number of clipped pixels whose value is in valid_qa
    - highest_invalid_percent: 100 * (count of the most frequent invalid value) / (224 * 224),
      or 0 when there is no invalid value
    - the first band of the clipped QA raster

    """
    # Import the submodule explicitly: `import rasterio` alone does not guarantee that
    # `rasterio.mask` is loaded
    import rasterio.mask

    chip_pixels = 224.0 * 224.0

    with rasterio.open(qa_path) as src:
        out_image, out_transform = rasterio.mask.mask(src, shape, crop=True)

    # Count every distinct QA value once, then split the counts into valid and invalid
    unique, counts = np.unique(out_image, return_counts=True)
    is_valid = np.isin(unique, valid_qa)

    valid_count = int(counts[is_valid].sum())

    bad_counts = counts[~is_valid]
    if bad_counts.size > 0:
        highest_invalid_percent = float((100 * bad_counts.max()) / chip_pixels)
    else:
        highest_invalid_percent = 0

    return(valid_count, highest_invalid_percent, out_image[0])


In [None]:
def process_chip(chip_id,
                 chip_tile,
                 shape,
                 track_csv,
                 bands = ("B02", "B03", "B04", "B8A", "B11", "B12")):

    """
    This function receives a chip id, HLS tile, chip geometry, and a list of bands to process.

    Inputs:
    - chip_id: string of chip id, e.g. '000_001'
    - chip_tile: string of HLS tile as stored in the `tile` column of track_csv
    - shape: 'geometry' property of single polygon feature read by fiona
    - track_csv: The csv that contains path and other information. Generated earlier in the pipeline
    - bands: The bands that wish to be included in the HLS image chips

    The function writes out a multi-date TIF containing the bands for each of the three image dates for an HLS tile
    (to chip_dir, named <chip_id>_merged.tif).
    The function writes out a multi-date TIF containing the QA bands of each date
    (to chip_fmask_dir, named <chip_id>_Fmask.tif).
    The function calls check_qa(), which makes assumptions about what QA pixels are valid.
    The function returns the number of valid QA pixels at each date for chip quality control purposes.

    Returns a tuple of:
    (valid_first, valid_second, valid_third, bad_pct_first, bad_pct_second, bad_pct_third,
     na_count, first_image_date, second_image_date, third_image_date)

    Raises ValueError when the tile does not have exactly three timesteps in track_csv.

    """
    # Import the submodule explicitly: `import rasterio` alone does not guarantee that
    # `rasterio.mask` is loaded
    import rasterio.mask

    n_dates = 3

    ## get reprojected image paths from tracking csv
    tile_info_df = track_csv[track_csv.tile == chip_tile]

    # Check if each tile contains 3 timesteps (explicit error instead of an assert,
    # which is skipped when Python runs with -O)
    if len(tile_info_df) != n_dates:
        raise ValueError(f"Tile {chip_tile} has {len(tile_info_df)} timesteps in the tracking csv, expected {n_dates}")

    # Gather date information and the file paths of every date
    image_dates = []
    all_date_images = []
    all_date_qa = []
    for i in range(n_dates):
        scene = tile_info_df.iloc[i]
        image_dates.append(scene.date)
        for band in bands:
            all_date_images.append(scene.save_path + f"{scene.filename}.{band}.reproject.tif")
        all_date_qa.append(scene.save_path + f"{scene.filename}.Fmask.reproject.tif")

    # QA statistics and clipped QA band of each date
    qa_results = [check_qa(qa_path, shape) for qa_path in all_date_qa]
    valid_counts = [result[0] for result in qa_results]
    bad_pcts = [result[1] for result in qa_results]
    qa_bands = np.array([result[2] for result in qa_results]).astype(np.uint8)

    # Clip every band of every date; band order is date-major (all bands of date 1, then date 2, ...)
    out_bands = []
    for img in all_date_images:
        with rasterio.open(img) as src:
            out_image, out_transform = rasterio.mask.mask(src, shape, crop=True)
            out_meta = src.meta
            out_bands.append(out_image[0])
    # Save to a numpy array
    out_bands = np.array(out_bands)
    # Update metadata for writing chips
    out_meta.update({"driver": "GTiff",
                     "height": out_bands.shape[1],
                     "width": out_bands.shape[2],
                     "count": out_bands.shape[0],
                     "transform": out_transform})

    # get NA count for HLS (-1000 is counted as the missing value)
    na_count = int(np.count_nonzero(out_bands == -1000))

    # reclass negative HLS values to 0
    out_bands = np.clip(out_bands, 0, None)

    # write HLS chips to chip_dir
    with rasterio.open(chip_dir + str(chip_id) + "_merged.tif", "w", **out_meta) as dest:
        dest.write(out_bands)

    ## Update metadata for QA bands chips
    # NOTE(review): dtype/nodata are inherited from the last image band's metadata while
    # qa_bands is uint8 -- confirm this is the intended on-disk format for the Fmask chips
    out_meta.update({"driver": "GTiff",
                     "height": qa_bands.shape[1],
                     "width": qa_bands.shape[2],
                     "count": qa_bands.shape[0],
                     "transform": out_transform})
    # Write QA chips to chip_fmask_dir
    with rasterio.open(chip_fmask_dir + str(chip_id) + "_Fmask.tif", "w", **out_meta) as dest:
        dest.write(qa_bands)

    return(valid_counts[0],
           valid_counts[1],
           valid_counts[2],
           bad_pcts[0],
           bad_pcts[1],
           bad_pcts[2],
           na_count,
           image_dates[0],
           image_dates[1],
           image_dates[2])
    

### Chipping process

In [None]:
## process chips
## The failed_tiles is to exclude tiles that might be damaged during the reprojecting process
failed_tiles = []

for tile in tqdm(tiles_to_chip):
    # track_df stores tiles with a leading "T", chip_df without it
    chips_to_process = chip_df[chip_df.tile == tile[1:]].reset_index(drop = True)

    for k in range(len(chips_to_process)):
        current_id = chips_to_process.chip_id[k]
        chip_tile = chips_to_process.tile[k]
        chip_index = chip_ids.index(current_id)
        chip_feature = chipping_js['features'][chip_index]
        shape = [chip_feature['geometry']]
        full_tile_name = "T" + chip_tile

        ## do we want to scale/clip reflectances?
        try:
            valid_first, valid_second, valid_third, bad_pct_first, bad_pct_second, bad_pct_third, na_count, first_image_date, second_image_date, third_image_date = process_chip(current_id, full_tile_name, shape, track_df)
        except Exception as err:
            # Give up on the rest of this tile, but report the cause and let
            # KeyboardInterrupt/SystemExit through (a bare except swallowed those too)
            tqdm.write(f"Chipping failed for tile {tile} at chip {current_id}: {err!r}")
            failed_tiles.append(tile)
            break

        chip_df_index = chip_df.index[chip_df['chip_id'] == current_id].tolist()[0]
        chip_df.at[chip_df_index, 'valid_first'] = valid_first
        chip_df.at[chip_df_index, 'valid_second'] = valid_second
        chip_df.at[chip_df_index, 'valid_third'] = valid_third
        chip_df.at[chip_df_index, 'bad_pct_first'] = bad_pct_first
        chip_df.at[chip_df_index, 'bad_pct_second'] = bad_pct_second
        chip_df.at[chip_df_index, 'bad_pct_third'] = bad_pct_third
        chip_df.at[chip_df_index, 'first_image_date'] = first_image_date
        chip_df.at[chip_df_index, 'second_image_date'] = second_image_date
        chip_df.at[chip_df_index, 'third_image_date'] = third_image_date
        chip_df.at[chip_df_index, 'na_count'] = na_count

# Worst per-date invalid percentage of every chip, computed once for the whole table
# instead of once per processed chip. Skipped when no chip was processed at all.
bad_pct_columns = ['bad_pct_first', 'bad_pct_second', 'bad_pct_third']
if all(column in chip_df.columns for column in bad_pct_columns):
    chip_df['bad_pct_max'] = chip_df[bad_pct_columns].max(axis=1)

chip_df.to_csv(req_path + "final_chip_tracker.csv", index=False)

### Selecting chips based on the na_count and such

In [None]:
final_chip_df =pd.read_csv(req_path + "final_chip_tracker.csv")

In [None]:
final_chip_df.head()

In [None]:
valid_chips_df = final_chip_df[final_chip_df.na_count.notnull()]

In [None]:
valid_chips_df.head()

In [None]:
# Keep chips whose worst invalid-QA share is below 5 percent and that have no missing pixels
filtered_chips = valid_chips_df[(valid_chips_df.bad_pct_max < 5) & (valid_chips_df.na_count == 0)].chip_id.tolist()

# Make sure the destination exists before copying
os.makedirs(chip_dir_filt, exist_ok=True)

for chip_id in tqdm(filtered_chips):
    # Look in the configured chip folder (chip_dir) rather than a hard-coded path
    chip_paths = glob(chip_dir + '*' + chip_id + '*')
    for path in chip_paths:
        shutil.copyfile(path, chip_dir_filt + os.path.basename(path))