In [14]:
import geopandas
import json
import shapely
import shapely.geometry
import xarray
import rasterio
import rioxarray
import os
import fiona
import nasa_hls
import urllib.request as urlreq
import pandas as pd
import numpy as np
import requests
import xmltodict
import shutil
import datetime
import boto3


from shapely.ops import transform
from shapely.geometry import Point
from shapely.geometry import Polygon
from pystac_client import Client 
from collections import defaultdict
from glob import glob
from rasterio.enums import Resampling
from rasterio import Affine
from rasterio.crs import CRS
import matplotlib.pyplot as plt
from subprocess import Popen, PIPE
from tqdm import tqdm
from netrc import netrc
from subprocess import Popen
from platform import system
from getpass import getpass
from rasterio.session import AWSSession
from pathlib import Path

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline

In [15]:
##### START OPTIONS #####
cloud_thres = 5

root_path = "/data/"

## file paths
# spath = root_path + f"CDL_HLS_dataframe{yoi[0]}"
# image_index_file = root_path + f"image_index{yoi[0]}"
chip_file =  root_path + "chip_bbox.geojson"
chip_csv = root_path + "chip_tracker.csv"
kml_file = root_path + 'sentinel_tile_grid.kml'
cdl_reclass_csv = root_path + "cdl_freq.csv"
tile_tracker_csv = root_path + "tile_tracker.csv"

## folder paths
chip_dir = root_path + 'chips/'
tif_dir = root_path + 'tif/'
# chip_dir_binary = root_path + 'chips_binary/'
chip_dir_multi = root_path + 'chips_multi/'

chip_dir_filt = root_path + 'chips_filtered/'
# chip_dir_binary_filt = root_path + 'chips_binary_filtered/'
chip_dir_multi_filt = root_path + 'chips_multi_filtered/'

chip_fmask_dir = chip_dir + 'chips_fmask/'

#####  END OPTIONS  #####

In [16]:
with open(chip_file, "r") as file:
    chips = json.load(file)
    
chip_ids = []
chip_x = []
chip_y = []
for item in chips['features']:
    #print(item)
    chip_ids.append(item['properties']['id'])
    chip_x.append(item['properties']['center'][0])
    chip_y.append(item['properties']['center'][1])

In [17]:
with open("/cdl_training_data/data/chip_ids", "w") as f:
    json.dump(chip_ids, f, indent=2)

In [6]:
# Load the HLS tiles and place there coordinates into a numpy array for processing later
fiona.drvsupport.supported_drivers['KML'] = 'rw'
tile_src = geopandas.read_file(kml_file, driver='KML')
tile_name = []
tile_x = []
tile_y = []
for tile_ind in range(tile_src.shape[0]):
    tile_name.append(tile_src.iloc[tile_ind].Name)
    tile_x.append(tile_src.iloc[tile_ind].geometry.centroid.x)
    tile_y.append(tile_src.iloc[tile_ind].geometry.centroid.y)
tile_name = np.array(tile_name)
tile_x = np.array(tile_x)
tile_y = np.array(tile_y)
tile_src = pd.concat([tile_src, tile_src.bounds], axis = 1)
tile_src.head(5)

Unnamed: 0,Name,Description,geometry,minx,miny,maxx,maxy
0,01CCV,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -7...,-180.0,-73.064633,180.0,-72.012478
1,01CDH,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-83.835334,180.0,-82.79672
2,01CDJ,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-82.939452,180.0,-81.906947
3,01CDK,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-82.044055,180.0,-81.016439
4,01CDL,TILE PROPERTIES<br><table border=0 cellpadding...,GEOMETRYCOLLECTION Z (POLYGON Z ((180.00000 -8...,-180.0,-81.14807,180.0,-80.124456


In [8]:
def find_tile(x,y):
# Identify closest tile
    s = (tile_x - x)**2+(tile_y - y)**2
    tname = tile_name[np.argmin(s)]
    return(tname)

In [9]:
chip_df = pd.DataFrame({"chip_id" : chip_ids, "chip_x" : chip_x, "chip_y" : chip_y})
chip_df['tile'] = chip_df.apply(lambda row : find_tile(row['chip_x'], row['chip_y']), axis = 1)
chip_df.tail(5)

Unnamed: 0,chip_id,chip_x,chip_y,tile
4995,chip_090_484,-84.446559,45.575077,16TFR
4996,chip_177_428,-89.75122,40.674821,16TBL
4997,chip_106_429,-89.268425,44.931751,16TCQ
4998,chip_312_160,-109.614541,31.902541,12SXA
4999,chip_198_312,-99.007068,39.532747,14SMJ


In [10]:
chip_df.to_csv(root_path + "chip_df.csv", index=False)

In [11]:
tiles = chip_df.tile.unique().tolist()
tiles[0:5]

['13TDE', '16SDD', '13SFV', '14TNS', '14UMU']

In [12]:
STAC_URL = 'https://cmr.earthdata.nasa.gov/stac'
catalog = Client.open(f'{STAC_URL}/LPCLOUD/')

In [19]:
# Original 5 percentage query
tile_list = []
print(f"There are a total of {len(tiles)} tiles")
tile_iter = 0
for current_tile in tiles:

    chip_df_filt = chip_df.loc[chip_df.tile == current_tile]#.reset_index()
    first_chip_id = chip_df_filt.chip_id.iloc[0]
    first_chip_index_in_json = chip_ids.index(first_chip_id)
    roi = chips['features'][first_chip_index_in_json]['geometry']

    search = catalog.search(
        collections = ['HLSS30.v2.0'],
        intersects = roi,
        datetime = '2022-03-01/2022-09-30',
    ) 
    
    num_results = search.matched()
    item_collection = search.get_all_items()
    
    tile_name = "T" + current_tile
    iter_items = 0
    for i in tqdm(item_collection ,desc=f"({tile_iter}/{len(tiles)})"):
        if i.id.split('.')[2] == tile_name:
            if i.properties['eo:cloud_cover'] <= cloud_thres:
                response = requests.get(i.assets['metadata'].href)
                if response.status_code == 200:
                    temp_xml = response.text
                    temp_xml = xmltodict.parse(temp_xml)
                    temp_dict = {"tile_id": tile_name, "cloud_cover": i.properties['eo:cloud_cover'],
                                 "date": datetime.datetime.strptime(i.properties['datetime'].split('T')[0], "%Y-%m-%d"), 
                                 "spatial_cover": int(temp_xml['Granule']['AdditionalAttributes']['AdditionalAttribute'][3]['Values']['Value']),
                                 "http_links": {"B02": i.assets['B02'].href, "B03": i.assets['B03'].href, "B04": i.assets['B04'].href,  "B8A": i.assets['B8A'].href,
                                                "B11": i.assets['B11'].href, "B12": i.assets['B12'].href, "Fmask": i.assets['Fmask']},
                                "s3_links": {"B02": i.assets['B02'].href.replace('https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/', 's3:/'), 
                                             "B03": i.assets['B03'].href.replace('https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/', 's3:/'), 
                                             "B04": i.assets['B04'].href.replace('https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/', 's3:/'), 
                                             "B8A": i.assets['B8A'].href.replace('https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/', 's3:/'),
                                             "B11": i.assets['B11'].href.replace('https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/', 's3:/'),
                                             "B12": i.assets['B12'].href.replace('https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/', 's3:/'),
                                             "Fmask": i.assets['Fmask'].href.replace('https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/', 's3:/')}}
                    tile_list.append(temp_dict)
                    iter_items += 1
                else: 
                    assert False, f"Failed to fetch XML from {i.assets['metadata'].href}. Error code: {response.status_code}"
            
    tile_iter += 1
    #print(f"Information for tile {tile_name} is collected, a total of {iter_items} out of {num_results} tiles pass the filter ({tile_iter}/{len(tiles)})")

    
tile_df = pd.DataFrame(tile_list)

There are a total of 491 tiles


(0/491): 100%|██████████████████████████████████████████████████████████████████████████| 81/81 [00:04<00:00, 19.56it/s]
(1/491): 100%|██████████████████████████████████████████████████████████████████████████| 36/36 [00:01<00:00, 31.32it/s]
(2/491): 100%|██████████████████████████████████████████████████████████████████████████| 85/85 [00:08<00:00, 10.60it/s]
(3/491): 100%|██████████████████████████████████████████████████████████████████████████| 70/70 [00:09<00:00,  7.49it/s]
(4/491): 100%|████████████████████████████████████████████████████████████████████████| 130/130 [00:16<00:00,  7.79it/s]
(5/491): 100%|████████████████████████████████████████████████████████████████████████| 131/131 [00:08<00:00, 14.88it/s]
(6/491): 100%|██████████████████████████████████████████████████████████████████████████| 77/77 [00:03<00:00, 24.93it/s]
(7/491): 100%|████████████████████████████████████████████████████████████████████████| 124/124 [00:03<00:00, 36.29it/s]
(8/491): 100%|██████████████████

In [96]:
tile_df.to_csv(root_path + "tile_df.csv", index=False)

In [86]:
tile_df = pd.read_csv("/data/tile_df.csv")

In [None]:
tile_df

In [88]:
tile_df[tile_df.tile_id == "T14SMF"]

Unnamed: 0,tile_id,cloud_cover,date,spatial_cover,links,month
0,T14SMF,0,2021-03-03,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,3
1,T14SMF,0,2021-03-28,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,3
2,T14SMF,5,2021-05-07,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,5
3,T14SMF,0,2021-06-16,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,6
4,T14SMF,0,2021-07-31,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,7
5,T14SMF,1,2021-08-10,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,8
6,T14SMF,0,2021-08-25,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,8
7,T14SMF,0,2021-09-09,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,9
8,T14SMF,0,2021-09-19,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,9
9,T14SMF,0,2021-09-29,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,9


In [89]:
len(np.unique(tile_df.tile_id))

42

In [90]:
def spatial_filtering (dataframe):
    """
        Using spatial coverage percentage to filter chips

        Args:
            dataframe: A pandas dataframe that generated previously
    """
    cover_list = [100, 90, 80, 70, 60, 50]
    tile_list_ft = []
    tile_list = dataframe.tile_id.unique().tolist()
    
    for tile in tqdm(tile_list):
        temp_df = dataframe[dataframe.tile_id == tile]
        for cover_pct in cover_list:
            
            temp_df_filtered = temp_df[temp_df.spatial_cover >= cover_pct]
            if len(temp_df_filtered) >= 3:
                for i in range(len(temp_df_filtered)):
                    tile_list_ft.append(temp_df_filtered.iloc[i])
                break
    
    tile_df_filtered = pd.DataFrame(tile_list_ft)
    return tile_df_filtered

In [91]:
# check_spatial_cover(tile_df, 90)
cover_df = spatial_filtering(tile_df)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 629.48it/s]


In [92]:
def select_scenes(dataframe):
    select_tiles = []
    tile_list = dataframe.tile_id.unique().tolist()

    for tile in tqdm(tile_list):
        temp_df = dataframe[dataframe.tile_id == tile].sort_values('date').reset_index(drop=True)
        select_tiles.extend([temp_df.iloc[0], temp_df.iloc[len(temp_df) // 2], temp_df.iloc[-1]])

    return pd.DataFrame(select_tiles).reset_index(drop=True)

In [93]:
selected_tiles = select_scenes(cover_df)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 848.01it/s]


In [94]:
selected_tiles.head()

Unnamed: 0,tile_id,cloud_cover,date,spatial_cover,links,month
0,T14SMF,0,2021-03-03,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,3
1,T14SMF,1,2021-08-10,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,8
2,T14SMF,0,2021-09-29,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,9
3,T14SME,0,2021-03-03,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,3
4,T14SME,0,2021-08-10,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,8


In [95]:
selected_tiles[selected_tiles.tile_id == "T14SME"]

Unnamed: 0,tile_id,cloud_cover,date,spatial_cover,links,month
3,T14SME,0,2021-03-03,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,3
4,T14SME,0,2021-08-10,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,8
5,T14SME,0,2021-09-19,100,{'B02': 's3://lp-prod-protected/HLSS30.020/HLS...,9


In [63]:
selected_tiles.iloc[0].s3_links['Fmask'].split("/")

AttributeError: 'Series' object has no attribute 's3_links'