In [None]:
!pip install requests
!pip install xmltodict

In [147]:
import geopandas
import json
import shapely
import shapely.geometry
import xarray
import rasterio as rio
import rioxarray
import os
import fiona
import nasa_hls
import urllib.request as urlreq
import pandas as pd
import numpy as np
import requests
import xmltodict
import shutil
import datetime

from shapely.ops import transform
from shapely.geometry import Point
from shapely.geometry import Polygon
from pystac_client import Client 
from collections import defaultdict
from glob import glob
from rasterio.enums import Resampling
from rasterio import Affine
from rasterio.crs import CRS
import matplotlib.pyplot as plt
from subprocess import Popen, PIPE
from tqdm import tqdm
# Disable pandas' display truncation: None removes the column and row limits,
# so any DataFrame shown in this notebook is rendered in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline


In [None]:
##### START OPTIONS #####
# Year(s) of interest; the first entry is embedded in the file names below.
yoi = [2021]
#toi = ['15STT']
# NOTE(review): presumably the maximum cloud cover (percent) and the calendar
# months to keep -- confirm against the cells that consume them.
cloud_thres = 5
valid_months = [3, 4, 5, 6, 7, 8, 9]
# Every path below is built by appending to this root (note the trailing slash).
root_path = "/data/"

## file paths
spath = f"{root_path}CDL_HLS_dataframe{yoi[0]}"
image_index_file = f"{root_path}image_index{yoi[0]}"
chip_file = f"{root_path}chip_bbox.geojson"
chip_csv = f"{root_path}chip_tracker.csv"
kml_file = f"{root_path}sentinel_tile_grid.kml"
cdl_reclass_csv = f"{root_path}cdl_freq.csv"
tile_tracker_csv = f"{root_path}tile_tracker.csv"

## folder paths
chip_dir = f"{root_path}chips/"
tif_dir = f"{root_path}tif/"
chip_dir_binary = f"{root_path}chips_binary/"
chip_dir_multi = f"{root_path}chips_multi/"

chip_dir_filt = f"{root_path}chips_filtered/"
chip_dir_binary_filt = f"{root_path}chips_binary_filtered/"
chip_dir_multi_filt = f"{root_path}chips_multi_filtered/"

chip_qa_dir = f"{root_path}chips_qa/"

#####  END OPTIONS  #####

make folders if needed

In [None]:
# Create the folders this notebook writes into. exist_ok=True makes the cell
# safe to re-run: an already-existing directory is simply left as it is.
dirs_to_make = [chip_dir, chip_dir_binary, chip_qa_dir]
for folder in dirs_to_make:
    os.makedirs(folder, exist_ok=True)

0 determine HLS tiles

In [None]:
# Read the chip GeoJSON and pull each feature's id and centre coordinates into
# three parallel lists (same order as chips['features']).
# NOTE(review): this path is hard-coded and is not the `chip_file` value from the
# options cell -- confirm which location is the intended one.
with open("/cdl_training_data/data/chip_bbox.geojson", "r") as file:
    chips = json.load(file)

chip_ids = [feature['properties']['id'] for feature in chips['features']]
chip_x = [feature['properties']['center'][0] for feature in chips['features']]
chip_y = [feature['properties']['center'][1] for feature in chips['features']]

In [None]:
# chips['features'][0]['geometry']['coordinates'][0] = chips['features'][0]['geometry']['coordinates'][0][::-1]   check with Hamed

In [None]:
# Load the HLS tiles and place their names and centroid coordinates into numpy
# arrays for processing later

# Register the KML driver with fiona (read/write) before reading the grid file.
fiona.drvsupport.supported_drivers['KML'] = 'rw'
tile_src = geopandas.read_file(kml_file, driver='KML')

# One centroid per tile, in row order.
tile_centroids = [geom.centroid for geom in tile_src.geometry]
tile_name = np.array(list(tile_src['Name']))
tile_x = np.array([centroid.x for centroid in tile_centroids])
tile_y = np.array([centroid.y for centroid in tile_centroids])

# Append the columns of tile_src.bounds alongside the original columns.
tile_src = pd.concat([tile_src, tile_src.bounds], axis = 1)
tile_src.head(5)

In [None]:
def find_tile(x,y):
    """Return the name of the tile whose centroid lies closest to (x, y).

    Reads the module-level arrays ``tile_x``, ``tile_y`` and ``tile_name``.
    Closeness is the squared Euclidean distance computed directly on the
    coordinate values; on a tie the first tile in the arrays is returned.
    """
    dx = tile_x - x
    dy = tile_y - y
    nearest = np.argmin(dx**2 + dy**2)
    return tile_name[nearest]

initialize chip tracker csv

In [None]:
# One row per chip: its id, its centre coordinates, and the tile that
# find_tile() returns for that centre.
chip_df = pd.DataFrame({"chip_id" : chip_ids, "chip_x" : chip_x, "chip_y" : chip_y})
chip_df['tile'] = [
    find_tile(center_x, center_y)
    for center_x, center_y in zip(chip_df['chip_x'], chip_df['chip_y'])
]
chip_df.tail(5)

In [None]:
## write to csv
# Only written when no tracker file is present yet, so an existing CSV is never
# overwritten. os.path.exists tests the literal path, whereas glob() would
# interpret any '*', '?' or '[' in it as a wildcard pattern.
if os.path.exists(chip_csv):
    print('file exists')
else:
    chip_df.to_csv(chip_csv, index=False)

In [None]:
# Distinct tile names that were assigned to at least one chip.
tiles = chip_df['tile'].unique().tolist()
tiles[:5]

In [None]:
# Peek at the chips that were assigned to tile 01SBU.
chip_df.loc[chip_df['tile'] == '01SBU'].head(5)

In [None]:
# Drop tile 01SBU from the work list.
# NOTE(review): the reason for excluding this tile is not recorded here -- confirm.
# Filtering (instead of list.remove) keeps the cell idempotent: re-running it, or
# running it on a chip set that never hits 01SBU, no longer raises ValueError.
tiles = [tile for tile in tiles if tile != '01SBU']
tiles

In [None]:
# Tracker table with one row per tile and one boolean column per flag,
# all initialised to False.
tile_tracker = pd.DataFrame({"tile":tiles}).assign(
    exclude=False,
    tif_download=False,
    tif_reproject=False,
    chip=False,
    filter_chips=False,
)

In [None]:
# Open the LPCLOUD catalog under the CMR STAC endpoint.
STAC_URL = 'https://cmr.earthdata.nasa.gov/stac'
catalog = Client.open(STAC_URL + '/LPCLOUD/')

In [None]:
# For every tile, search the STAC catalog with the geometry of the tile's first
# chip, keep the returned items whose id names this tile and whose cloud cover
# is within the threshold, and record their metadata and band links.

HLS_HTTPS_PREFIX = 'https://data.lpdaac.earthdatacloud.nasa.gov/'
HLS_ASSET_KEYS = ["B02", "B03", "B04", "B8A", "Fmask"]
# Seconds to wait on a metadata request before giving up instead of hanging forever.
METADATA_TIMEOUT_S = 60

# Search window taken from the options cell (first to last month of interest in
# the first year of interest) rather than being hard-coded.
search_window = f"{yoi[0]}-{min(valid_months):02d}/{yoi[0]}-{max(valid_months):02d}"

tile_list = []
print(f"There are a total of {len(tiles)} tiles")
for tile_iter, current_tile in enumerate(tiles, start=1):
    ## find single chip for current tile
    chip_df_filt = chip_df.loc[chip_df.tile == current_tile]
    first_chip_id = chip_df_filt.chip_id.iloc[0]
    first_chip_index_in_json = chip_ids.index(first_chip_id)
    roi = chips['features'][first_chip_index_in_json]['geometry']

    search = catalog.search(
        collections = ['HLSS30.v2.0'],
        intersects = roi,
        datetime = search_window,
    )
    num_results = search.matched()
    # NOTE(review): newer pystac-client releases appear to deprecate
    # get_all_items() in favour of item_collection() -- confirm against the
    # installed version before switching.
    item_collection = search.get_all_items()

    # Deliberately NOT called `tile_name`: that global is the array of tile
    # names that find_tile() indexes, and rebinding it here would break any
    # later call to find_tile().
    hls_tile_id = "T" + current_tile
    iter_items = 0
    print(f"Gathering information for tile {hls_tile_id}")
    print(f"There are a total of {num_results} matching tiles for chip_{first_chip_id}'s geometry")

    for item in tqdm(item_collection):
        # The search is by geometry, so skip items whose id names another tile.
        if item.id.split('.')[2] != hls_tile_id:
            continue
        cloud_cover = item.properties['eo:cloud_cover']
        # Written as "not <=" so a NaN cloud cover is rejected.
        if not (cloud_cover <= cloud_thres):
            continue

        metadata_url = item.assets['metadata'].href
        response = requests.get(metadata_url, timeout=METADATA_TIMEOUT_S)
        if response.status_code != 200:
            # An explicit exception: an assert would be stripped under `python -O`.
            raise RuntimeError(f"Failed to fetch XML from {metadata_url}. Error code: {response.status_code}")

        temp_xml = xmltodict.parse(response.text)
        tile_list.append({
            "tile_id": hls_tile_id,
            "cloud_cover": cloud_cover,
            "date": datetime.datetime.strptime(item.properties['datetime'].split('T')[0], "%Y-%m-%d"),
            # NOTE(review): relies on the 4th AdditionalAttribute being the spatial
            # coverage value -- confirm, or select the attribute by its name.
            "spatial_cover": temp_xml['Granule']['AdditionalAttributes']['AdditionalAttribute'][3]['Values']['Value'],
            # Rewrite the HTTPS asset URLs to their s3:// form.
            "links": {key: item.assets[key].href.replace(HLS_HTTPS_PREFIX, 's3://')
                      for key in HLS_ASSET_KEYS},
        })
        iter_items += 1

    print(f"Information for tile {hls_tile_id} is collected, a total of {iter_items} out of {num_results} tiles pass the filter")
    print(f"{tile_iter}/{len(tiles)}")
tile_df = pd.DataFrame(tile_list)

In [None]:
# Display the table built from the collected tile_list records.
tile_df