# GBOV data processing

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import geojson
import time
from shapely.geometry import Polygon
from sentinelsat import SentinelAPI, read_geojson, geojson_to_wkt
from datetime import date, timedelta, datetime
from tqdm.auto import tqdm

tqdm.pandas()

  from pandas import Panel


In [2]:
def compute_LAI(x):
    """
    Computes LAI based on upwards and downwards measurements and filters on quality flags.
    For some sites (i.e. Woodworth), only upwards measurements are available, these are still valid measurements.
    """
        
    # Filter out measurements with high impact issues (flag>=16)
    # See page 22 of https://gbov.acri.fr/public/docs/products/2019-11/GBOV-ATBD-RM4-RM6-RM7_v2.0-Vegetation.pdf
    if x["up_flag"] >=16 or x["down_flag"] >=16:
        x["LAI_Miller"] = np.nan
        x["LAI_Warren"] = np.nan
    
    # LAI_Miller
    if x["LAI_Miller_up"] >= 0 and x["LAI_Miller_down"] >=0:
        x["LAI_Miller"] = x["LAI_Miller_up"] + x["LAI_Miller_down"]
    elif x["LAI_Miller_up"] < 0:
        x["LAI_Miller"] = x["LAI_Miller_down"]  
    elif x["LAI_Miller_down"] < 0:
        x["LAI_Miller"] = x["LAI_Miller_up"]
    else:
        x["LAI_Miller"] = np.nan
    
    # LAI_Warren
    if x["LAI_Warren_up"] >= 0 and x["LAI_Warren_down"] >=0:
        x["LAI_Warren"] = x["LAI_Warren_up"] + x["LAI_Warren_down"]
    elif x["LAI_Warren_up"] < 0:
        x["LAI_Warren"] = x["LAI_Warren_down"]
    elif x["LAI_Warren_down"] < 0:
        x["LAI_Warren"] = x["LAI_Warren_up"]
    else:
        x["LAI_Warren"] = np.nan
    return x

In [3]:
def get_tiles(df, path, geojson_file, margin=0.001):
    aoi_path = os.path.join(path, str(df["Site"]+"_"+geojson_file))
    
    # Create mini polygon (overlap does not work for points)
    lat = df["Lon_IS"]
    lon = df["Lat_IS"]
    lat_list = [lat-margin, lat+margin, lat+margin, lat-margin]
    lon_list = [lon-margin, lon-margin, lon+margin, lon+margin]

    # Create Polygon object
    features = []
    polygon = geojson.Polygon([list(zip(lon_list, lat_list))])

    features = [geojson.Feature(geometry=polygon,
                            properties={})]

    # Store polygon as geojson file
    with open(aoi_path, 'w', encoding='utf8') as fp:
        geojson.dump(geojson.FeatureCollection(features), fp, sort_keys=True, ensure_ascii=False)

        
    # Get tiles overlapping with polygon
    print(polygon)
    overlap = Sentinel2Overlap(aoi_path)
        
    tiles = overlap.overlap(limit=0.00001)

    return tiles

def get_all_tiles(df, path, geojson_file):
    #Groupby site
    temp_df = df[["Site", "Lat_IS", "Lon_IS"]].groupby("Site").max().reset_index()
    site_tile_map = {}
    for i, row in tqdm(temp_df.iterrows()):
        print(row)
        tiles = get_tiles(row, path, geojson_file)
        print(tiles)
        site_tile_map[row["Site"]] = str(tiles)
    
    df["tiles"] = df.apply(lambda x: site_tile_map[x["Site"]], axis=1)
    return df
        

def aggregate_files_into_df(path):
    """
    Combine all individual .csv files into one pandas Dataframe
    """
    
    # Select all csv files
    csv_files = [f for f in os.listdir(path) if f.endswith(".csv")]
        
    # For each file:
    df = []
    for csv_file in tqdm(csv_files, desc="Creating dataframe from .csv files"):
        # Read data and append to list
        filename = os.path.join(path, csv_file)
        temp_df = pd.read_csv(filename, index_col=None, header=0, sep=";")
#         temp_df["tiles"] = str(get_tiles(temp_df.iloc[0], path, "temp.geojson"))
        df.append(temp_df)

    # Convert list of dataframes to dataframe
    df = pd.concat(df, axis=0, ignore_index=True)
    
        
    #Add LAI columns and filter on quality
    df = df.apply(compute_LAI, axis=1)
    
    #Add datetime
    df["datetime"] = pd.to_datetime(df["TIME_IS"])
    

    
    return df

In [4]:
from xml.etree import ElementTree

import keytree
from shapely.geometry import Point, shape

In [5]:
def get_tiles_from_df(row, kmlns, elems):
    # Here's our point of interest
    p = Point(row["Lon_IS"], row["Lat_IS"])

    # Filter polygon elements using this lambda (anonymous function)
    # keytree.geometry() makes a GeoJSON-like geometry object from an
    # element and shape() makes a Shapely object of that.
    hits = filter(
        lambda e: shape(keytree.geometry(e.find(".//{%s}Polygon" % kmlns))).contains(p),
        elems )
    # hits is a list of lists of polygon objects
    
    tiles = []
    for hit in hits:
        tiles.append((hit.find("{%s}name" % kmlns).text))

    return str(tiles)

In [15]:
def find_tiles(df):
    # Parse KML file
    with open('data/S2A_grid.kml', 'r') as file:
        data = file.read()
    
    tree = ElementTree.fromstring(data)
    
    kmlns = tree.tag.split('}')[0][1:]

    # Find all Polygon elements anywhere in the doc
    elems = tree.findall(".//{%s}Placemark" % kmlns)
    
    # Group df by site
    temp_df = df[["Site", "Lat_IS", "Lon_IS"]].groupby("Site").max()
    
    # For each site, get tile from KML
    
    temp_df["tiles"] = temp_df.progress_apply(get_tiles_from_df, args=(kmlns, elems), axis=1)
    
    # Join dataframes, create column tiles in original df
    df = df.join(temp_df.drop(columns=["Lon_IS", "Lat_IS"]), on="Site", how="left")
    
    # Create date_start, date_end and first_tile columns
    df["date"] = df["datetime"].apply(lambda x: x.date())
    df["date_start"] = df["datetime"].apply(lambda x: x.date() - timedelta(days=5))
    df["date_end"] = df["datetime"].apply(lambda x: x.date() + timedelta(days=5))
#     df["first_tile"] = df["tiles"].apply(lambda x:x.split(",")[0][2:-1])
    df["first_tile"] = df["tiles"].apply(lambda x: ''.join(c for c in x.split(",")[0] if c not in '\'[]'))
    df["second_tile"] = df["tiles"].apply(lambda x: ''.join(c for c in x.split(",")[1] if c not in '\'[]') if len(x.split(","))>1 else None)
    lambda x: True if x % 2 == 0 else False

    return df


In [16]:
def find_sentinel_files(df, margin=0.0001):

    api = SentinelAPI('vneuteboom', 'asfdfaqee43trgefsagadf', 'https://scihub.copernicus.eu/dhus')

    temp_df = df[["Site", "Lat_IS", "Lon_IS"]].groupby("Site").max()
    file_list = []
    for i, row in temp_df.iterrows():
        # Create mini polygon (overlap does not work for points)
        lat = row["Lon_IS"]
        lon = row["Lat_IS"]
        aoi = "{}, {}".format(lon, lat)
        products = api.query(aoi,
                             date=('20151219', date(2015, 12, 29)),
                             platformname='Sentinel-2',
                             cloudcoverpercentage=(0, 30))
        
        file_list.append(products)
        time.sleep(1)
    return file_list

In [2]:
def get_plot_coordinates(df, shapefile="data/shapefiles/NEON_TOS_Plot_Centroids.shp"):
    # Read NEON plot shapefile
    # From: https://www.arcgis.com/home/item.html?id=73e3e0b777d344eca88573ccd21b19e9
    df_shp = gpd.read_file(shapefile)
    
    # Filter on plotID where DHP was used
    df_shp = df_shp[df_shp['appMods'].str.contains('dhp', na = False)] 
    
    # Merge with df
    cols = ["plotID", "longitude", "latitude"]
    df = df.merge(df_shp[cols], how="inner", left_on="PLOT_ID", right_on="plotID")
    return df
    

In [1]:
def main():
    path = "data/GBOV_RM07"
    
    # Create in situ dataset
    df = aggregate_files_into_df(path)
    df = find_tiles(df)
    df = get_plot_coordinates(df)   
    
    return df

In [18]:
df = main()

HBox(children=(FloatProgress(value=0.0, description='Creating dataframe from .csv files', max=460.0, style=Pro…




HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))




In [19]:
df

Unnamed: 0,GBOV_ID,Site,GROUND_DATA_PI,GROUND_DATA_PIs_Email,GBOV_Email,Network,Elevation,IGBP_class,Lat_IS,Lon_IS,...,down_flag,LAI_Miller,LAI_Warren,datetime,tiles,date,date_start,date_end,first_tile,second_tile
0,GBOV_RM7_958,BartlettExperimentalForest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,232,Mixed Forest,44.063901,-71.287308,...,0,6.602,5.293,2016-08-29 00:00:00+00:00,"['18TYP', '19TCJ']",2016-08-29,2016-08-24,2016-09-03,18TYP,19TCJ
1,GBOV_RM7_979,BartlettExperimentalForest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,232,Mixed Forest,44.063901,-71.287308,...,8,6.832,5.371,2016-08-24 00:00:00+00:00,"['18TYP', '19TCJ']",2016-08-24,2016-08-19,2016-08-29,18TYP,19TCJ
2,GBOV_RM7_910,BartlettExperimentalForest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,232,Mixed Forest,44.063901,-71.287308,...,0,6.473,5.336,2016-08-23 00:00:00+00:00,"['18TYP', '19TCJ']",2016-08-23,2016-08-18,2016-08-28,18TYP,19TCJ
3,GBOV_RM7_934,BartlettExperimentalForest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,232,Mixed Forest,44.063901,-71.287308,...,0,6.252,5.290,2016-08-29 00:00:00+00:00,"['18TYP', '19TCJ']",2016-08-29,2016-08-24,2016-09-03,18TYP,19TCJ
4,GBOV_RM7_973,BartlettExperimentalForest,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,232,Mixed Forest,44.063901,-71.287308,...,8,6.550,5.410,2016-08-18 00:00:00+00:00,"['18TYP', '19TCJ']",2016-08-18,2016-08-13,2016-08-23,18TYP,19TCJ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6031,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.782,0.529,2019-08-20 13:22:00+00:00,['14TMT'],2019-08-20,2019-08-15,2019-08-25,14TMT,
6032,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.396,0.311,2019-09-04 14:11:00+00:00,['14TMT'],2019-09-04,2019-08-30,2019-09-09,14TMT,
6033,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.530,0.383,2019-09-18 10:29:00+00:00,['14TMT'],2019-09-18,2019-09-13,2019-09-23,14TMT,
6034,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.857,0.650,2019-10-04 09:01:00+00:00,['14TMT'],2019-10-04,2019-09-29,2019-10-09,14TMT,


In [20]:
df.loc['2017-03-28':]

Unnamed: 0,GBOV_ID,Site,GROUND_DATA_PI,GROUND_DATA_PIs_Email,GBOV_Email,Network,Elevation,IGBP_class,Lat_IS,Lon_IS,...,down_flag,LAI_Miller,LAI_Warren,datetime,tiles,date,date_start,date_end,first_tile,second_tile
2018,GBOV_RM7_1566,JonesEcologicalResearchCenter,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,44,Croplands,31.194839,-84.468777,...,0,4.780,3.950,2019-08-26 17:43:00+00:00,['16RGV'],2019-08-26,2019-08-21,2019-08-31,16RGV,
2019,GBOV_RM7_1563,JonesEcologicalResearchCenter,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,44,Croplands,31.194839,-84.468777,...,0,4.110,2.650,2019-08-20 13:28:00+00:00,['16RGV'],2019-08-20,2019-08-15,2019-08-25,16RGV,
2020,GBOV_RM7_1542,JonesEcologicalResearchCenter,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,44,Croplands,31.194839,-84.468777,...,0,2.700,2.280,2019-08-20 07:06:00+00:00,['16RGV'],2019-08-20,2019-08-15,2019-08-25,16RGV,
2021,GBOV_RM7_1521,JonesEcologicalResearchCenter,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,44,Croplands,31.194839,-84.468777,...,0,2.100,1.880,2019-08-20 19:19:00+00:00,['16RGV'],2019-08-20,2019-08-15,2019-08-25,16RGV,
2022,GBOV_RM7_1572,JonesEcologicalResearchCenter,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,44,Croplands,31.194839,-84.468777,...,0,2.430,2.330,2019-08-26 18:00:00+00:00,['16RGV'],2019-08-26,2019-08-21,2019-08-31,16RGV,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6031,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.782,0.529,2019-08-20 13:22:00+00:00,['14TMT'],2019-08-20,2019-08-15,2019-08-25,14TMT,
6032,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.396,0.311,2019-09-04 14:11:00+00:00,['14TMT'],2019-09-04,2019-08-30,2019-09-09,14TMT,
6033,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.530,0.383,2019-09-18 10:29:00+00:00,['14TMT'],2019-09-18,2019-09-13,2019-09-23,14TMT,
6034,GBOV_RM7_808,Woodworth,Courtney Meier,cmeier@battelleecology.org,support-copernicus-gbov@acri-st.fr,NEON,579,Croplands,47.128231,-99.241364,...,0,0.857,0.650,2019-10-04 09:01:00+00:00,['14TMT'],2019-10-04,2019-09-29,2019-10-09,14TMT,


In [21]:
df.loc['2017-03-28':].to_csv("GBOV_LAI_RM_2017-now.csv")

In [22]:
unique_queries = df.groupby(["date_start", "date_end", "first_tile"])
pd.DataFrame(unique_queries.groups.keys(), columns=["date_start", "date_end", "first_tile"])

Unnamed: 0,date_start,date_end,first_tile
0,2013-02-06,2013-02-16,17RMM
1,2013-04-12,2013-04-22,17RLN
2,2013-04-16,2013-04-26,17RMM
3,2013-04-17,2013-04-27,17RMM
4,2013-04-23,2013-05-03,17RLN
...,...,...,...
2117,2019-11-06,2019-11-16,19QGV
2118,2019-11-09,2019-11-19,19QFA
2119,2019-11-15,2019-11-25,17RLN
2120,2019-11-20,2019-11-30,19QGV
