## Prepare samples using GEE

### Setup
For each site, extract the full time series of Sentinel-1 and NDVI data from GEE. Note: if the output CSV files already exist, they are assumed to be correct and are not overwritten.

Note: A proxy was set because of the well-known network restrictions in China; you may not need it. Also check the proxy settings in utils.py.

In [7]:
import os
import pandas as pd
import numpy as np
import time
import ee
import utils_data_pre

In [7]:
# ee.Authenticate(force = True) # authenticate the gee account
# ee.Initialize()

Set the parameters, paths etc.

In [12]:
# Root directory of the region whose data is being prepared.
HOME_DATA_DIR = '/mnt/data2tb/Transfer-DenseSM-E_pack/1km_global_data/india'

# Date range for the Sentinel-1 data.
START_DATE = "2021-01-01"  # alternative: "2016-01-01"
END_DATE = "2022-12-31"  # alternative: "2023-11-30"

# Date range for the NDVI and weather data; it starts one year before the
# Sentinel-1 range.
# Nominally 2020-01-01, but widened by a few days so that no data is missed.
START_DATE_NDVI = "2019-12-26"
# Nominally 2022-12-21, but widened by a few days so that no data is missed.
# NOTE(review): "2022-12-21" may be a typo for 2022-12-31 (END_DATE) - confirm.
END_DATE_NDVI = "2023-01-08"

# Global setup: flags, directories and paths.
save_to_disk = False  # no temporary files

# CSV file with the site information.
SM_SITES = os.path.join(HOME_DATA_DIR, "points/tree_grass_crops_site_info.csv")
# Directory with the soil moisture records of the stations.
dir_to_site_sm = os.path.join(HOME_DATA_DIR, "tree_grass_crops_csv_filtered")
# Directory receiving the output samples; created if it does not exist yet.
dir_to_site_samples = os.path.join(HOME_DATA_DIR, "deletethis")
os.makedirs(dir_to_site_samples, exist_ok=True)

Read the sites information and determine the grid size

Note: Remember to change the value of grid_size.

In [13]:
# Grid cell size in km: use 1.0 for a 1 km resolution and 0.1 for a 100 m
# resolution.
grid_size = 1.0

# Load the site table (including lat, lon and the site name), asking pandas
# for its high-precision float converter.
sites = pd.read_csv(SM_SITES, float_precision="high")

# Grid helper built from the code 4326 and the chosen cell size.
# NOTE(review): 4326 is presumably the EPSG code of WGS84 - confirm in
# utils_data_pre.
pobj = utils_data_pre.grids_4_a_region(4326, grid_size)

## A loop to prepare the input data of each site

##### 1 Create the grid polygon covering a site in both EASE2.0 and WGS84
##### 2 Extract Sentinel-1, soil texture, terrain, NDVI, precipitation, temperature etc. Check the utils for the details
##### 3 Concatenate all data
##### 4 Extract the surface soil moisture of the site

#### Note: 
* 'CHINA_100m': 100m dataset from Planet for the China's region.
* 'CHINA_1km' : 1km dataset from NSIDC for the China's region.
* 'INDIA_100m': 100m dataset from Planet for the India's region. 
* 'INDIA_1km' : 1km dataset from NSIDC for the India's region.
* 'VN'        : 1km dataset from NSIDC for Vietnam.

In [14]:
sites.columns
# Keep only the rows of the target network and renumber them from 0.
is_target_network = sites['network'] == 'INDIA_1km'
filtered_sites = sites[is_target_network].reset_index(drop=True)
filtered_sites

Unnamed: 0,network,station,lat,lon,s_depth,e_depth
0,INDIA_1km,1,28.068695,77.117534,0,5
1,INDIA_1km,2,28.056521,77.117582,0,5
2,INDIA_1km,3,28.034862,77.122390,0,5
3,INDIA_1km,7,27.966415,77.112073,0,5
4,INDIA_1km,8,27.954784,77.127043,0,5
...,...,...,...,...,...,...
592,INDIA_1km,646,27.696755,77.610797,0,5
593,INDIA_1km,647,27.682648,77.621828,0,5
594,INDIA_1km,648,27.672665,77.615471,0,5
595,INDIA_1km,649,27.642475,77.622366,0,5


In [None]:
# Loop through each site to prepare the samples
# Prepare one sample file per site of the selected network.
# Iterate over `filtered_sites` rather than `sites`, so that the network
# filter applied earlier actually takes effect: the ground-truth files are
# looked up by station id only, so rows of other networks must not be visited.
# `filtered_sites` has a freshly reset index, so `i` counts from 0.
for i, site in filtered_sites.iterrows():
    print(f"Processing for {i}: {site['station']}")

    # Output file of this site: <network>_<station>.csv
    path_2_site_file = os.path.join(
        dir_to_site_samples, f"{site['network']}_{site['station']}.csv"
    )

    # Existing outputs are assumed to be correct and are never overwritten.
    if os.path.exists(path_2_site_file):
        print(f"{path_2_site_file} is already done.")
        continue

    # Grid polygon covering the site; only the WGS84 ring is needed here.
    ring_wgs, _ = pobj.get_wgs_grid(site.lon, site.lat)
    # Positional arguments after the CRS: geodesic=True, maxError=20,
    # evenOdd=False (per the ee.Geometry.Polygon signature).
    polygon_grid = ee.Geometry.Polygon(ring_wgs, 'EPSG:4326', True, 20, False)

    # Extract the samples and the Sentinel-1 table for the site.
    samples, df_S1 = utils_data_pre.samples_4_grid_v1(
        polygon_grid, START_DATE, END_DATE, START_DATE_NDVI, END_DATE_NDVI,
        ring_wgs, pobj, grid_size,
    )
    if df_S1 is None or samples is None:
        print("Abort")
        continue

    # Ground truth: soil moisture measured at the station.
    station_sm = pd.read_csv(
        os.path.join(dir_to_site_sm, f"{site['station']}.csv"),
        parse_dates=['time'],
    )
    # Keep only the station records whose calendar day occurs in df_S1.
    s1_days = df_S1['date'].dt.date.tolist()
    matched_sm = station_sm[station_sm['time'].dt.date.isin(s1_days)]

    # A length mismatch only means that some Sentinel-1 dates have no station
    # record; the left merge below leaves NaN there and dropna removes them.
    if len(matched_sm) != len(df_S1):
        print(f'Value and key do not have the same length, it is not a problem! {len(matched_sm)} vs {len(df_S1)}')

    # Build the (date, sm) table from the matched rows only. The date is
    # truncated to the calendar day and converted back to datetime so that it
    # can be merged with df_S1['date'].
    # NOTE(review): the merge only matches if df_S1['date'] carries no
    # time-of-day component - confirm in utils_data_pre.
    sm_df = pd.DataFrame({
        'date': pd.to_datetime(matched_sm['time'].dt.date),
        'sm': matched_sm['sm'],
    })
    # Left merge: keep exactly the dates present in df_S1.
    df_S1 = df_S1.merge(sm_df, on='date', how='left')

    # Concatenate the samples (NDVI, temperature, precipitation, SoilGrids,
    # DEM data) with df_S1 (Sentinel-1 data).
    try:
        samples = pd.DataFrame(samples, index=df_S1.index)
    except Exception as e:
        # Typically a row-count mismatch between samples and df_S1; report
        # both shapes and skip the site instead of stopping the whole run.
        print(f"Error creating DataFrame: {e}")
        print(f"samples shape: {np.shape(samples)}")
        print(f"df_S1 shape: {df_S1.shape}")
        continue

    samples = pd.concat([df_S1, samples], axis=1)
    # Drop every row with a missing value, including dates without ground truth.
    samples = samples.dropna()
    samples.to_csv(path_2_site_file)

    # Sleep for a while to avoid GEE errors.
    time.sleep(5)
    print("Done !!!")