## Prepare samples using GEE

### Setup
For each site, extract the full time series of Sentinel-1 and NDVI data from Google Earth Engine (GEE). Note: if an output CSV file already exists, it is assumed to be correct and is not overwritten.

Note: A proxy was configured so that GEE can be reached from within China; you may not need one. Also check the proxy settings in utils.py.

In [1]:
import os
import pandas as pd
import numpy as np
import time
import ee
import utils_data_pre_HR

In [7]:
# ee.Authenticate(force = True) # authenticate the gee account
# ee.Initialize()

Set the parameters, paths etc.

In [2]:
# Root directory holding all inputs and outputs of this notebook; adjust it to your machine.
HOME_DATA_DIR = '/mnt/data2tb/Transfer-DenseSM-E_2/1km_global_data/india' # change the dir
# Date range for the Sentinel-1 data (the trailing comments keep an alternative range).
START_DATE = "2021-01-01" # alternative: "2016-01-01"
END_DATE =  "2022-12-31"  # alternative: "2023-11-30"
# Date range for the NDVI and weather data: it starts about one year before
# START_DATE and ends a few days after END_DATE.
START_DATE_NDVI = "2019-12-26" # alternative: "2019-12-26"
END_DATE_NDVI = "2023-01-08"  # alternative: "2023-01-06"
# Global settings, directories and paths.
# Flag meaning "do not write temporary files"; it is not referenced in the visible cells.
save_to_disk = False # No temporary files
# Site information table extracted by Preprocessing_ISMN_Raw_Data.ipynb.
SM_SITES = os.path.join(HOME_DATA_DIR, "points/tree_grass_crops_site_info.csv") # the site information extracted by Preprocessing_ISMN_Raw_Data.ipynb
# Directory with the soil moisture records of the stations.
dir_to_site_sm = os.path.join(HOME_DATA_DIR, "tree_grass_crops_csv") # the path to the soil moisture of stations
# Directory in which the per-site output CSV paths are built; created below if missing.
dir_to_site_samples = os.path.join(HOME_DATA_DIR, "ndvi/ndvi_output")
os.makedirs(dir_to_site_samples, exist_ok= True)

Read the sites information and determine the grid size

In [3]:
# Load the site table from SM_SITES; float_precision="high" selects pandas'
# high-precision float parser for the numeric columns.
sites = pd.read_csv(SM_SITES, float_precision="high")
# Alternative input: sites = pd.read_csv('data_pre/stations.csv', float_precision="high")

# Grid cell size; the original comments give the unit as km (0.05 was an alternative value).
grid_size = 1.0 #km
# Build the grid helper used later to derive the polygon around each site.
# NOTE(review): 4326 is presumably the EPSG code (WGS84), matching the
# 'EPSG:4326' string passed to ee.Geometry.Polygon elsewhere - confirm in utils_data_pre_HR.
pobj=utils_data_pre_HR.grids_4_a_region(4326,grid_size) # determine the grid size

## A loop to prepare the input data of each site

###### 1 Create the grid polygon covering a site in both EASE2.0 and WGS84
###### 2 Extract Sentinel-1, soil texture, terrain, NDVI, precipitation, temperature, etc. Check the utils for the details
###### 3 Concatenate all data
###### 4 Extract the ground soil moisture of the site
###### Note: the loop may report the error "IncompleteRead"; if it does, just run this cell again.

In [5]:
# Column overview; as a bare expression it has no effect unless it is the
# last statement of the cell.
sites.columns
# Keep only the rows whose network is 'INDIA_1km' and renumber them from 0
# so the result carries a clean consecutive index.
filtered_sites = sites.loc[sites['network'] == 'INDIA_1km'].reset_index(drop=True)
# Display the filtered table as the cell output.
filtered_sites

Unnamed: 0,network,station,lat,lon,s_depth,e_depth
0,INDIA_1km,1,28.068695,77.117534,0,5
1,INDIA_1km,2,28.056521,77.117582,0,5
2,INDIA_1km,3,28.034862,77.122390,0,5
3,INDIA_1km,7,27.966415,77.112073,0,5
4,INDIA_1km,8,27.954784,77.127043,0,5
...,...,...,...,...,...,...
592,INDIA_1km,646,27.696755,77.610797,0,5
593,INDIA_1km,647,27.682648,77.621828,0,5
594,INDIA_1km,648,27.672665,77.615471,0,5
595,INDIA_1km,649,27.642475,77.622366,0,5


In [6]:
# Walk over every row of the site table and request its data from GEE.
# A site whose output CSV already exists is skipped, so the cell can simply
# be re-run after an interruption and continues where it stopped.
# NOTE(review): the loop iterates `sites`, not the `filtered_sites` frame
# built in the previous cell - confirm that this is intended.
for i, site in sites.iterrows():
    print(f"Processing for {i}: {site['station']}")

    # Output file name is "<network>_<station>.csv" inside dir_to_site_samples.
    site_name = site['network'] + '_' + str(site['station'])
    path_2_site_file = os.path.join(dir_to_site_samples, '%s.csv' % site_name)
    if os.path.exists(path_2_site_file):
        print(f"{path_2_site_file} is already done.")
        continue

    # Grid cell around the site, then the matching Earth Engine polygon.
    ring_wgs, grid_ring = pobj.get_wgs_grid(site.lon, site.lat)
    polygon_grid = ee.Geometry.Polygon(ring_wgs, 'EPSG:4326', True, 20, False)

    # Only the third returned item is kept; the helper also receives the
    # output path (presumably it writes the CSV there - see utils_data_pre_HR).
    _, _, df_NDVI, _ = utils_data_pre_HR.prepare_grid_data_v1(
        polygon_grid,
        START_DATE,
        END_DATE,
        START_DATE_NDVI,
        END_DATE_NDVI,
        ring_wgs,
        pobj,
        path_2_site_file,
    )

    # Short pause between consecutive sites.
    time.sleep(2)
    print("Done !!!")

Processing for 0: 1
Number of images: 227
Done !!!
Processing for 1: 2
Number of images: 218
Done !!!
Processing for 2: 3
Number of images: 204
Done !!!
Processing for 3: 7
Number of images: 166
Done !!!
Processing for 4: 8
Number of images: 104
Done !!!
Processing for 5: 9
Number of images: 139
Done !!!
Processing for 6: 10
Number of images: 112
Done !!!
Processing for 7: 11
Number of images: 86
Done !!!
Processing for 8: 12
Number of images: 89
Done !!!
Processing for 9: 13
Number of images: 86
Done !!!
Processing for 10: 14
Number of images: 86
Done !!!
Processing for 11: 15
Number of images: 86
Done !!!
Processing for 12: 16
Number of images: 86
Done !!!
Processing for 13: 17
Number of images: 86
Done !!!
Processing for 14: 18
Number of images: 86
Done !!!
Processing for 15: 19
Number of images: 86
Done !!!
Processing for 16: 20
Number of images: 86
Done !!!
Processing for 17: 21
Number of images: 86
Done !!!
Processing for 18: 22
Number of images: 86
Done !!!
Processing for 19: 23

### Extract input data for inference

In [None]:
# Select rectangle 2 because the soil moisture (SM) oscillates in this area.
# Note: this rebinds `sites`, replacing the site table loaded earlier.
csv_path = '/mnt/data2tb/Transfer-DenseSM-E/India/visualize/visualization_swc_values_2.csv'
sites = pd.read_csv(csv_path, float_precision="high")

batch_size = 50   # number of rows requested before pausing
delay = 30        # seconds to wait between two batches
max_batches = 30  # only the first `max_batches` batches are processed

# Split the rows into consecutive batches. The stepped range already yields
# the final, shorter batch, so it must not be appended a second time
# (doing so would process the remainder of the table twice).
batches = [sites[i:i + batch_size] for i in range(0, len(sites), batch_size)]
selected_batches = batches[:max_batches]

all_samples = []

for batch_id, batch in enumerate(selected_batches):
    print(f"Processing batch {batch_id + 1}/{len(selected_batches)} with {len(batch)} sites")
    for i, row in batch.iterrows():
        print(f"Processing for {i}: {row['location']}")

        # Grid cell around the point and the matching Earth Engine polygon.
        ring_wgs, grid_ring = pobj.get_wgs_grid(row['lon'], row['lat'])
        polygon_grid = ee.Geometry.Polygon(ring_wgs, 'EPSG:4326', True, 20, False)

        # NOTE(review): `utils_data_pre` is not among the imports at the top of
        # this notebook (only `utils_data_pre_HR` is) - confirm which module
        # provides samples_4_grid_v1 and import it before running this cell.
        samples, df_S1 = utils_data_pre.samples_4_grid_v1(
            polygon_grid,
            START_DATE,
            END_DATE,
            START_DATE_NDVI,
            END_DATE_NDVI,
            ring_wgs,
            pobj,
            grid_size,
        )

        # Normalise both dates to 'YYYY-MM-DD' strings so they compare equal.
        df_S1['date'] = pd.to_datetime(df_S1['date']).dt.strftime('%Y-%m-%d')
        single_date = pd.to_datetime(row['date']).strftime('%Y-%m-%d')

        station_sm = row['sm_25']
        print(station_sm)

        # Keep only the Sentinel-1 rows acquired on the row's date; copy so the
        # column assignment below does not write into a view of the full frame.
        df_S1 = df_S1[df_S1['date'] == single_date].copy()

        # Select the sample rows that carry the same index labels as the kept
        # Sentinel-1 rows.
        samples = pd.DataFrame(samples)
        samples = samples.loc[df_S1.index]

        df_S1['sm_25'] = station_sm

        # Put both tables side by side after resetting their indexes.
        result = pd.concat(
            [df_S1.reset_index(drop=True), samples.reset_index(drop=True)],
            axis=1,
        )
        result.insert(0, 'location', row['location'])
        result.dropna(inplace=True)

        # Overwritten on every iteration: the file only holds the latest result.
        result.to_csv('result.csv', index=False)
        all_samples.append(result)

    # Pause between batches, but not after the last batch that is processed.
    if batch_id < len(selected_batches) - 1:
        print(f"Sleeping for {delay} seconds...")
        time.sleep(delay)

# pd.concat raises on an empty list, so only merge when something was collected.
if all_samples:
    final_df = pd.concat(all_samples, ignore_index=True)
    final_df.dropna(inplace=True)
    final_df.to_csv('final_result.csv', index=False)
    print(final_df.head())
else:
    print("No samples were extracted; final_result.csv was not written.")