# Identify and rerun failed Sentinel-2 jobs

Some Sentinel-2 images are inaccessible from GEE. When jobs are submitted with monthly time ranges, for example, a single inaccessible image can cause an entire month of observations to be lost. 

The goals of this notebook therefore are to:
- Load your RGI glaciers subset
- Load the compiled time series that were output by `compile_CSVs.ipynb` and downloaded locally

Then, for each glacier:
- Identify time ranges with no Sentinel-2 observations
- Rerun the pipeline for the missing Sentinel-2 months with daily time ranges.

I recommend NOT downloading any new files locally until each batch of failed jobs has been successfully resubmitted, so that indexing, etc. stays consistent. 

In [1]:
import os
import ee 
import sys
from glob import glob
import pandas as pd
from tqdm import tqdm

# -----Define path in local directory to compiled time series
# NOTE: This is a machine-specific absolute path; update it before running elsewhere.
ts_folder = '/Users/rdcrlrka/Research/glacier_snow_mapping/glacier_snow_cover_exports_compiled'

# -----Define Google Drive folder for exports
# NOTE: Make sure this folder already exists and is the only folder with that name in "My Drive". 
out_folder = 'glacier_snow_cover_exports'

# -----Import pipeline utilities
# Assumes glasee_pipeline_utils.py is in the current working directory
# (normally the folder containing this notebook)
script_path = os.getcwd()
# Only add the folder once so that re-running this cell does not keep growing sys.path
if script_path not in sys.path:
    sys.path.append(script_path)
import glasee_pipeline_utils as utils

# -----Define image search settings
# Date and month ranges (inclusive)
date_start = '2014-04-01' 
date_end = '2025-09-15' 
month_start = 4 # April = 4
month_end = 10 # Oct = 10
# Minimum fill portion percentage of the AOI (0–100), used to remove images after mosaicking by day
min_aoi_coverage = 70
# Whether to mask clouds using the respective cloud mask via the geedim package
mask_clouds = True

## Authenticate and/or Initialize Google Earth Engine (GEE)

In [2]:
# Project ID passed to ee.Initialize for this notebook
project_id = "ee-raineyaberle"

# Try to initialize with existing credentials first; if that fails,
# run the authentication flow and initialize again.
try:
    ee.Initialize(project=project_id)
except Exception:
    # Catch Exception (not a bare except) so that KeyboardInterrupt and
    # SystemExit are not silently turned into an authentication prompt.
    ee.Authenticate()
    ee.Initialize(project=project_id)

## Load the RGI glaciers subset and compiled time series file names

In [3]:
# Load the RGI v. 7 dataset
rgi = ee.FeatureCollection("projects/ee-raineyaberle/assets/glacier-snow-cover-mapping/RGI2000-v7-G")
# Apply filters (both area bounds are inclusive)
rgi_filt = (rgi
            .filter(ee.Filter.eq('o1region', '02')) # Western U.S. & Canada
            .filter(ee.Filter.gte('area_km2', 10)) # area >= 10 km2
            .filter(ee.Filter.lte('area_km2', 200)) # area <= 200 km2
            )
# Get the list of RGI IDs as a client-side Python list
id_list = rgi_filt.aggregate_array('rgi_id').getInfo()
print('Number of glaciers selected from the RGI:', len(id_list))

# Load compiled time series file names
ts_files = sorted(glob(os.path.join(ts_folder, '*.csv')))
# The RGI ID is the part of each file name before the first underscore
ts_id_list = [os.path.basename(x).split('_')[0] for x in ts_files]
print("Number of glaciers with compiled time series:", len(ts_id_list))

# Identify glaciers in the RGI subset without compiled time series
# (a set gives constant-time membership checks; the order of id_list is preserved)
ts_id_set = set(ts_id_list)
missing_id_list = [x for x in id_list if x not in ts_id_set]
print('Number of glaciers in the RGI subset without compiled time series:', len(missing_id_list))
if missing_id_list:
    print(missing_id_list)

Number of glaciers selected from the RGI: 149
Number of glaciers with compiled time series: 144
Number of glaciers in the RGI subset without compiled time series: 5
['RGI2000-v7.0-G-02-05381', 'RGI2000-v7.0-G-02-10331', 'RGI2000-v7.0-G-02-10336', 'RGI2000-v7.0-G-02-10357', 'RGI2000-v7.0-G-02-10372']


## Identify months with no observations for each glacier

In [4]:
# Initialize dictionary to hold results
missing_dict = {}

# First month to check for each Sentinel-2 product
s2_first_months = {
    'Sentinel-2_SR': "2019-01-01",
    'Sentinel-2_TOA': "2016-05-01",
}


def _find_missing_months(ts, source, first_month):
    """Return the months ('YYYY-MM') with no observations from `source`.

    Parameters
    ----------
    ts : pandas.DataFrame
        Compiled time series with a datetime 'date' column and a 'source' column.
    source : str
        Value of the 'source' column to check, e.g. "Sentinel-2_SR".
    first_month : str
        First date of the period to check; months are checked through `date_end`.

    Returns
    -------
    list of str
        Months within the `month_start`-`month_end` window (inclusive) that have
        no rows for `source`, formatted as 'YYYY-MM'.
    """
    # All month starts in the period, restricted to the configured month window
    expected_months = pd.date_range(start=first_month, end=date_end, freq='MS')
    expected_months = expected_months[
        (expected_months.month >= month_start) & (expected_months.month <= month_end)
    ]
    # Month starts that have at least one observation from this source
    observed_months = (
        ts.loc[ts['source'] == source, 'date']
        .dt.to_period('M')
        .drop_duplicates()
        .dt.to_timestamp()
    )
    return [x.strftime('%Y-%m') for x in expected_months.difference(observed_months)]


# Iterate over study sites
# ts_id_list was derived element-by-element from ts_files, so the two lists are
# parallel and each ID can be paired directly with its own file.
for rgi_id, ts_file in tqdm(zip(ts_id_list, ts_files), total=len(ts_files)):
    # open the compiled time series
    ts = pd.read_csv(ts_file)
    ts['date'] = pd.to_datetime(ts['date'])

    # identify missing months for each Sentinel-2 product and add to the dictionary
    missing_dict[rgi_id] = {
        source: _find_missing_months(ts, source, first_month)
        for source, first_month in s2_first_months.items()
    }

total_missing_months = sum(
    len(months) for site in missing_dict.values() for months in site.values()
)
# Count only glaciers that actually have at least one missing month
# (missing_dict holds an entry for every glacier, including those with no gaps)
glaciers_with_missing_months = sum(
    1 for site in missing_dict.values() if any(site.values())
)

print('Glaciers with missing months:', glaciers_with_missing_months)
print('Total number of missing months across all glaciers:', total_missing_months)


100%|████████████████████████████████████████| 144/144 [00:00<00:00, 289.02it/s]

Glaciers with missing months: 144
Total number of missing months across all glaciers: 1339





## Rerun failed jobs with daily time ranges

In [5]:
# Helper function
def run_daily_jobs(dataset, missing_months, rgi_id, aoi, dem, out_folder):
    """Rerun the pipeline one day at a time for every month in `missing_months`.

    For each month, consecutive (start, end) day pairs are built from the first
    day of the month through the first day of the following month, and the
    query -> classify -> statistics steps from `utils` are run for each pair.

    Relies on the notebook-level settings `month_start`, `month_end`,
    `min_aoi_coverage` and `mask_clouds`, and on the imported `utils` module.

    Parameters
    ----------
    dataset : str
        Dataset name, e.g. "Sentinel-2_TOA". Names containing 'Sentinel-2'
        use a scale of 10; all others use 30.
    missing_months : list of str
        Months to rerun, formatted as 'YYYY-MM'.
    rgi_id : str
        Glacier ID, used in the export file name prefix.
    aoi, dem
        Passed through unchanged to the `utils` functions.
    out_folder : str
        Passed to `utils.calculate_snow_cover_statistics` as the export folder.

    Returns
    -------
    None
    """
    # determine scale
    scale = 10 if ('Sentinel-2' in dataset) else 30

    # iterate over months
    for month in missing_months:
        print(month)

        # create daily time ranges
        first_day = pd.Timestamp(month + '-01')
        # Use a calendar offset rather than "month number + 1" so that December
        # rolls over to January of the next year instead of an invalid month 13.
        next_month_first_day = first_day + pd.DateOffset(months=1)
        days = pd.date_range(first_day, next_month_first_day).strftime('%Y-%m-%d').tolist()
        day_ranges = list(zip(days[:-1], days[1:]))

        # iterate over day ranges
        for day_range in tqdm(day_ranges):
            # Query GEE for imagery
            image_collection = utils.query_gee_for_imagery(
                dataset, aoi, day_range[0], day_range[1], month_start, month_end,
                min_aoi_coverage, mask_clouds, scale, verbose=False
            )

            # Classify image collection
            classified_collection = utils.classify_image_collection(image_collection, dataset, verbose=False)

            # Calculate snow cover statistics, export to Google Drive
            _ = utils.calculate_snow_cover_statistics(
                classified_collection, dem, aoi, dataset, scale, out_folder,
                file_name_prefix=f"{rgi_id}_{dataset}_snow_cover_stats_{day_range[0]}_{day_range[1]}", verbose=False
            )
        

In [24]:
# Position (in missing_dict order) of the first glacier to process in this batch.
# NOTE(review): this looks like a leftover from submitting jobs in manual batches;
# if missing_dict has fewer entries than this index, the loop below does nothing.
first_site_index = 147
batch_ids = list(missing_dict)[first_site_index:]

for rgi_id in batch_ids:
    print(f'\n{rgi_id}')
    print('----------------')
    # Geometry of the RGI feature(s) matching this ID, used as the AOI
    aoi = rgi.filter(ee.Filter.eq('rgi_id', rgi_id)).geometry()

    # Query GEE for DEM
    dem = utils.query_gee_for_dem(aoi)

    # Sentinel-2_SR (currently disabled; uncomment to resubmit SR jobs as well)
    # print('\nSentinel-2_SR')
    # missing_months_s2_sr = missing_dict[rgi_id]['Sentinel-2_SR']
    # run_daily_jobs("Sentinel-2_SR", missing_months_s2_sr, rgi_id, aoi, dem, out_folder)

    # Sentinel-2_TOA
    print('\nSentinel-2_TOA')
    run_daily_jobs(
        "Sentinel-2_TOA",
        missing_dict[rgi_id]['Sentinel-2_TOA'],
        rgi_id, aoi, dem, out_folder,
    )
    

In [25]:
# Summarize the Earth Engine task queue by state
tasks = ee.batch.Task.list()
# Collect every task's state once, then count each state of interest
task_states = [task.state for task in tasks]
running_count = task_states.count('RUNNING')
ready_count = task_states.count('READY')
completed_count = task_states.count('COMPLETED')
failed_count = task_states.count('FAILED')

print('Total number of tasks =', len(tasks))
for label, count in (
    ('RUNNING', running_count),
    ('READY', ready_count),
    ('COMPLETED', completed_count),
    ('FAILED', failed_count),
):
    print(f'{label} tasks =', count)

Total number of tasks = 18831
RUNNING tasks = 0
READY tasks = 1443
COMPLETED tasks = 17326
FAILED tasks = 62


In [57]:
# # Job limit exceeded, whoops
# rgi_id = 'RGI2000-v7.0-G-02-08338'
# dataset = 'Sentinel-2_TOA'
# # create daily time ranges
# days = pd.date_range('2020-10-19', '2020-11-01')
# days = [str(x)[0:10] for x in days]
# day_ranges = list(zip(days[0:-1], days[1:]))
# scale = 10

#  # iterate over day ranges
# for day_range in tqdm(day_ranges):
#     # Query GEE for imagery
#     image_collection = utils.query_gee_for_imagery(
#         dataset, aoi, day_range[0], day_range[1], month_start, month_end, 
#         min_aoi_coverage, mask_clouds, scale, verbose=False
#     )

#     # Classify image collection
#     classified_collection = utils.classify_image_collection(image_collection, dataset, verbose=False)

#     # Calculate snow cover statistics, export to Google Drive
#     _ = utils.calculate_snow_cover_statistics(
#         classified_collection, dem, aoi, dataset, scale, out_folder,
#         file_name_prefix=f"{rgi_id}_{dataset}_snow_cover_stats_{day_range[0]}_{day_range[1]}", verbose=False
#     )