# Satellite Image Exporter (Landsat via GEE)

This notebook downloads the satellite imagery used in the analysis from Google Earth Engine (GEE). For each DHS cluster, it retrieves a 6.72 × 6.72 km (224 × 224 px) image.

- Image composition: For each pixel, the median value across cloud-free observations within the three years prior to the survey is used.
- Sources: Imagery comes from Landsat 5, 7, and 8.
- Runtime: Execution typically takes ~2 hours, although runtime may vary depending on GEE server load.
- Output: The exported dataset is roughly 33 GB and is saved to the DATA_DIR specified in your config file.

If the run is interrupted, simply rerun the notebook. Existing files are skipped, so already-downloaded tiles won’t be re-exported.

In [3]:
import ee
import os
import requests
import multiprocessing
import pandas as pd
import numpy as np
from io import BytesIO
from pathlib import Path
from retry import retry
from GEE.landsat_exporter import LandsatExporter
import configparser

# Read config file
CONFIG_PATH = '../config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so that a missing or misplaced config
# fails here with a clear message instead of as a KeyError in a later cell.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Could not read config file: {os.path.abspath(CONFIG_PATH)}')

# # Authenticate with Google account (uncomment to authenticate)
# ee.Authenticate()

# Initialize Google Earth Engine with the High-volume endpoint
# For more info, see https://developers.google.com/earth-engine/cloud/highvolume
ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')

In [None]:
# --- Paths (taken from the [PATHS] section of the config file) ---
DATA_DIR = config['PATHS']['DATA_DIR']
# Root folder for the exported images; each cluster gets its own sub-folder.
DOWNLOAD_DIR = os.path.join(DATA_DIR, 'dhs_images')

# --- Export settings ---
# Side length, in meters, of the square frame exported around each cluster.
IMAGE_WIDTH = 6720
# How many years before the survey should be used to get median image?
NUM_YEAR_SPAN = 3
# The two flags below are encoded in the output file name by
# get_file_name_generator.
MASK_CLOUDS = True
DROP_FAULTY_L7 = True

# --- DHS cluster table (one row per cluster to export) ---
dhs_csv_path = os.path.join(DATA_DIR, 'dhs_data.csv')
df = pd.read_csv(dhs_csv_path)
df.head()

Unnamed: 0,cluster_id,lon,lat,rural,region_id,country,survey,month,year,iwi
0,AO.Bengo.71.135,13.640789,-8.589805,False,AO.Bengo,Angola,Angola 2015-16 Standard DHS,11,2015,62.334459
1,AO.Bengo.71.158,14.122619,-7.718385,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,2,2016,8.226589
2,AO.Bengo.71.169,13.654425,-8.592545,False,AO.Bengo,Angola,Angola 2015-16 Standard DHS,10,2015,62.760211
3,AO.Bengo.71.203,13.517859,-8.65226,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,1,2016,68.211697
4,AO.Bengo.71.208,13.721998,-7.852511,True,AO.Bengo,Angola,Angola 2015-16 Standard DHS,11,2015,14.825944


In [None]:
def get_file_name_generator(download_dir, num_year_span, mask_clouds, drop_faulty_l7):
    """Create a function that maps a cluster id to the path of its image file.

    The file name encodes the export settings. The default combination
    (3-year span, clouds masked, faulty Landsat 7 scenes dropped) uses the
    short name 'landsat.np'.

    Args:
        download_dir: Root directory holding one sub-directory per cluster.
        num_year_span: Number of years the median composite spans.
        mask_clouds: Truthy if clouds are masked.
        drop_faulty_l7: Truthy if Landsat 7 scenes with the SLC error are dropped.

    Returns:
        A callable taking a cluster id and returning
        ``os.path.join(download_dir, id, file_name)``.
    """
    uses_default_settings = num_year_span == 3 and mask_clouds and drop_faulty_l7

    if uses_default_settings:
        file_name = 'landsat.np'
    else:
        # Spell out every non-default setting in the file name.
        cloud_tag = 'masked_clouds' if mask_clouds else 'with_clouds'
        slc_tag = 'without_slc_error' if drop_faulty_l7 else 'with_slc_error'
        file_name = f'landsat_{num_year_span!s}_year_median_{cloud_tag}_{slc_tag}.np'

    def get_file_name(id):
        # Images are stored as <download_dir>/<cluster id>/<file_name>.
        return os.path.join(download_dir, id, file_name)

    return get_file_name

Export images

In [5]:
# Build the cluster-id -> file-path function once, at module level, from the
# export settings; export_row_loc below looks it up as a global.
get_file_name = get_file_name_generator(DOWNLOAD_DIR, NUM_YEAR_SPAN, MASK_CLOUDS, DROP_FAULTY_L7)

@retry(tries=10, delay=1, backoff=2)
def export_row_loc(row):
    """Download the median Landsat composite around one DHS cluster and save it.

    The image covers an IMAGE_WIDTH x IMAGE_WIDTH meter frame centred on the
    cluster coordinate and is built from the NUM_YEAR_SPAN years preceding the
    first day of the survey month. Clusters whose file already exists are
    skipped, so the export can be resumed after an interruption.

    Args:
        row: Record supporting ``row[key]`` lookup with the keys
            'cluster_id', 'lon', 'lat', 'year' and 'month'.

    Returns:
        None. The image is written to ``get_file_name(cluster_id)``. Nothing is
        written when Earth Engine reports an image with no bands.

    Raises:
        ee.EEException: Earth Engine rejected the request for a reason other
            than "image with no bands".
        requests.HTTPError: The download request did not return HTTP 200.

    Any exception triggers a retry through the ``retry`` decorator and only
    reaches the caller once all tries are used up.
    """
    cluster_id = row['cluster_id']
    file_name = get_file_name(cluster_id)

    # Check if sample already downloaded
    if os.path.exists(file_name):
        return

    # Ensure download directory exists
    Path(os.path.join(DOWNLOAD_DIR, cluster_id)).mkdir(parents=True, exist_ok=True)

    loc = ee.Geometry.Point([row['lon'], row['lat']])

    # Define end_date (survey date) and start_date (NUM_YEAR_SPAN years before)
    year = int(row['year'])
    month = max(int(row['month']), 1)  # Set surveys with month 0 to month 1

    end_date = f'{year:04d}-{month:02d}-01'
    start_date = f'{year - NUM_YEAR_SPAN:04d}-{month:02d}-01'

    # NOTE(review): MASK_CLOUDS and DROP_FAULTY_L7 are encoded in the output
    # file name but are not passed to the exporter here - confirm that
    # LandsatExporter.get_collection applies the intended settings.
    loc_collection = LandsatExporter.get_collection(loc, start_date, end_date)

    # Get the median pixel values
    dwnld_img = loc_collection.median()

    # Get a IMAGE_WIDTH x IMAGE_WIDTH meter frame around cluster coordinate
    loc_bbox = loc.buffer(IMAGE_WIDTH/2).bounds()

    try:
        # Try to get the download URL
        loc_download_url = dwnld_img.getDownloadURL({
            'name': cluster_id,
            'region': loc_bbox,
            'dimensions': [224, 224],
            'filePerBand': False,
            'format': 'NPY'
        })
    except ee.EEException as e:
        print(f'Error for row {cluster_id}: {str(e)}')
        # "No bands" is a permanent condition for this cluster, so retrying
        # would not help: give up on this cluster without saving anything.
        if "Expression evaluates to an image with no bands" in str(e):
            return
        # Let other exceptions pass through (and trigger a retry)
        raise

    # Bound the request so a stalled connection raises (and is retried)
    # instead of hanging this worker forever: (connect, read) seconds.
    r = requests.get(loc_download_url, timeout=(10, 300))

    if r.status_code != 200:
        # Retry, get request failed
        print(f'{r.status_code}: {r.reason}')
        raise requests.HTTPError(f'{r.status_code}: {r.reason}', response=r)

    # Load bytes as numpy array (bands as fields of a structured array)
    img = np.load(BytesIO(r.content))

    # Convert to a standard 3D numpy array with the bands on the last axis
    img = np.stack([img[field] for field in img.dtype.names], axis=-1)

    # Write to a temporary file and move it into place afterwards. An
    # interrupted run then never leaves a truncated file at file_name, which
    # the "already downloaded" check above would otherwise skip forever.
    # np.save is given a file object because it would append '.npy' to a path.
    tmp_name = f'{file_name}.{os.getpid()}.tmp'
    try:
        with open(tmp_name, 'wb') as out_file:
            np.save(out_file, img)
        os.replace(tmp_name, file_name)
    except BaseException:
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise

Some locations evaluate to "an image with no bands". This means that not a single cloud-free observation was available in the three years leading up to the survey, so no image is saved for those clusters.

In [6]:
# Get samples as list, since multiprocessing doesn't work with dataframes
clusters = [row for _, row in df.iterrows()]

n_workers = 40

# Run the export in parallel. The context manager terminates the worker
# processes if pool.map raises (for example when a cluster still fails after
# all retries), so a failed run does not leave orphaned workers behind.
with multiprocessing.Pool(n_workers) as pool:
    pool.map(export_row_loc, clusters)
    # Normal path: shut the workers down cleanly before leaving the block.
    pool.close()
    pool.join()

Error for row CM.Adamaoua.22.122: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.123: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.124: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.125: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.126: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.127: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.128: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.13: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.14: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.15: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.16: Expression evaluates to an image with no bands.
Error for row CM.Adamaoua.22.17: Expression evaluates to an image with no bands.
Error for row CM.Adam