# Download Satellite Images

This notebook downloads satellite images. You need access to Google Earth Engine. For best performance, run it in Google Colab; a Colab-optimized version is available as `0.1_download_satellite_colab.ipynb`. Processing can take up to several hours, depending on the load on Google's servers. The export is based on this [work](https://github.com/sustainlab-group/africa_poverty) ([MIT License](https://github.com/sustainlab-group/africa_poverty/commit/653ed5a60bb503d073996753c4f89d45ec480c59)) and has been modified for our needs.

In [None]:
cd ..

In [None]:
import ee
import math
import pandas as pd

from lib import satellite_utils
from __future__ import annotations
from typing import Optional

%load_ext autoreload
%autoreload 2

In [None]:
# Run the Earth Engine authentication flow; requires an account with
# Google Earth Engine access.
ee.Authenticate()

In [None]:
# Initialize the Earth Engine client library; must run before any other
# `ee` object is built in the cells below.
ee.Initialize()

In [None]:
# Export destination settings, forwarded by `export_images` to
# `satellite_utils.get_array_patches`.

# Name of the folder in your drive that receives the exported files.
LSMS_EXPORT_FOLDER = 'tfrecords_raw'

# Passed as `export=`; 'drive' presumably selects Google Drive as the target.
EXPORT = 'drive'

# Passed as `bucket=`; left unset here.
BUCKET = None

In [None]:
# CSV file with the LSMS survey locations that drive the export
LSMS_CSV_PATH = '../data/lsms/processed/_all_nominal.csv'

# names of the bands selected from the image collection
MS_BANDS = [
    'BLUE', 'GREEN', 'RED',
    'NIR', 'SWIR1', 'SWIR2',
    'TEMP1',
]

# image parameters
# - projection: see https://epsg.io/3857
# - scale: export resolution in m/px (30m/px)
# - tile radius: image dimension = (2*EXPORT_TILE_RADIUS) + 1 = 255px
PROJECTION = 'EPSG:3857'
SCALE = 30
EXPORT_TILE_RADIUS = 127

# max number of images per exported file; None puts a whole survey in one file
CHUNK_SIZE = None

In [None]:
def export_images(df: pd.DataFrame,
                  country: str,
                  year: int,
                  export_folder: str,
                  chunk_size: Optional[int] = None
                  ) -> dict[tuple[str, str, int, int], ee.batch.Task]:
    '''Builds the image export tasks for one survey (country + year).

    The rows of `df` belonging to the survey are split into chunks. For each
    chunk, a median composite of the year's masked Landsat images is built,
    latitude/longitude and nightlights bands are added, and the result is
    handed to `satellite_utils.get_array_patches` together with the chunk's
    locations.

    Args
    - df: pd.DataFrame, contains columns ['lat', 'lon', 'country', 'year']
    - country: str, together with `year` determines the survey to export
    - year: int, together with `country` determines the survey to export
    - export_folder: str, name of folder for export
    - chunk_size: int, optionally set a limit to the # of images exported per TFRecord file
        - set to a small number (<= 50) if Google Earth Engine reports memory errors
        - None (default) exports the whole survey as a single chunk

    Returns: dict, maps task name tuple (export_folder, country, year, chunk) to ee.batch.Task
        - empty if `df` has no rows for the requested survey

    Raises: ValueError, if `chunk_size` is given but is not a positive number
    '''
    if chunk_size is not None and chunk_size < 1:
        raise ValueError(
            f'chunk_size must be a positive integer or None, got {chunk_size!r}')

    subset_df = df[(df['country'] == country) & (df['year'] == year)].reset_index(drop=True)
    num_rows = len(subset_df)
    if num_rows == 0:
        # Nothing to export; also avoids dividing by a chunk size of 0 below.
        return {}
    if chunk_size is None:
        chunk_size = num_rows
    num_chunks = math.ceil(num_rows / chunk_size)

    # The date range only depends on the survey year, not on the chunk.
    start_date, end_date = f'{year}-01-01', f'{year}-12-31'

    tasks = {}
    for i in range(num_chunks):
        # Positional, half-open slice: rows [i*chunk_size, (i+1)*chunk_size).
        chunk_df = subset_df.iloc[i * chunk_size:(i + 1) * chunk_size]
        fc = satellite_utils.df_to_fc(chunk_df)

        # Median composite of the year's Landsat images over the chunk's area,
        # restricted to the multispectral bands.
        roi = fc.geometry()
        imgcol = satellite_utils.LandsatSR(roi, start_date=start_date, end_date=end_date).merged
        imgcol = imgcol.map(satellite_utils.mask_qaclear).select(MS_BANDS)
        img = imgcol.median()

        # add nightlights, latitude, and longitude bands
        img = satellite_utils.add_latlon(img)
        img = img.addBands(satellite_utils.composite_nl(year))

        fname = f'{country}_{year}_{i:02d}'
        tasks[(export_folder, country, year, i)] = satellite_utils.get_array_patches(
            img=img, scale=SCALE, ksize=EXPORT_TILE_RADIUS,
            points=fc, export=EXPORT,
            prefix=export_folder, fname=fname,
            bucket=BUCKET)
    return tasks

In [None]:
# Registry of the export tasks returned by `export_images`, keyed by
# (export_folder, country, year, chunk).
tasks: dict[tuple[str, str, int, int], ee.batch.Task] = {}

In [None]:
# Load the LSMS survey table. `float_precision='high'` selects pandas'
# higher-precision float converter, and `index_col=False` keeps the first
# column as data instead of using it as the index.
# `export_images` expects the columns ['lat', 'lon', 'country', 'year'].
lsms_df = pd.read_csv(LSMS_CSV_PATH, float_precision='high', index_col=False)

In [None]:
# One survey per distinct (country, year) pair present in the LSMS table.
lsms_surveys = list(lsms_df.groupby(['country', 'year']).groups)

# Build the exports survey by survey and record the returned tasks in `tasks`.
for country, year in lsms_surveys:
    new_tasks = export_images(
        df=lsms_df,
        country=country,
        year=year,
        export_folder=LSMS_EXPORT_FOLDER,
        chunk_size=CHUNK_SIZE,
    )
    tasks.update(new_tasks)