<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prerequisites" data-toc-modified-id="Prerequisites-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prerequisites</a></span></li><li><span><a href="#Imports-and-Constants" data-toc-modified-id="Imports-and-Constants-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports and Constants</a></span><ul class="toc-item"><li><span><a href="#Prepare-the-dataset" data-toc-modified-id="Prepare-the-dataset-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Prepare the dataset</a></span></li><li><span><a href="#Mask-out-cloud,-snow,-and-cloud-shadow" data-toc-modified-id="Mask-out-cloud,-snow,-and-cloud-shadow-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Mask out cloud, snow, and cloud shadow</a></span></li><li><span><a href="#Multi-yr-composite" data-toc-modified-id="Multi-yr-composite-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Multi-yr composite</a></span></li><li><span><a href="#Add-nightlight" data-toc-modified-id="Add-nightlight-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Add nightlight</a></span></li><li><span><a href="#Add-topography" data-toc-modified-id="Add-topography-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Add topography</a></span></li><li><span><a href="#Export-TF-Records" data-toc-modified-id="Export-TF-Records-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Export TF Records</a></span></li></ul></li></ul></div>

# Prerequisites

1. Register a Google account (for example a Gmail address) for Earth Engine at [https://code.earthengine.google.com](https://code.earthengine.google.com). Approval may take a couple of days. Without registration, the `ee.Initialize()` call below will raise an error.
2. Within your conda environment, run `earthengine authenticate` and follow the prompts. For more instructions, see [https://developers.google.com/earth-engine/python_install-conda.html](https://developers.google.com/earth-engine/python_install-conda.html).

## Instructions

This notebook must be run twice: once for DHS and once for LSMS. Adjust the parameters below to match your own Google Cloud Platform and Earth Engine accounts. Keep in mind that CSV_INPUT_PATH, DHS_ASSET_ID, GCS_FILE_PREFIX, and IS_DHS need different values when exporting LSMS data than when exporting DHS data (the upload step is skipped whenever the asset named by DHS_ASSET_ID already exists).

## Adjust Parameters

In [1]:
# Path to the survey CSV, located in this repo.
CSV_INPUT_PATH = '../data/dhs_wealth_index.csv'

# Asset ID of the survey table in your GEE account.
DHS_ASSET_ID = 'projects/ec985-thesis/assets/dhs_wealth_index'

# A GCS bucket you own; exported data is written here.
GCS_BUCKET = 'dhs-imgs'

# Prefix for exported file names. Using different folders for DHS and LSMS
# is recommended.
GCS_FILE_PREFIX = 'dhs_tfrecords_raw2/'

# True when exporting DHS data, False when exporting LSMS data.
IS_DHS = True

In [2]:
# Suffix appended to exported file names; identifies which survey family
# (DHS or LSMS) the locations came from.
file_suffix = '_dhslocs_' if IS_DHS else '_lsmslocs_'

# Imports and Constants

In [3]:
import ee

from ee_assets import upload_geojson_to_gee, asset_exists
import ee_utils
import pandas as pd
import numpy as np
import math

In [4]:
# Initialise the Earth Engine client. Per the Prerequisites section, this
# raises an error unless the account has been registered and authenticated.
ee.Initialize()

## Prepare the dataset

In [5]:
# Upload the survey file at CSV_INPUT_PATH to the Earth Engine asset
# DHS_ASSET_ID, but only when that asset does not exist yet. The same constant
# is used for the LSMS run, so it must point at a different asset then.
# NOTE(review): the helper is named upload_geojson_to_gee although it is given
# a CSV path — confirm it accepts CSV input.
if not asset_exists(DHS_ASSET_ID):
    upload_geojson_to_gee(CSV_INPUT_PATH, DHS_ASSET_ID)

In [6]:
# Survey locations as an Earth Engine FeatureCollection, loaded from the asset
# referenced by DHS_ASSET_ID.
dhs = ee.FeatureCollection(DHS_ASSET_ID)

In [7]:
# Split the survey locations into three periods by their 'year' property.
# 2009-2011 inclusive
dhs_oldest = dhs.filter(
    ee.Filter.And(ee.Filter.gt('year', 2008), ee.Filter.lte('year', 2011)))
# 2012-2014 inclusive
dhs_middle = dhs.filter(
    ee.Filter.And(ee.Filter.gt('year', 2011), ee.Filter.lte('year', 2014)))
# 2015 onwards (no upper bound)
dhs_recent = dhs.filter(ee.Filter.gt('year', 2014))

In [8]:
# Report how many survey locations fall in each period.
for label, coll in (('Oldest size:', dhs_oldest),
                    ('Middle size:', dhs_middle),
                    ('Recent size:', dhs_recent)):
    print(label, coll.size().getInfo())

Oldest size: 8257
Middle size: 9628
Recent size: 5066


In [9]:
# Distinct country names across all survey locations.
countries = (
    dhs
    .distinct('country')
    .aggregate_array('country')
    .getInfo()
)
display(countries)

['Angola',
 'Burkina Faso',
 'Benin',
 'Democratic Republic of Congo',
 'Central African Republic',
 'Congo',
 "Cote d'Ivoire",
 'Cameroon',
 'Ethiopia',
 'Gabon',
 'Ghana',
 'Guinea',
 'Kenya',
 'Comoros',
 'Lesotho',
 'Mali',
 'Malawi',
 'Mozambique',
 'Nigeria',
 'Rwanda',
 'Sierra Leone',
 'Senegal',
 'Swaziland',
 'Chad',
 'Togo',
 'Tanzania',
 'Uganda',
 'South Africa',
 'Zambia',
 'Zimbabwe']

In [10]:
# Per-country location counts for the 2009-11 period.
countries = (
    dhs_oldest
    .distinct('country')
    .aggregate_array('country')
    .getInfo()
)
for i in countries:
    fname = ''.join(['lx_median_2009-11_', i, '_dhslocs'])
    df = dhs_oldest.filter(ee.Filter.eq('country', i))
    print(fname)
    print(df.size().getInfo())

lx_median_2009-11_Angola_dhslocs
238
lx_median_2009-11_Burkina Faso_dhslocs
573
lx_median_2009-11_Benin_dhslocs
81
lx_median_2009-11_Congo_dhslocs
572
lx_median_2009-11_Cote d'Ivoire_dhslocs
45
lx_median_2009-11_Cameroon_dhslocs
578
lx_median_2009-11_Ethiopia_dhslocs
596
lx_median_2009-11_Kenya_dhslocs
216
lx_median_2009-11_Lesotho_dhslocs
400
lx_median_2009-11_Malawi_dhslocs
849
lx_median_2009-11_Mozambique_dhslocs
880
lx_median_2009-11_Nigeria_dhslocs
239
lx_median_2009-11_Rwanda_dhslocs
492
lx_median_2009-11_Senegal_dhslocs
513
lx_median_2009-11_Tanzania_dhslocs
535
lx_median_2009-11_Uganda_dhslocs
1044
lx_median_2009-11_Zimbabwe_dhslocs
406


In [11]:
# Per-country location counts for the 2012-14 period.
countries = (
    dhs_middle
    .distinct('country')
    .aggregate_array('country')
    .getInfo()
)
for i in countries:
    fname = ''.join(['lx_median_2012-14_', i, '_dhslocs'])
    df = dhs_middle.filter(ee.Filter.eq('country', i))
    print(fname)
    print(df.size().getInfo())

lx_median_2012-14_Burkina Faso_dhslocs
252
lx_median_2012-14_Benin_dhslocs
669
lx_median_2012-14_Democratic Republic of Congo_dhslocs
536
lx_median_2012-14_Congo_dhslocs
52
lx_median_2012-14_Cote d'Ivoire_dhslocs
306
lx_median_2012-14_Gabon_dhslocs
336
lx_median_2012-14_Ghana_dhslocs
427
lx_median_2012-14_Guinea_dhslocs
300
lx_median_2012-14_Kenya_dhslocs
1594
lx_median_2012-14_Comoros_dhslocs
252
lx_median_2012-14_Lesotho_dhslocs
399
lx_median_2012-14_Mali_dhslocs
413
lx_median_2012-14_Malawi_dhslocs
280
lx_median_2012-14_Nigeria_dhslocs
896
lx_median_2012-14_Rwanda_dhslocs
342
lx_median_2012-14_Sierra Leone_dhslocs
435
lx_median_2012-14_Senegal_dhslocs
200
lx_median_2012-14_Chad_dhslocs
230
lx_median_2012-14_Togo_dhslocs
330
lx_median_2012-14_Tanzania_dhslocs
523
lx_median_2012-14_Uganda_dhslocs
135
lx_median_2012-14_Zambia_dhslocs
721


In [12]:
# Per-country location counts for the 2015-17 period.
countries = (
    dhs_recent
    .distinct('country')
    .aggregate_array('country')
    .getInfo()
)
for i in countries:
    fname = ''.join(['lx_median_2015-17_', i, '_dhslocs'])
    df = dhs_recent.filter(ee.Filter.eq('country', i))
    print(fname)
    print(df.size().getInfo())

lx_median_2015-17_Angola_dhslocs
625
lx_median_2015-17_Ethiopia_dhslocs
643
lx_median_2015-17_Ghana_dhslocs
200
lx_median_2015-17_Kenya_dhslocs
245
lx_median_2015-17_Mali_dhslocs
177
lx_median_2015-17_Malawi_dhslocs
850
lx_median_2015-17_Nigeria_dhslocs
326
lx_median_2015-17_Rwanda_dhslocs
309
lx_median_2015-17_Senegal_dhslocs
214
lx_median_2015-17_Chad_dhslocs
394
lx_median_2015-17_Tanzania_dhslocs
608
lx_median_2015-17_Uganda_dhslocs
75
lx_median_2015-17_Zimbabwe_dhslocs
400


## Multi-yr composite

In [13]:
# Landsat band names selected from each masked image before taking the median.
selbands = ['BLUE', 'GREEN', 'RED', 'NIR', 'SWIR1', 'SWIR2', 'TEMP1']

In [14]:
# 2015-17: median composite of the masked Landsat collection over the recent
# survey locations, with latitude/longitude bands attached.
roi_recent = dhs_recent.geometry()
srcoll_recent = (
    ee_utils.LandsatSR(roi_recent, '2015-1-1', '2017-12-31')
    .merged
    .map(ee_utils.mask_qaclear)
    .select(selbands)
)
srmedian_recent = ee_utils.add_latlon(srcoll_recent.median())

In [15]:
# 2012-14: median composite of the masked Landsat collection over the middle
# survey locations, with latitude/longitude bands attached.
roi_middle = dhs_middle.geometry()
srcoll_middle = (
    ee_utils.LandsatSR(roi_middle, '2012-1-1', '2014-12-31')
    .merged
    .map(ee_utils.mask_qaclear)
    .select(selbands)
)
srmedian_middle = ee_utils.add_latlon(srcoll_middle.median())

In [16]:
# 2009-11: median composite of the masked Landsat collection over the oldest
# survey locations, with latitude/longitude bands attached.
roi_oldest = dhs_oldest.geometry()
srcoll_oldest = (
    ee_utils.LandsatSR(roi_oldest, '2009-1-1', '2011-12-31')
    .merged
    .map(ee_utils.mask_qaclear)
    .select(selbands)
)
srmedian_oldest = ee_utils.add_latlon(srcoll_oldest.median())

## Add nightlight

In [17]:
# viirs = ee.ImageCollection("NOAA/VIIRS/DNB/MONTHLY_V1/VCMSLCFG")
# dmsp = ee.ImageCollection("NOAA/DMSP-OLS/CALIBRATED_LIGHTS_V4")

In [18]:
# nlband = ['NIGHTLIGHTS']
# viirs_recent = viirs.filterDate('2015-1-1', '2017-12-31').median().select([0],nlband)
# viirs_mid = viirs.filterDate('2012-1-1', '2014-12-31').median().select([0],nlband)
# dmsp_oldest = dmsp.filterDate('2009-1-1', '2011-12-31').median().select([0],nlband)

In [19]:
# srmedian_recent = srmedian_recent.addBands(viirs_recent.reproject('EPSG:3857', None, 30))
# srmedian_middle = srmedian_middle.addBands(viirs_mid.reproject('EPSG:3857', None, 30))
# srmedian_oldest = srmedian_oldest.addBands(dmsp_oldest.reproject('EPSG:3857', None, 30))

In [20]:
# Append the nightlights composite to each period's image. The argument is the
# first year of the corresponding Landsat date range used above.
# NOTE(review): each line reassigns its own input, so re-running this cell
# calls addBands again on an image that already has the band — restart and run
# from the composite cells instead of re-running this cell in place.
srmedian_recent = srmedian_recent.addBands(ee_utils.composite_nl(2015))
srmedian_middle = srmedian_middle.addBands(ee_utils.composite_nl(2012))
srmedian_oldest = srmedian_oldest.addBands(ee_utils.composite_nl(2009))

## Add topography

In [21]:
# Elevation source image (Earth Engine asset USGS/SRTMGL1_003), used to derive
# the terrain bands below.
dem = ee.Image("USGS/SRTMGL1_003")

In [22]:
# Output names for the terrain bands, in the same order as the source bands
# selected on the next line.
tbands = ['ELEV','SLO', 'ASP']
# Run the Terrain algorithm on the DEM and keep elevation, slope and aspect,
# renamed to ELEV, SLO and ASP respectively.
topogr = ee.Algorithms.Terrain(dem).select(['elevation', 'slope', 'aspect'], tbands)

In [23]:
# Append the same terrain bands (ELEV, SLO, ASP) to every period's composite.
# NOTE(review): like the nightlights cell, this reassigns its own inputs, so it
# should be run exactly once per kernel session.
srmedian_recent = srmedian_recent.addBands(topogr)
srmedian_middle = srmedian_middle.addBands(topogr)
srmedian_oldest = srmedian_oldest.addBands(topogr)

## Export TF Records

In [35]:
# Landsat band names kept in the exported composite.
MS_BANDS = ['BLUE', 'GREEN', 'RED', 'NIR', 'SWIR1', 'SWIR2', 'TEMP1']

# --- image export parameters ---
# Map projection, see https://epsg.io/3857
PROJECTION = 'EPSG:3857'
# Export resolution: 30m/px
SCALE = 30
# Image dimension = (2*EXPORT_TILE_RADIUS) + 1 = 255px
EXPORT_TILE_RADIUS = 127

def export_images(
        df: pd.DataFrame,
        country: str,
        year: int,
        export_folder: str,
        fname: str) -> dict:
    '''Builds the composite image for one survey and requests its patch export.

    Args
    - df: pd.DataFrame, contains columns ['LATNUM', 'LONGNUM', 'country', 'year']
    - country: str, together with `year` determines the survey to export
    - year: int, together with `country` determines the survey to export
    - export_folder: str, name of folder for export (passed on as `prefix`)
    - fname: str, file name passed on to the export

    Returns: dict, maps task name tuple (export_folder, country, year, chunk)
        to the value returned by ee_utils.get_array_patches (presumably an
        ee.batch.Task - confirm against ee_utils). The export is not chunked,
        so `chunk` is always 0. An empty dict is returned when no row of `df`
        matches (country, year).
    '''
    subset_df = df[(df['country'] == country) & (df['year'] == year)].reset_index(drop=True)
    if subset_df.empty:
        # Nothing to export; avoid building an export over zero points.
        return {}

    start_date, end_date = ee_utils.surveyyear_to_range(year)
    fc = ee_utils.df_to_fc(subset_df, lat_colname='LATNUM', lon_colname='LONGNUM')

    # median Landsat composite over the date range derived from `year`
    roi = fc.geometry()
    srcoll = ee_utils.LandsatSR(roi, start_date=start_date, end_date=end_date).merged
    srcoll = srcoll.map(ee_utils.mask_qaclear).select(MS_BANDS)
    composite = srcoll.median()

    # add latitude/longitude and nightlights bands; the nightlights composite
    # follows the requested survey year instead of a fixed 2015
    composite = ee_utils.add_latlon(composite)
    composite = composite.addBands(ee_utils.composite_nl(year))

    task = ee_utils.get_array_patches(
        img=composite, scale=SCALE, ksize=EXPORT_TILE_RADIUS,
        points=fc, export='gcs',
        prefix=export_folder, fname=fname,
        bucket=GCS_BUCKET)

    # The key no longer depends on a loop variable leaked from the calling cell.
    return {(export_folder, country, year, 0): task}

In [36]:
# Read the survey CSV locally; export_images filters it by country and year.
dhs_df = pd.read_csv(CSV_INPUT_PATH, float_precision='high', index_col=False)
# Collected results of export_images, keyed by the tuples it returns.
tasks = {}
countries = dhs_recent.distinct('country').aggregate_array('country').getInfo()
# NOTE(review): countries[3:4] restricts the run to the 4th country only
# (Kenya, per the saved output below) — looks like a debugging leftover; use
# the full list to export every country.
for i in countries[3:4]:
    fname = 'lx_median_2015-17_'+i+file_suffix
    # NOTE(review): year=2015 makes export_images keep only rows whose 'year'
    # is exactly 2015, although the file name advertises 2015-17 — confirm.
    new_tasks = export_images(
        df=dhs_df, country=i, year=2015,
        export_folder=GCS_FILE_PREFIX, fname=fname)
    tasks.update(new_tasks)

   cluster   svyid  wealthpooled  wealthpooled5country    wealth iso3 hv000  \
0        1  KE2015      1.200159              0.916755  0.660712  KEN   KE6   
1        2  KE2015      1.246541              0.966197  1.149503  KEN   KE6   
2        3  KE2015      1.549552              1.266355  1.369633  KEN   KE6   
3        4  KE2015      1.760900              1.480064  1.592286  KEN   KE6   
4        5  KE2015      1.190505              0.905504  0.817211  KEN   KE6   

   year cname country          region  iso3n  households    LATNUM    LONGNUM  \
0  2015    KE   Kenya  Eastern Africa    404          29 -1.262753  36.757369   
1  2015    KE   Kenya  Eastern Africa    404          17 -1.338937  36.759522   
2  2015    KE   Kenya  Eastern Africa    404          24 -1.270583  36.963737   
3  2015    KE   Kenya  Eastern Africa    404          25 -1.322515  36.903702   
4  2015    KE   Kenya  Eastern Africa    404          28 -1.318353  36.835251   

  URBAN_RURA  
0          U  
1       

In [24]:
# Check bands added successfully
# NOTE(review): the saved output below lists only LON, LAT, NIGHTLIGHTS and the
# terrain bands — none of the Landsat bands in `selbands`. Verify that the
# Landsat composite is not empty before exporting.
print(srmedian_recent.bandNames().getInfo())

['LON', 'LAT', 'NIGHTLIGHTS', 'ELEV', 'SLO', 'ASP']


In [25]:
# Property names of the first recent-period feature; added to the export
# selectors in the cells below.
dhsinfo = dhs_recent.first().propertyNames().getInfo()

In [26]:
# Export patches of the 2015-17 composite around the survey locations of the
# selected countries.
countries = (
    dhs_recent
    .distinct('country')
    .aggregate_array('country')
    .getInfo()
)
for i in countries[3:4]:
    fname = ''.join(('lx_median_2015-17_', i, file_suffix))
    print(fname)

    seldhs = dhs_recent.filter(ee.Filter.eq('country', i))

    # image bands, survey properties, coordinates and terrain bands to export
    bands = [*selbands, *dhsinfo, 'LAT', 'LON', *tbands]

    test = ee_utils.get_array_patches(
        img=srmedian_recent,
        points=seldhs,
        scale=30,
        ksize=127,
        export='gcs',
        bucket=GCS_BUCKET,
        prefix=GCS_FILE_PREFIX,
        fname=fname,
        selectors=bands)

lx_median_2015-17_Kenya_dhslocs_


In [27]:
# Export patches of the 2012-14 composite around every country's survey
# locations. The previous version called `tf.get_array_patches` and used
# `nlband`, neither of which is defined in this notebook, so it failed on a
# fresh kernel. It now uses the same keyword call as the 2015-17 export.
countries = dhs_middle.distinct('country').aggregate_array('country').getInfo()
for i in countries:
    seldhs = dhs_middle.filter(ee.Filter.eq('country', i))
    fname = 'lx_median_2012-14_'+i+file_suffix
    print(fname)

    # 'NIGHTLIGHTS' is the band name reported by bandNames() after the
    # nightlights composite was added.
    bands = selbands+dhsinfo+['LAT', 'LON']+tbands+['NIGHTLIGHTS']

    test = ee_utils.get_array_patches(
            img=srmedian_middle, scale=30, ksize=127,
            points=seldhs, export='gcs',
            prefix=GCS_FILE_PREFIX, fname=fname,
            selectors=bands,
            bucket=GCS_BUCKET)

lx_median_2012-14_senegal_dhslocs_
lx_median_2012-14_cote_d_ivoire_dhslocs_
lx_median_2012-14_mali_dhslocs_
lx_median_2012-14_benin_dhslocs_
lx_median_2012-14_guinea_dhslocs_
lx_median_2012-14_malawi_dhslocs_
lx_median_2012-14_zambia_dhslocs_
lx_median_2012-14_sierra_leone_dhslocs_
lx_median_2012-14_nigeria_dhslocs_


KeyboardInterrupt: 

In [83]:
# Export patches of the 2009-11 composite around every country's survey
# locations. The previous version called `tf.get_array_patches` and used
# `nlband`, neither of which is defined in this notebook, so it failed on a
# fresh kernel. It now uses the same keyword call as the 2015-17 export.
countries = dhs_oldest.distinct('country').aggregate_array('country').getInfo()
for i in countries:
    seldhs = dhs_oldest.filter(ee.Filter.eq('country', i))
    fname = 'lx_median_2009-11_'+i+file_suffix
    print(fname)

    # 'NIGHTLIGHTS' is the band name reported by bandNames() after the
    # nightlights composite was added.
    bands = selbands+dhsinfo+['LAT', 'LON']+tbands+['NIGHTLIGHTS']

    test = ee_utils.get_array_patches(
            img=srmedian_oldest, scale=30, ksize=127,
            points=seldhs, export='gcs',
            prefix=GCS_FILE_PREFIX, fname=fname,
            selectors=bands,
            bucket=GCS_BUCKET)

lx_median_2009-11_mozambique_dhslocs_
lx_median_2009-11_lesotho_dhslocs_


KeyboardInterrupt: 