# How to access open Earth observation training data
### **Assessment One**

# 1. Authentication and API properties

Enter your Radiant MLHub access token and API properties

In [None]:
# only the requests module is required to access the API at this stage
import requests

# Base URL of the Radiant MLHub API. HTTPS is used so that the bearer token
# in the headers below is never sent over an unencrypted connection.
API_BASE = 'https://api.radiant.earth/mlhub/v1'

# copy your access token from dashboard.mlhub.earth and paste it in the following
ACCESS_TOKEN = 'PASTE THE TOKEN FROM YOUR RADIANT MLHUB DASHBOARD HERE'

# these headers will be used in each request
headers = {
    'Authorization': f'Bearer {ACCESS_TOKEN}',
    'Accept': 'application/json',
}

# 2. Get a list of all collections

To find out what training data is available, start by listing the collections exposed through the API.

A collection represents the top-most data level. Typically this means the data comes from the same source for the same geography. It might include different years or sub-geographies.

To see the list, simply run the following cell. The returned list shows the collection id values, collection license, and data source citation (if available)

In [None]:
# get list of all collections
r = requests.get(f'{API_BASE}/collections', headers=headers)
# Fail early with a clear HTTP error (for example when the access token is
# missing or invalid) instead of a confusing KeyError on the lookup below.
r.raise_for_status()
h = r.json()
collections = h['collections']

# print the id, license and citation of every collection; license and
# citation are optional, so fall back to "N/A" when they are absent
for c in collections:
    print(f'ID:       {c["id"]}\nLicense:  {c.get("license", "N/A")}\nCitation: {c.get("sci:citation", "N/A")}\n')

### Retrieve properties of the collection

The code below requests the properties of the specific collection we want to download from, then prints a few of the most important ones.

In [None]:

# paste the id of the collection you are interested in here:
COLLECTION_ID = 'ref_african_crops_uganda_01'

# use these optional parameters to control what items are returned. maximum limit is 10000

limit = 100
bounding_box = []
date_time = []

r = requests.get(f'{API_BASE}/collections/{COLLECTION_ID}', params = {'limit':limit, 'bbox':bounding_box, 'datetime':date_time}, headers = headers)
# Surface HTTP errors (bad token, unknown collection id) before parsing.
r.raise_for_status()
collection = r.json()

print(f'Description: {collection["description"]}')

# License and citation are treated as optional, exactly as in the collection
# listing above, so a collection without them no longer raises KeyError.
print(f'License: {collection.get("license", "N/A")}')

print(f'Citation: {collection.get("sci:citation", "N/A")}')

### Selecting an Item to Download

Let's select just one item.

In [None]:
# The collection-metadata response fetched earlier describes the collection
# itself and carries no 'features'. The items live under the /items endpoint
# (the same endpoint the download step uses), so query that instead.
r = requests.get(
    f'{API_BASE}/collections/{COLLECTION_ID}/items',
    params={'limit': limit, 'bbox': bounding_box, 'datetime': date_time},
    headers=headers,
)
r.raise_for_status()
collection = r.json()

# For demo purposes we only want the first item (None if nothing was returned)
selected_item = next(iter(collection.get('features', [])), None)

# Names of the assets attached to the selected item, or None without an item
assets = None
if selected_item is not None:
    assets = list((selected_item.get('assets') or {}).keys())

# 3. Downloading Assets

We'll need to set up some functions to download items.

In [None]:
# First, we need to install the arrow package which will be required by some 
# functions in the code below. Other packages have already been installed.
!pip install arrow

In [None]:
!pip install boto3

# **Note: Please restart your runtime so that the newly installed libraries are loaded.**

In [None]:
import boto3 # Required to download assets hosted on S3
import os
from urllib.parse import urlparse
import arrow
from multiprocessing.pool import ThreadPool
from tqdm import tqdm

# Module-level S3 client used by download_s3(); created without any explicit
# configuration arguments.
s3 = boto3.client('s3')

def download_s3(uri, path):
    """Download the S3 object referenced by ``uri`` into directory ``path``.

    The bucket name is the URI's network location and the object key is the
    URI's path without its leading slash. The file is stored under the last
    ``/``-separated segment of the key, using the module-level ``s3`` client.
    """
    location = urlparse(uri)
    object_key = location.path[1:]
    filename = object_key.rsplit('/', 1)[-1]
    destination = os.path.join(path, filename)
    s3.download_file(location.netloc, object_key, destination)
    
def download_http(uri, path):
    """Download the resource at ``uri`` over HTTP(S) into directory ``path``.

    The file is named after the last ``/``-separated segment of the URI path.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never written to disk as if it were data.
    """
    filename = urlparse(uri).path.split('/')[-1]
    destination = os.path.join(path, filename)

    # stream=True fetches the body lazily, so large assets are written chunk
    # by chunk instead of being held in memory in full. The context managers
    # guarantee the connection and the file are closed even on failure.
    with requests.get(uri, stream=True) as r:
        r.raise_for_status()
        with open(destination, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512 * 1024):
                if chunk:
                    f.write(chunk)

def get_download_uri(uri):
    """Resolve an asset link to the URI the asset can be downloaded from.

    The link is requested without following redirects. If the response
    carries a ``Location`` header, that target is returned. If the server
    answers successfully without redirecting, ``uri`` itself is returned.

    Raises:
        requests.HTTPError: if the server answers with an error status and
            no redirect target.
    """
    # stream=True avoids downloading a response body we never read here.
    with requests.get(uri, allow_redirects=False, stream=True) as r:
        location = r.headers.get('Location')
        if location:
            return location
        # No redirect target: report HTTP errors explicitly rather than
        # failing with a KeyError on the missing header.
        r.raise_for_status()
        return uri

def download(d):
    """Download one asset described by the pair ``d = (href, path)``.

    ``href`` is resolved to its download URI, which is then fetched into the
    directory ``path`` with the S3 or HTTP(S) downloader depending on the URI
    scheme. URIs with any other scheme are ignored.
    """
    href, path = d[0], d[1]
    download_uri = get_download_uri(href)
    scheme = urlparse(download_uri).scheme

    if scheme == 's3':
        download_s3(download_uri, path)
        return
    if scheme in ('http', 'https'):
        download_http(download_uri, path)
        
def get_source_item_assets(args):
    """Collect the download targets for every asset of one source item.

    Args:
        args: pair ``(path, href)`` where ``path`` is the base output
            directory and ``href`` is the API link of the source item.

    Returns:
        A list of ``(asset_href, asset_path)`` tuples, where ``asset_path``
        is ``path`` joined with the item's datetime formatted as
        ``YYYY_MM_DD`` (the directory is created if needed). An empty list is
        returned when the item could not be requested.
    """
    path, href = args[0], args[1]
    try:
        r = requests.get(href, headers=headers)
    except requests.RequestException:
        # Only network/request failures are tolerated here; a bare except
        # would also swallow KeyboardInterrupt and genuine programming errors.
        print('ERROR: Could Not Load', href)
        return []

    # Parse the response body once and reuse it.
    item = r.json()
    dt = arrow.get(item['properties']['datetime']).format('YYYY_MM_DD')
    asset_path = os.path.join(path, dt)
    # exist_ok avoids a race: this function runs in a thread pool, and two
    # workers may try to create the same date directory at the same time.
    os.makedirs(asset_path, exist_ok=True)

    return [(asset['href'], asset_path) for asset in item['assets'].values()]

def download_source_and_labels(item):
    """Gather the download targets for a label item and its source imagery.

    Creates the directory ``uganda-crop-data/<item id>/`` and resolves every
    link with ``rel == 'source'`` through ``get_source_item_assets`` using
    the module-level thread pool ``p``, which must exist before this is called.

    Args:
        item: a label item (dict) with an ``id`` and optional ``assets`` and
            ``links`` entries.

    Returns:
        A list of lists of ``(href, directory)`` tuples: one inner list per
        source item, plus a final one-element list for the item's ``labels``
        asset when it has one.
    """
    # Tolerate items without assets/labels/links instead of crashing.
    labels = (item.get('assets') or {}).get('labels')
    links = item.get('links') or []

    # Make the directory to download the files to
    path = f'uganda-crop-data/{item["id"]}/'
    os.makedirs(path, exist_ok=True)

    # Only 'source' links point at the source imagery items
    source_items = [(path, link['href']) for link in links if link['rel'] == 'source']

    results = p.map(get_source_item_assets, source_items)
    if labels is not None:
        results.append([(labels['href'], path)])

    return results

def get_items(uri, classes=None, max_items_downloaded=None, items_downloaded=0, downloads=None):
    """Walk a paginated items endpoint and collect download targets.

    Args:
        uri: URL of the first page of items to load.
        classes: optional collection of label classes; when given, only items
            whose ``properties['labels']`` contain at least one of them are
            processed.
        max_items_downloaded: optional cap on the number of matching items.
        items_downloaded: number of items already counted towards the cap.
        downloads: optional list to append results to; a new list is created
            when omitted.

    Returns:
        The ``downloads`` list, extended with the result of
        ``download_source_and_labels`` for every processed item.

    Raises:
        requests.HTTPError: if a page request returns an error status.
    """
    # A mutable default ([]) would be shared between calls and silently
    # accumulate results from earlier invocations.
    if downloads is None:
        downloads = []

    # Pages are followed in a loop rather than recursively, so very long
    # collections cannot hit Python's recursion limit.
    while uri is not None:
        print('Loading', uri, '...')
        r = requests.get(uri, headers=headers)
        r.raise_for_status()
        collection = r.json()

        for feature in collection.get('features', []):
            # Skip items that have none of the label classes we're interested in
            if classes is not None:
                item_labels = feature['properties'].get('labels', [])
                if not any(label_class in classes for label_class in item_labels):
                    continue

            print('Getting Source Imagery Assets for', feature['id'])
            # Collect the label and source imagery downloads for the item
            downloads.extend(download_source_and_labels(feature))

            # Stop once we have reached the maximum we specify
            items_downloaded += 1
            if max_items_downloaded is not None and items_downloaded >= max_items_downloaded:
                return downloads

        # Get the next page of results, if available
        uri = next(
            (
                link['href']
                for link in collection.get('links', [])
                if link['rel'] == 'next' and link['href'] is not None
            ),
            None,
        )

    return downloads

# 4. Download labels and source imagery 

The cell below calls `get_items`, which navigates the API and collects all the download links for the label and source imagery assets. Here we pass the `max_items_downloaded` argument, which limits the number of label items downloaded. Remove this argument to download all of the label items that match the criteria you specify.

In [None]:
# `p` has to be a module-level name because download_source_and_labels()
# looks it up globally. Using the pool as a context manager releases its
# worker threads once all downloads have finished (or if one of them fails).
with ThreadPool(20) as p:
    to_download = get_items(f'{API_BASE}/collections/{COLLECTION_ID}/items?limit=100',
                            max_items_downloaded=1, downloads=[])
    # each entry of to_download is a list of (href, directory) pairs that is
    # downloaded in parallel by the pool
    for d in tqdm(to_download):
        p.map(download, d)