In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local helper code take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Setup 
import os
import urllib.parse
import re
from pathlib import Path
import itertools as it
from functools import partial
from concurrent.futures import ThreadPoolExecutor

from tqdm.notebook import tqdm
from radiant_mlhub import client, get_session

from Helpers import *

# Expose this environment's RADIANT_API_KEY under the name MLHUB_API_KEY
# (presumably the variable radiant_mlhub reads for authentication — see its
# docs). Raises KeyError if RADIANT_API_KEY is not set.
os.environ['MLHUB_API_KEY'] = os.environ['RADIANT_API_KEY']

In [39]:
collection_id = 'ref_landcovernet_v1_labels'

# Fetch the collection record once, then echo its headline metadata as
# "<heading>: <value>" lines.
collection = client.get_collection(collection_id)
metadata_fields = (
    ('Description', 'description'),
    ('License', 'license'),
    ('DOI', 'sci:doi'),
    ('Citation', 'sci:citation'),
)
for heading, field_key in metadata_fields:
    print(f'{heading}: {collection[field_key]}')

Description: LandCoverNet Labels
License: CC-BY-4.0
DOI: 10.34911/rdnt.d2ce8i
Citation: Alemohammad S.H., Ballantyne A., Bromberg Gaber Y., Booth K., Nakanuku-Diggs L., & Miglarese A.H. (2020) "LandCoverNet: A Global Land Cover Classification Training Dataset", Version 1.0, Radiant MLHub. [Date Accessed] https://doi.org/10.34911/rdnt.d2ce8i


In [40]:
# Read the label classes from the first item of the collection.
# NOTE(review): assumes every item advertises the same 'label:classes' list —
# confirm before relying on `all_classes` for other items.
items = client.list_collection_items(collection_id, limit=1)
first_item = next(items)

label_classes = first_item['properties']['label:classes']

# Flat list of every class name; reused later to request one item per class.
all_classes = []
for label_class in label_classes:
    print(f'Classes for {label_class["name"]}')
    class_names = sorted(label_class['classes'])
    for c in class_names:
        # The original literal was "f - {}", which printed a stray leading "f".
        print(f'- {c}')
    all_classes.extend(class_names)

Classes for labels
f - (Semi) Natural Vegetation
f - Artificial Bareground
f - Cultivated Vegetation
f - Natural Bareground
f - No Data
f - Permanent Snow/Ice
f - Water
f - Woody Vegetation


In [37]:
# Matches the URL *path* of a single collection item and captures two groups:
# (collection id, item id). Both segments must consist solely of \w characters.
items_pattern = re.compile(r'^/mlhub/v1/collections/(\w+)/items/(\w+)$')

def filter_item(item, classes=None, cloud_and_shadow=None, seasonal_snow=None):
    """Decide whether ``item`` satisfies every requested criterion.

    A criterion left as ``None`` is not checked.

    Parameters
    ----------
    item : dict
        Item with a ``'properties'`` mapping.
    classes : container of str, optional
        The item passes if at least one entry of its ``'labels'`` property
        is contained in ``classes``.
    cloud_and_shadow : bool, optional
        Required value of the ``'cloud_and_shadow'`` property, which counts
        as true only when it equals the string ``'true'``.
    seasonal_snow : bool, optional
        Required value of the ``'seasonal_snow'`` property, interpreted the
        same way.

    Returns
    -------
    bool
        ``True`` when every requested criterion holds, ``False`` otherwise.
    """
    props = item['properties']

    present_labels = props.get('labels', [])
    if classes is not None:
        if all(label not in classes for label in present_labels):
            return False

    requested_flags = (
        ('cloud_and_shadow', cloud_and_shadow),
        ('seasonal_snow', seasonal_snow),
    )
    for flag_key, wanted in requested_flags:
        # A missing property is treated as the string 'false'.
        actual = props.get(flag_key, 'false') == 'true'
        if wanted is not None and actual != wanted:
            return False

    return True

In [36]:
def get_items(collection_id, classes=None, cloud_and_shadow=None, seasonal_snow=None, max_items=1):
    """Lazily yield up to ``max_items`` items of a collection that pass ``filter_item``.

    Parameters
    ----------
    collection_id : str
        Collection whose items are listed through ``client.list_collection_items``.
    classes, cloud_and_shadow, seasonal_snow
        Forwarded unchanged to ``filter_item``.
    max_items : int
        Upper bound on the number of items yielded.

    Yields
    ------
    Items from the listing, in listing order, for which ``filter_item`` is true.
    """
    keep = partial(
        filter_item,
        classes=classes,
        cloud_and_shadow=cloud_and_shadow,
        seasonal_snow=seasonal_snow,
    )
    listing = client.list_collection_items(collection_id, limit=None)
    matching = (candidate for candidate in listing if keep(candidate))

    # islice stops pulling from the listing once max_items have been produced.
    for selected in it.islice(matching, max_items):
        yield selected

In [35]:
def download(item, asset_key, output_dir='./data'):
    """Download one asset of an item into ``output_dir``.

    The file name is taken from the last path segment of the final
    (post-redirect) URL.

    Parameters
    ----------
    item : dict
        Item with an optional ``'assets'`` mapping of asset key -> asset dict.
    asset_key : str
        Key of the asset to download.
    output_dir : str or path-like
        Directory to write into; created if it does not exist.

    Returns
    -------
    pathlib.Path or None
        Path of the written file, or ``None`` (after printing a message) when
        the asset is missing, has no ``'href'``, or the server answers with an
        HTTP error status.
    """
    asset = item.get('assets', {}).get(asset_key)
    if asset is None:
        print(f'Asset "{asset_key}" does not exist in this item')
        return None

    download_url = asset.get('href')
    if download_url is None:
        print(f'Asset {asset_key} does not have an "href" property, cannot download.')
        return None

    session = get_session()
    r = session.get(download_url, allow_redirects=True, stream=True)
    try:
        # Never save an error page to disk as if it were the asset.
        if not r.ok:
            print(f'Asset {asset_key} could not be downloaded (HTTP {r.status_code}).')
            return None

        filename = urllib.parse.urlsplit(r.url).path.split('/')[-1]
        target_dir = Path(output_dir)
        # exist_ok keeps this safe when several worker threads share a directory.
        target_dir.mkdir(parents=True, exist_ok=True)
        output_path = target_dir / filename

        with output_path.open('wb') as dst:
            for chunk in r.iter_content(chunk_size=512 * 1024):
                if chunk:
                    dst.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        r.close()

    return output_path

In [34]:
def download_labels_and_source(item, assets=None, output_dir='./data'):
    """Download a label item's assets plus the assets of its linked source items.

    Files are written beneath ``<output_dir>/<item id>/``: assets of ``item``
    go to ``labels/`` and assets of every item linked with ``rel == 'source'``
    go to ``source/``.

    Parameters
    ----------
    item : dict
        Label item providing ``'id'`` and ``'links'`` and, optionally, ``'assets'``.
    assets : container of str, optional
        Asset keys to download; ``None`` downloads every asset.
    output_dir : str or path-like
        Root directory for the downloads.

    Raises
    ------
    ValueError
        If a source link's URL path does not match ``items_pattern``.
    """
    download_dir = Path(output_dir) / item['id']
    download_dir.mkdir(parents=True, exist_ok=True)

    labels_download_dir = download_dir / 'labels'
    labels_download_dir.mkdir(exist_ok=True)

    source_links = [link for link in item['links'] if link['rel'] == 'source']

    # Create the shared source directory once, not once per worker call.
    source_download_dir = download_dir / 'source'
    if source_links:
        source_download_dir.mkdir(exist_ok=True)

    def _matching_assets(stac_item):
        # Asset keys of stac_item selected by the `assets` filter.
        return [
            asset
            for asset in stac_item.get('assets', {})
            if assets is None or asset in assets
        ]

    def _get_download_args(link):
        # Resolve one source link into (item, asset key, directory) triplets.
        source_item_path = urllib.parse.urlsplit(link['href']).path
        match = items_pattern.fullmatch(source_item_path)
        if match is None:
            raise ValueError(
                f'Source link {link["href"]!r} does not point at a collection item'
            )
        source_item_collection, source_item_id = match.groups()
        source_item = client.get_collection_item(source_item_collection, source_item_id)
        return [
            (source_item, asset, source_download_dir)
            for asset in _matching_assets(source_item)
        ]

    download_args = [
        (item, asset, labels_download_dir)
        for asset in _matching_assets(item)
    ]

    # Each source item needs its own API lookup, so resolve them concurrently.
    with ThreadPoolExecutor(max_workers=16) as executor:
        for argument_batch in executor.map(_get_download_args, source_links):
            download_args += argument_batch

    print(f'Downloading {len(download_args)} assets...')
    with ThreadPoolExecutor(max_workers=16) as executor:
        with tqdm(total=len(download_args)) as pbar:
            for _ in executor.map(lambda triplet: download(*triplet), download_args):
                pbar.update(1)

In [22]:
# One lazy item generator per class: each yields at most one item whose
# labels include that class.
trainItems = [
    get_items(collection_id, classes=[class_name], max_items=1)
    for class_name in all_classes
]

train_assets = ['labels', 'B02', 'B03', 'B04', 'B08', 'source_dates', 'CLD', 'SCL']

# Walk the generators one after another and download every item they produce.
for train_item in it.chain.from_iterable(trainItems):
    download_labels_and_source(train_item, assets=train_assets, output_dir='./data/train')

Downloading 440 assets...


  0%|          | 0/440 [00:00<?, ?it/s]

Downloading 428 assets...


  0%|          | 0/428 [00:00<?, ?it/s]

Downloading 428 assets...


  0%|          | 0/428 [00:00<?, ?it/s]

Downloading 434 assets...


  0%|          | 0/434 [00:00<?, ?it/s]

Downloading 446 assets...


  0%|          | 0/446 [00:00<?, ?it/s]

Downloading 440 assets...


  0%|          | 0/440 [00:00<?, ?it/s]

Downloading 440 assets...


  0%|          | 0/440 [00:00<?, ?it/s]

In [69]:
from Helpers import FindTilesAndChips
# Accumulate, per class label, the counts that CountClassLabels reports for
# every (tile, chip, date) combination found under the training directory.
# NOTE(review): the recorded output shows 'Water': 0 although an item was
# requested for every class — confirm ClassIntMap matches the label encoding.
train_prefix = 'data/train'
tcs = FindTilesAndChips(train_prefix)
allCounts = dict.fromkeys(ClassIntMap.values(), 0)
for tile, chip in tcs:
    chip_dates = DateCSVParser(tile, chip, prefix=train_prefix).GetDates()
    for _index, date in chip_dates:
        label_totals = CountClassLabels(tile, chip, date, prefix=train_prefix)
        for label, num in label_totals:
            allCounts[label] += num
print(allCounts)

{'Water': 0, 'Artificial Bareground': 89737, 'Natural Bareground': 26148, 'Permanent Snow/Ice': 1625798, 'Woody Vegetation': 5141368, 'Cultivated Vegetation': 940987, '(Semi) Natural Vegetation': 141605, 'No Data': 437853}


In [47]:
# Select the test item(s). No class filter is applied (classes=None), so this
# takes the first item(s) the collection listing yields.
# NOTE(review): such an item may coincide with one already downloaded into
# ./data/train — confirm the train/test data are disjoint.
testItems = get_items(
    collection_id,
    classes=None,
    max_items=1
)
# The loop variable `item` stays bound after the loop; the inspection cell
# further down reads it.
for item in testItems:
    download_labels_and_source(item, assets=['labels','B02','B03','B04','B08','source_dates','CLD','SCL'], output_dir='./data/test')

Downloading 440 assets...


  0%|          | 0/440 [00:00<?, ?it/s]

Downloading 428 assets...


  0%|          | 0/428 [00:00<?, ?it/s]

In [67]:
# Inspect `item`, the variable left bound by the most recent download loop:
# first its top-level keys, then a few selected fields.
print(item.keys())
for field_name in ('collection', 'id', 'properties'):
    print(item[field_name])

dict_keys(['assets', 'bbox', 'collection', 'geometry', 'id', 'links', 'properties', 'stac_extensions', 'stac_version', 'type'])
ref_landcovernet_v1_labels
ref_landcovernet_v1_labels_29PKL_10
{'datetime': '2018-07-01T00:00:00Z', 'label:classes': [{'classes': ['No Data', 'Water', 'Artificial Bareground', 'Natural Bareground', 'Permanent Snow/Ice', 'Woody Vegetation', 'Cultivated Vegetation', '(Semi) Natural Vegetation'], 'name': 'labels'}], 'label:description': 'Land Cover Type Classification', 'label:properties': ['labels'], 'label:type': 'raster', 'labels': ['Water', 'Artificial Bareground', 'Natural Bareground', 'Woody Vegetation', 'Cultivated Vegetation', '(Semi) Natural Vegetation'], 'validation:attemptedExtensions': ['label'], 'validation:errors': []}
