In [None]:
!pip -qq install radiant_mlhub
!pip -qq install rasterio

[?25l[K     |████▌                           | 10 kB 24.7 MB/s eta 0:00:01[K     |█████████                       | 20 kB 30.7 MB/s eta 0:00:01[K     |█████████████▌                  | 30 kB 22.2 MB/s eta 0:00:01[K     |██████████████████              | 40 kB 12.7 MB/s eta 0:00:01[K     |██████████████████████▌         | 51 kB 14.9 MB/s eta 0:00:01[K     |███████████████████████████     | 61 kB 5.4 MB/s eta 0:00:01[K     |███████████████████████████████▋| 71 kB 6.2 MB/s eta 0:00:01[K     |████████████████████████████████| 72 kB 981 kB/s 
[?25h[?25l[K     |█████▍                          | 10 kB 33.0 MB/s eta 0:00:01[K     |██████████▊                     | 20 kB 37.3 MB/s eta 0:00:01[K     |████████████████                | 30 kB 46.5 MB/s eta 0:00:01[K     |█████████████████████▍          | 40 kB 53.4 MB/s eta 0:00:01[K     |██████████████████████████▊     | 51 kB 58.4 MB/s eta 0:00:01[K     |████████████████████████████████| 61 kB 7.0 MB/s 
[?25h[?25l

In [None]:
# Required libraries
import os
import tarfile
import json
import pandas as pd
import numpy as np
from pathlib import Path
import shutil
from radiant_mlhub.client import _download as download_file
import rasterio
from rasterio.plot import show
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit

from joblib import Parallel, delayed

# Do not truncate long cell values (e.g. file paths) when pandas displays a dataframe.
pd.set_option('display.max_colwidth', None)
# Placeholder value for the MLHub API key environment variable.
# NOTE(review): presumably only here so radiant_mlhub does not complain about a missing
# key; the archives are fetched from a direct S3 URL further down -- confirm no real key is needed.
os.environ['MLHUB_API_KEY'] = 'N/A'

In [None]:
# Mount Google Drive under /content/drive; the final zip archive is moved there
# at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the current working directory; the relative paths used below resolve against it.
!pwd

/content


In [None]:
# Create the folder that is zipped at the end of the notebook.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir() raises
# FileExistsError as soon as the folder already exists.
os.makedirs('Radiant_Data', exist_ok=True)

## Download Options

By editing the cell below, you can choose which bands of the Sentinel-2 imagery to download and whether or not to download the Sentinel-1 data.

In [None]:
# Sentinel-1 is not toggled in this cell: the download cell sets DOWNLOAD_S1
# separately for every batch.

# Sentinel-2 assets that can be requested, in the order used for the dict keys.
_S2_BANDS = ('B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A',
             'B09', 'B11', 'B12', 'CLM')


def _band_flags(*wanted):
    """Return a {band: bool} dict that is True only for the bands listed in `wanted`."""
    return {band: band in wanted for band in _S2_BANDS}


# The imagery is downloaded in four batches; edit the band names passed to each
# call to change which bands a batch downloads.
DOWNLOAD_BANDS_1 = _band_flags('B01', 'B02', 'B03')
DOWNLOAD_BANDS_2 = _band_flags('B04', 'B05', 'B06')
DOWNLOAD_BANDS_3 = _band_flags('B07', 'B08', 'B8A')
DOWNLOAD_BANDS_4 = _band_flags('B09', 'B11', 'B12', 'CLM')
            

Downloading Datasets and Loading Asset File Paths into a Pandas Dataframe
===

The cells in this notebook will show you how to download all of the datasets for this competition and read the STAC metadata into a pandas dataframe. There will be two dataframes, one for train and one for test, which contain all of the information you will need to filter by datetime, satellite platform, and asset type. Each row of the dataframe also contains the file path of the asset it describes. Assets which have a `None` value in the `datetime` and `satellite_platform` columns are assets which belong to the label item.

In [None]:
%%time
# Download the data in batches to avoid using all RAM
FOLDER_BASE = 'ref_south_africa_crops_competition_v1'
for down_bands, down_s1, file_no in zip([DOWNLOAD_BANDS_1, DOWNLOAD_BANDS_2, DOWNLOAD_BANDS_3, DOWNLOAD_BANDS_4], [True, False, False, False], [1, 2, 3, 4]):
  DOWNLOAD_BANDS = down_bands.copy()
  DOWNLOAD_S1 = down_s1
  def download_archive(archive_name):
      if os.path.exists(archive_name.replace('.tar.gz', '')):
          return
      
      print(f'Downloading {archive_name} ...')
      download_url = f'https://radiant-mlhub.s3.us-west-2.amazonaws.com/archives/{archive_name}'
      download_file(download_url, '.')
      print(f'Extracting {archive_name} ...')
      with tarfile.open(archive_name) as tfile:
          tfile.extractall()
      os.remove(archive_name)

  for split in ['train', 'test']:
      # Download the labels
      labels_archive = f'{FOLDER_BASE}_{split}_labels.tar.gz'
      download_archive(labels_archive)
      
      # Download Sentinel-1 data
      if DOWNLOAD_S1:
          s1_archive = f'{FOLDER_BASE}_{split}_source_s1.tar.gz'
          download_archive(s1_archive)
          

      for band, download in DOWNLOAD_BANDS.items():
          if not download:
              continue
          s2_archive = f'{FOLDER_BASE}_{split}_source_s2_{band}.tar.gz'
          download_archive(s2_archive)
          
  def resolve_path(base, path):
      return Path(os.path.join(base, path)).resolve()
          
  def load_df(collection_id):
      split = collection_id.split('_')[-2]
      collection = json.load(open(f'{collection_id}/collection.json', 'r'))
      rows = []
      item_links = []
      for link in collection['links']:
          if link['rel'] != 'item':
              continue
          item_links.append(link['href'])
          
      for item_link in item_links:
          item_path = f'{collection_id}/{item_link}'
          current_path = os.path.dirname(item_path)
          item = json.load(open(item_path, 'r'))
          tile_id = item['id'].split('_')[-1]
          for asset_key, asset in item['assets'].items():
              rows.append([
                  tile_id,
                  None,
                  None,
                  asset_key,
                  str(resolve_path(current_path, asset['href']))
              ])   
          for link in item['links']:
              if link['rel'] != 'source':
                  continue
              source_item_id = link['href'].split('/')[-2]
              
              if source_item_id.find('_s1_') > 0 and not DOWNLOAD_S1:
                  continue
              elif source_item_id.find('_s1_') > 0:
                  for band in ['VV', 'VH']:
                      asset_path = Path(f'{FOLDER_BASE}_{split}_source_s1/{source_item_id}/{band}.tif').resolve()
                      date = '-'.join(source_item_id.split('_')[10:13])
                      
                      rows.append([
                          tile_id,
                          f'{date}T00:00:00Z',
                          's1',
                          band,
                          asset_path
                      ])
                  
              if source_item_id.find('_s2_') > 0:
                  for band, download in DOWNLOAD_BANDS.items():
                      if not download:
                          continue
                      
                      asset_path = Path(f'{FOLDER_BASE}_{split}_source_s2_{band}/{source_item_id}_{band}.tif').resolve()
                      date = '-'.join(source_item_id.split('_')[10:13])
                      rows.append([
                          tile_id,
                          f'{date}T00:00:00Z',
                          's2',
                          band,
                          asset_path
                      ])
              
      return pd.DataFrame(rows, columns=['tile_id', 'datetime', 'satellite_platform', 'asset', 'file_path'])

  train_df = load_df(f'{FOLDER_BASE}_train_labels')
  test_df = load_df(f'{FOLDER_BASE}_test_labels')
  train_df.to_csv(f'train{file_no}.csv', index = False)
  test_df.to_csv(f'test{file_no}.csv', index = False)

# 

Downloading ref_south_africa_crops_competition_v1_train_labels.tar.gz ...


  0%|          | 0/31.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_labels.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s1.tar.gz ...


  0%|          | 0/5987.8 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s1.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B01.tar.gz ...


  0%|          | 0/1683.3 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B01.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B02.tar.gz ...


  0%|          | 0/5241.7 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B02.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B03.tar.gz ...


  0%|          | 0/5775.1 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B03.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_labels.tar.gz ...


  0%|          | 0/10.9 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_labels.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s1.tar.gz ...


  0%|          | 0/2566.1 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s1.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B01.tar.gz ...


  0%|          | 0/713.1 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B01.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B02.tar.gz ...


  0%|          | 0/2226.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B02.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B03.tar.gz ...


  0%|          | 0/2454.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B03.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B04.tar.gz ...


  0%|          | 0/6363.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B04.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B05.tar.gz ...


  0%|          | 0/4536.5 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B05.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B06.tar.gz ...


  0%|          | 0/4652.6 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B06.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B04.tar.gz ...


  0%|          | 0/2706.0 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B04.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B05.tar.gz ...


  0%|          | 0/1930.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B05.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B06.tar.gz ...


  0%|          | 0/1980.9 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B06.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B07.tar.gz ...


  0%|          | 0/4702.1 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B07.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B08.tar.gz ...


  0%|          | 0/6755.8 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B08.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B8A.tar.gz ...


  0%|          | 0/4704.1 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B8A.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B07.tar.gz ...


  0%|          | 0/2002.0 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B07.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B08.tar.gz ...


  0%|          | 0/2877.3 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B08.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B8A.tar.gz ...


  0%|          | 0/2003.0 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B8A.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B09.tar.gz ...


  0%|          | 0/2136.7 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B09.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B11.tar.gz ...


  0%|          | 0/4588.5 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B11.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B12.tar.gz ...


  0%|          | 0/4525.1 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B12.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_CLM.tar.gz ...


  0%|          | 0/24.3 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_CLM.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B09.tar.gz ...


  0%|          | 0/910.7 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B09.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B11.tar.gz ...


  0%|          | 0/1950.2 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B11.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B12.tar.gz ...


  0%|          | 0/1922.3 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B12.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_CLM.tar.gz ...


  0%|          | 0/10.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_CLM.tar.gz ...
CPU times: user 40min 4s, sys: 16min 14s, total: 56min 19s
Wall time: 45min 22s


In [None]:
# Check files in the Colab working directory.
# Drop Colab's own entries so that only the Radiant data remains in the list.
import glob

# Filtering (instead of list.remove) avoids a ValueError when one of the entries
# is absent, e.g. when Drive is not mounted.
# NOTE(review): `all` shadows the built-in all(); the name is kept because other
# cells may refer to it -- consider renaming it everywhere.
all = [path for path in glob.glob('/content/*')
       if path not in ('/content/drive', '/content/sample_data')]

all

['/content/ref_south_africa_crops_competition_v1_train_labels',
 '/content/train4.csv',
 '/content/ref_south_africa_crops_competition_v1_test_source_s1',
 '/content/train1.csv',
 '/content/test1.csv',
 '/content/ref_south_africa_crops_competition_v1_test_source_s2_B12',
 '/content/ref_south_africa_crops_competition_v1_train_source_s2_B03',
 '/content/test3.csv',
 '/content/ref_south_africa_crops_competition_v1_test_source_s2_B02',
 '/content/ref_south_africa_crops_competition_v1_train_source_s2_B01',
 '/content/ref_south_africa_crops_competition_v1_train_source_s2_B05',
 '/content/ref_south_africa_crops_competition_v1_test_source_s2_B11',
 '/content/train2.csv',
 '/content/ref_south_africa_crops_competition_v1_test_source_s2_B8A',
 '/content/ref_south_africa_crops_competition_v1_test_source_s2_B08',
 '/content/ref_south_africa_crops_competition_v1_train_source_s2_B04',
 '/content/ref_south_africa_crops_competition_v1_test_source_s2_B01',
 '/content/ref_south_africa_crops_competition_v1

In [None]:
# Zip all all data to one file
%%time
shutil.make_archive('/content/Radiant_Data', 'zip', '/content/Radiant_Data')

CPU times: user 55min 39s, sys: 6min 1s, total: 1h 1min 40s
Wall time: 1h 33min 57s


'/content/Radiant_Data.zip'

In [None]:
# Force a garbage-collection pass.
# NOTE(review): presumably meant to free memory after the zip step -- confirm it is needed.
import gc
gc.collect()

In [None]:
# Move downloaded data to gdrive
%%time
!mv '/content/Radiant_Data.zip' '/content/drive/MyDrive/CompeData/Radiant'

CPU times: user 6.14 s, sys: 903 ms, total: 7.04 s
Wall time: 16min 21s
