# Add new labeled data 🛰️

TODO: Generate url from config
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nasaharvest/openmapflow/blob/main/crop-mask-example/notebooks/new_data.ipynb)

**Description:** Standalone notebook for adding new training and evaluation data.

# 1. Setup

If you don't already have one, obtain a GitHub Personal Access Token using the steps [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token). Save this token somewhere private.

In [1]:
try:
    from google.colab import auth
    from google.colab import files
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    email = input("Github email: ")
    username = input("Github username: ")

    !git config --global user.email $username
    !git config --global user.name $email

    from getpass import getpass
    token = getpass('Github Personal Access Token:')

    # TODO: Generate below two lines from config
    !git clone https://$username:$token@github.com/nasaharvest/openmapflow.git
    !cd openmapflow && pip install -r requirements.txt -q
    %cd openmapflow/crop-mask-example
else:
    print("Running notebook outside Google Colab. Assuming in local repository.")
    !cd ../.. && pip install -r requirements.txt -q
    !pip install earthengine-api google-auth -q
    %cd ..

Running notebook outside Google Colab. Assuming in local repository.
You should consider upgrading via the '/Users/izvonkov/nasaharvest/openmapflow/venv/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/izvonkov/nasaharvest/openmapflow/venv/bin/python -m pip install --upgrade pip' command.[0m
/Users/izvonkov/nasaharvest/openmapflow/crop-mask-example


In [2]:
from pathlib import Path
from importlib import reload
from ipywidgets import Box
from tqdm.notebook import tqdm

import ipywidgets as widgets
import os
import sys

# Needed when openmapflow installed locally
sys.path.append("..")

# Generate import statements
import datasets as ds
from openmapflow.main import create_features
# RELATIVE_PATHS is used by the dvc / git cells further down; without this
# import they raise NameError on a fresh kernel.
# NOTE(review): assumes openmapflow.config exports RELATIVE_PATHS alongside
# FULL_PATHS - confirm.
from openmapflow.config import FULL_PATHS, RELATIVE_PATHS

In [3]:
# Stack the widgets vertically.
box_layout = widgets.Layout(flex_flow='column')

# Radio buttons choosing between the two ways of using this notebook.
options = ["Add new labels", "Check progress of previously uploaded labels"]
use = widgets.RadioButtons(
    options=options,
    style= {'description_width': 'initial'},
    value=options[0],
    description='',
    disabled=False
)

# Collect the remote branch names for the "existing branch" dropdown.
# The pipe is closed explicitly once the output has been read.
with os.popen('git branch -r') as git_branch_output:
    remote_branch_lines = git_branch_output.read().split("\n")

branches_available = []
for branch in remote_branch_lines:
    branch = branch.strip()
    # Skip blank lines and symbolic refs such as "origin/HEAD -> origin/main":
    # they are not branches, and the "->" would be interpreted by the shell
    # as an output redirect when passed to `git checkout` later on.
    if branch == "" or "->" in branch:
        continue
    branches_available.append(branch.replace("origin/", ""))

new_branch = widgets.Text(description='Enter a new branch name',
                        style={'description_width': 'initial'})
existing_branch = widgets.Dropdown(options=branches_available, 
                              description="Branch with existing labels",
                              style={'description_width': 'initial'})
# The first radio option ("Add new labels") is selected initially, so the
# existing-branch dropdown starts hidden.
existing_branch.layout.visibility = "hidden"

def change_visibility(event):
    """Show the input that matches the selected radio option.

    Index 0 ("Add new labels") shows the new-branch text box; any other index
    shows the existing-branch dropdown instead.

    Events whose "new" value does not carry an "index" entry are ignored.
    NOTE(review): the observer is registered without a trait filter, so it
    presumably receives many unrelated change events - confirm.
    """
    try:
        i = event["new"]["index"]
    except (KeyError, TypeError):
        # Not an index update (missing key, or "new" is not subscriptable by name).
        return
    show_new = i == 0
    existing_branch.layout.visibility = "hidden" if show_new else "visible" 
    new_branch.layout.display = "block" if show_new else "none"

use.observe(change_visibility)
Box(children=[use, new_branch, existing_branch], layout=box_layout)

Box(children=(RadioButtons(options=('Add new labels', 'Check progress of previously uploaded labels'), style=D…

In [4]:
checking_progress_only = new_branch.value == ""
if checking_progress_only:
    !git checkout {existing_branch.value}
    !git pull
else:
    !git checkout -b'{new_branch.value}'

M	crop-mask-example/notebooks/new_data.ipynb
M	openmapflow/main.py
Already on 'main'
Your branch is up to date with 'origin/main'.
Already up to date.


# 2. Download latest data
Data is stored in remote storage (i.e. Google Drive), so authentication is necessary.

In [None]:
for path_key in tqdm(["raw", "processed", "compressed_features"]):
    !dvc pull {RELATIVE_PATHS[path_key]} -q -f

!tar -xzf {RELATIVE_PATHS["compressed_features"]} -C data

# 3. Upload labels

In [5]:
if checking_progress_only:
    print("Checking progress only, skipping this cell.")
else:
    # Keep asking until the name is non-empty and does not collide with an
    # existing, non-empty dataset directory.
    dataset_name = input("Dataset name (suggested format: <Country_Region_Year>): ").strip()
    while True:
        dataset_dir = FULL_PATHS["raw"] / dataset_name
        if not dataset_name:
            # An empty name would resolve to the raw directory itself.
            dataset_name = input("Dataset name cannot be empty, try again: ").strip()
        elif dataset_dir.exists() and any(dataset_dir.iterdir()):
            dataset_name = input("Dataset name already exists, try a different name: ").strip()
        else:
            # parents=True so this also works when the raw directory does not exist yet.
            dataset_dir.mkdir(parents=True, exist_ok=True)
            break

    print("--------------------------------------------------")
    print(f"Dataset: {dataset_name} directory created")
    print("---------------------------------------------------")

    if IN_COLAB:
        # Colab: upload through the browser, then move every uploaded file
        # from the working directory into the dataset directory.
        uploaded = files.upload()

        for file_name in uploaded.keys():
            Path(file_name).rename(dataset_dir / file_name)
    else:
        # Local run: the user copies the files in manually.
        print(f"Please add file(s) into {dataset_dir}")

Checking progress only, skipping this cell.


# 4. Create features

TODO: Update screenshot

If you just uploaded new labels, open datasets_labeled.py and add a `LabeledDataset` object similar to the ones that already exist.

If you are checking progress, scroll on.

In [6]:
# When new labels were added, re-import the datasets module and verify that
# it now defines more datasets than it did before the reload.
if not checking_progress_only:
    n_before = len(ds.datasets)
    print(f"Datasets before: {n_before}")

    reload(ds)  # pick up edits made to the module on disk

    n_after = len(ds.datasets)
    print(f"Datasets after: {n_after}")

    assert n_after > n_before, "The datasets_labeled.py was not updated."
    print("Dataset addition successful!")
else:
    print("Checking progress only, skipping this cell.")

Checking progress only, skipping this cell.


<img src="https://storage.googleapis.com/harvest-public-assets/openmapflow/new_data.png"/>


`create_features` creates features from labels and earth observation data.

It first checks whether the necessary earth observation data is already available in Cloud Storage, or whether an Earth Engine task for it is already active. Google Cloud and Earth Engine authentication is therefore needed.

In [None]:
# TODO figure out public bucket permissions
# Login to earthengine
!earthengine authenticate

if IN_COLAB:
    # Authenticate Google, to access geotifs
    auth.authenticate_user()
    
# TODO: check if alternative authentication is needed for local

In [7]:
# Create / check progress of feature creation
# Runs create_features over every dataset defined in the `datasets` module
# (imported as `ds`). The same call serves both modes of this notebook:
# creating features for newly added labels and checking on earlier uploads.
create_features(ds.datasets)

------------------------------
geowiki_landcover_2017
------------------------------
Togo_2019
------------------------------
Loading all features...
✔ Found no empty features
✔ No duplicates found
Compressing features...


In [8]:
# Changes since running the last cell
# Shows the uncommitted git diff for the path stored under RELATIVE_PATHS["datasets"].
# NOTE(review): presumably create_features updates this file - confirm.
!git diff {RELATIVE_PATHS["datasets"]}

# 5. Pushing the new data to the repository

In [9]:
for path_key in tqdm(["raw", "processed", "compressed_features"]):
    !dvc commit {RELATIVE_PATHS[path_key]} -f -q
!dvc push

  0%|          | 0/3 [00:00<?, ?it/s]

  0% Transferring|                                   |0/1 [00:00<?,     ?file/s]
![A
  0%|          |1e54e8d431e7b8db8cb196f2bfd223     0.00/? [00:00<?,        ?B/s][A
  0%|          |1e54e8d431e7b8db8cb196f2bfd223 0.00/52.0M [00:00<?,        ?B/s][A
  0%|          |1e54e8d431e7b8db8cb196f28.00k/52.0M [00:01<2:30:18,    6.05kB/s][A
  5%|▌         |1e54e8d431e7b8db8cb196f2bf2.65M/52.0M [00:01<00:19,    2.64MB/s][A
 13%|█▎        |1e54e8d431e7b8db8cb196f2bf6.62M/52.0M [00:01<00:06,    7.31MB/s][A
 14%|█▍        |1e54e8d431e7b8db8cb196f2bf7.20M/52.0M [00:01<00:08,    5.79MB/s][A
 23%|██▎       |1e54e8d431e7b8db8cb196f2bf11.9M/52.0M [00:01<00:03,    12.1MB/s][A
 26%|██▌       |1e54e8d431e7b8db8cb196f2bf13.6M/52.0M [00:02<00:03,    12.8MB/s][A
 28%|██▊       |1e54e8d431e7b8db8cb196f2bf14.7M/52.0M [00:02<00:03,    10.5MB/s][A
 35%|███▌      |1e54e8d431e7b8db8cb196f2bf18.4M/52.0M [00:02<00:02,    15.7MB/s][A
 38%|███▊      |1e54e8d431e7b8db8cb196f2bf19.5M/52.0M [00:02<00:02,    11.

In [None]:
# Push changes to github
commit_message = input("Commit message: ")
!git add .
!git commit -m '{commit_message}'
!git push 

Create a Pull Request so the data can be merged into the main branch.