### Create Labelbox Dataset

This notebook finds RGB images in the GCP storage bucket. For each geographical location where images exist, it creates a Labelbox dataset and populates that dataset with pointers to the images in the GCP bucket.

In [1]:
import os, sys
sys.path.insert(0, os.path.abspath('..'))

%load_ext autoreload
%autoreload 2

In [2]:
from google.cloud import storage
from labelbox import Client as LabelboxClient
from labelbox import Dataset


from utils import gcp_utils
from utils.labelbox_utils import create_new_dataset, create_data_row_dict



In [3]:
# Load environment variables from .env.
# Alternatively, manually set environment variables.

# from dotenv import load_dotenv
# load_dotenv()

In [4]:
# Project-level defaults; BUCKET_NAME may be reassigned further down in the notebook.
from project_config import GCP_PROJECT_NAME, BUCKET_NAME

# Read the Labelbox API key from the environment (os.getenv yields None when it is unset).
LABELBOX_API_KEY = os.getenv('LABELBOX_API_KEY')

# Clients shared by the cells below: one for Google Cloud Storage, one for Labelbox.
gcp_client = storage.Client(project=GCP_PROJECT_NAME)
labelbox_client = LabelboxClient(api_key=LABELBOX_API_KEY)

## Override the bucket name if it differs from the default

In [5]:
# Replaces the BUCKET_NAME value imported from project_config for all following cells.
BUCKET_NAME = "sand_mining_median"
# Folder inside the bucket that is searched for location subfolders and that
# prefixes each location's image path.
SUBFOLDER = "labels"

In [13]:
# Ask the project helper for the subfolders of SUBFOLDER in the bucket.
# NOTE(review): judging by the printed output, each entry is one location name - confirm in gcp_utils.
all_label_locations = gcp_utils.list_subfolders(client=gcp_client, folder_name=SUBFOLDER, bucket_name=BUCKET_NAME)
print(all_label_locations)

['Kathajodi_Cuttack_85-85_20-44_median', 'Ken_Banda_80-35_25-68_median', 'Narmada_Sehore_77-32_22-56_median', 'Tawa_Hoshangabad_77-80_22-74_median', 'Sone_Rohtas_84-21_24-91_median', 'Chambal_More_77-86_26-61_median', 'Betwa_Hamirpur_79-81_25-91_median']


Choose the locations for which to push images to Labelbox

In [17]:
# Start from every discovered location and leave out any whose name contains "test".
label_locations = [name for name in all_label_locations if "test" not in name]

# To push a single location instead, override the selection, e.g.:
# label_locations = ['Tawa_Hoshangabad_77-80_22-74_median']

print(label_locations)

['Kathajodi_Cuttack_85-85_20-44_median', 'Ken_Banda_80-35_25-68_median', 'Narmada_Sehore_77-32_22-56_median', 'Tawa_Hoshangabad_77-80_22-74_median', 'Sone_Rohtas_84-21_24-91_median', 'Chambal_More_77-86_26-61_median', 'Betwa_Hamirpur_79-81_25-91_median']


## Create Labelbox Datasets for each folder

In [28]:
# Every selected location must be one of the locations discovered in the bucket.
assert set(label_locations) <= set(all_label_locations)

# Handle to the storage bucket whose blobs are listed when building data rows.
bucket = gcp_client.bucket(BUCKET_NAME)

def _global_key_for_blob(blob_name, key_postfix=""):
    """Derive the Labelbox global key for an image blob.

    The key is the blob's file name (the text after the last "/") with only its
    final extension removed, followed by "_<key_postfix>" when a postfix is given.
    Dots elsewhere in the file name are preserved, so names that contain dots
    (for example decimal coordinates) are not truncated.
    """
    file_name = blob_name.rsplit("/", 1)[-1]
    global_key, _extension = os.path.splitext(file_name)
    if key_postfix:
        global_key = f"{global_key}_{key_postfix}"
    return global_key


def create_labelbox_dataset_for_location(location, folder_name=None, key_postfix=""):
    """Create a Labelbox dataset named after ``location`` from its RGB images.

    Lists the blobs stored under ``<folder_name>/<location>/rgb/`` (or
    ``<location>/rgb/`` when ``folder_name`` is None) in the module-level
    ``bucket``, builds one data row per image, and uploads the rows to a new
    dataset created through ``create_new_dataset``.

    Args:
        location: Name of the location subfolder; also used as the dataset name.
        folder_name: Optional parent folder that contains the location folders.
        key_postfix: Optional suffix appended to every global key as
            "_<key_postfix>".

    Returns:
        None. Progress and any upload errors are printed. When no images are
        found, no dataset is created.
    """
    if folder_name is None:
        folder_name = f"{location}/rgb"
    else:
        folder_name = f"{folder_name}/{location}/rgb"

    print("--- Folder:", folder_name)

    # List with a trailing slash so that sibling folders sharing the same
    # leading characters (e.g. "rgb_old/") are not picked up by the prefix match.
    prefix = folder_name.rstrip("/") + "/"

    data_rows = []
    for rgb_image_blob in bucket.list_blobs(prefix=prefix):
        # Zero-byte "folder" placeholder objects end with "/" and are not images.
        if rgb_image_blob.name.endswith("/"):
            continue

        public_image_url = gcp_utils.get_public_url(rgb_image_blob.name, bucket_name=BUCKET_NAME)
        global_key = _global_key_for_blob(rgb_image_blob.name, key_postfix)
        print(global_key)

        data_rows.append(
            create_data_row_dict(img_url=public_image_url, global_key=global_key)
        )

    # Nothing to upload: avoid leaving an empty dataset behind in Labelbox.
    if not data_rows:
        print(f"No images found under {prefix}; skipping dataset creation.")
        return

    print ("Generating dataset in LabelBox")

    dataset: Dataset = create_new_dataset(labelbox_client, dataset_name=location)
    print(f"Creating {len(data_rows)} data rows in dataset {dataset.name}")
    task = dataset.create_data_rows(data_rows)
    task.wait_till_done()
    if task.errors:
        print(f"Finished with error: {task.errors}")
    else:
        print("Finished without error.")

# Build one Labelbox dataset per selected location, reading images from under
# SUBFOLDER and appending "_median" to every global key.
for location in label_locations:
    print(f"--- Location: {location}")
    create_labelbox_dataset_for_location(location, folder_name=SUBFOLDER, key_postfix="median")

--- Location: Kathajodi_Cuttack_85-85_20-44_median
--- Folder: labels/Kathajodi_Cuttack_85-85_20-44_median/rgb
Kathajodi_Cuttack_85-85_20-44_2022-01-01_rgb_median
Kathajodi_Cuttack_85-85_20-44_2022-02-01_rgb_median
Kathajodi_Cuttack_85-85_20-44_2022-03-01_rgb_median
Kathajodi_Cuttack_85-85_20-44_2022-04-01_rgb_median
Kathajodi_Cuttack_85-85_20-44_2022-05-01_rgb_median
Kathajodi_Cuttack_85-85_20-44_2022-09-01_rgb_median
Kathajodi_Cuttack_85-85_20-44_2022-10-01_rgb_median
Kathajodi_Cuttack_85-85_20-44_2022-11-01_rgb_median
Generating dataset in LabelBox
Creating 8 data rows in dataset Kathajodi_Cuttack_85-85_20-44_median
Finished without error.
--- Location: Ken_Banda_80-35_25-68_median
--- Folder: labels/Ken_Banda_80-35_25-68_median/rgb
Ken_Banda_80-35_25-68_2022-01-01_rgb_median
Ken_Banda_80-35_25-68_2022-02-01_rgb_median
Ken_Banda_80-35_25-68_2022-03-01_rgb_median
Ken_Banda_80-35_25-68_2022-04-01_rgb_median
Ken_Banda_80-35_25-68_2022-05-01_rgb_median
Ken_Banda_80-35_25-68_2022-06-01_r

## Deleting unused datasets
Use this section to selectively delete datasets that are no longer needed.

**!USE WITH CAUTION!**

In [20]:
# Dry-run switch: leave False to only list the matching datasets;
# set to True to actually delete them from Labelbox.
DELETE_DATASETS = False

# Select every dataset whose name contains 'median' or 'test'.
# To target a single dataset by exact name, filter on the server side instead:
# datasets = list(labelbox_client.get_datasets(where=(Dataset.name=="Dummy dataset")))
datasets = [
    ds for ds in labelbox_client.get_datasets()
    if 'median' in ds.name or 'test' in ds.name
]

for ds in datasets:
    if DELETE_DATASETS:
        print("Deleting: ", ds.name, ds.uid)
        ds.delete()
    else:
        # Nothing is removed in dry-run mode, so the message must not claim a deletion.
        print("Would delete (dry run): ", ds.name, ds.uid)

Deleting:  Betwa_Hamirpur_79-81_25-91_median cllbdw1gn0ad407zkgeiehuja
Deleting:  Chambal_More_77-86_26-61_median cllbdvyba03lp07v7134o7wf9
Deleting:  Sone_Rohtas_84-21_24-91_median cllbdvv3y0enp07wn5kwubc4v
Deleting:  Tawa_Hoshangabad_77-80_22-74_median cllbdvr6e0enl07wn3cqxbejm
Deleting:  Narmada_Sehore_77-32_22-56_median cllbdvo0i09yw07wb97r6e9en
Deleting:  Ken_Banda_80-35_25-68_median cllbdviw20a0l07ux76frgapn
Deleting:  Kathajodi_Cuttack_85-85_20-44_median cllbdveqs0a0j07uxgpda4qwo
