### Create Labelbox Dataset

This notebook finds RGB images in the GCP storage bucket. For each geographical location where images exist, it creates a Labelbox dataset and populates that dataset with pointers to the images in the GCP bucket.

In [18]:
import os, sys
sys.path.insert(0, os.path.abspath('..'))

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from google.cloud import storage
from labelbox import Client as LabelboxClient
from labelbox import Dataset


from utils import gcp_utils
from utils.labelbox_utils import create_new_dataset, create_data_row_dict, check_if_dataset_exists


In [20]:
# Load environment variables from .env.
# Alternatively, manually set environment variables.

# from dotenv import load_dotenv
# load_dotenv()

In [21]:
from project_config import GCP_PROJECT_NAME, BUCKET_NAME
# The Labelbox key is read from the environment rather than written in the notebook.
LABELBOX_API_KEY = os.environ.get("LABELBOX_API_KEY")

# One client per service; both are reused by every cell below.
gcp_client = storage.Client(
    project=GCP_PROJECT_NAME,
)
labelbox_client = LabelboxClient(
    api_key=LABELBOX_API_KEY,
)

## Override the bucket name if it differs from the default in `project_config`

In [22]:
# Replaces the BUCKET_NAME imported from project_config for the rest of the notebook.
BUCKET_NAME = "sand_mining_median"
# Folder inside the bucket whose sub-folders are listed as label locations below.
SUBFOLDER = "labels"

In [23]:
# Each sub-folder of SUBFOLDER in the bucket is one label location.
all_label_locations = gcp_utils.list_subfolders(
    bucket_name=BUCKET_NAME,
    folder_name=SUBFOLDER,
    client=gcp_client,
)
print(all_label_locations)
print()

['Orsang_ChhotaUdepur_73-84_22-37_median', 'test_test_77-71_13-02_median', 'Mayurakshi_Birbhum_87-66_23-61_median', 'Penna_Nellore_79-78_14-56_median', 'Kanhan_Nagpur_79-17_21-27_median', 'Kathajodi_Cuttack_85-85_20-44_median', 'Sone_Arwal_84-52_25-17_median', 'Betwa_Jalaun_79-49_25-84_median', 'Balason_Darjeeling_88-32_26-75_median', 'Waiganga_Gondiya_80-11_21-59_median', 'Teesta_Jalpaiguri_88-6_26-84_median', 'Sutlej_Rupnagar_76-41_30-98_median', 'Mahananda_UttarDinajpur_88-25_26-46_median', 'Ganges_Patna_85-23_25-62_median', 'Orsang_ChhotaUdepur_73-74_22-29_median', 'Narmada_Sehore_77-31_22-6_median', 'Waiganga_Gondiya_80-16_21-62_median', 'Betwa_Jalaun_79-79_25-89_median', 'Godavari_EastGodavari_81-84_16-66_median', 'Godavari_EastGodavari_81-78_16-9_median', 'Kanhan_Nagpur_78-91_21-53_median', 'Ganges_Patna_85-1_25-66_median', 'Godavari_EastGodavari_81-05_17-62_median', 'Mahananda_Jalpaiguri_88-4_26-68_median', 'Yamuna_Fatehpur_81-1_25-48_median', 'Mahananda_Jalpaiguri_88-27_26-82_

## Iterate over all label locations and keep only the ones that have not been uploaded to Labelbox yet
This uses the helper `check_if_dataset_exists(...)` to test whether a dataset already exists for a location.

In [24]:

# Drop every location for which check_if_dataset_exists reports an existing dataset.
label_locations_to_upload = list(
    filter(
        lambda label_location: not check_if_dataset_exists(labelbox_client, label_location),
        all_label_locations,
    )
)

print("These locations will be uploaded to LabelBox: \n", label_locations_to_upload)

These locations will be uploaded to LabelBox: 
 ['test_test_77-71_13-02_median', 'Sutlej_Rupnagar_76-41_30-98_median', 'Yamuna_Fatehpur_81-1_25-48_median', 'Yamuna_Fatehpur_80-8_25-66_median', 'Yamuna_Fatehpur_81-44_25-31_median', 'Yamuna_Banda_80-5_25-81_median']


Choose the locations for which to push images to Labelbox

In [25]:
# Optional manual override of the automatically computed list above.
# Every entry must also appear in all_label_locations (asserted in the next cell).
# Skip this cell to upload everything that is not in Labelbox yet.
label_locations_to_upload = ['Sutlej_Rupnagar_76-41_30-98_median', 'Yamuna_Fatehpur_81-1_25-48_median', 'Yamuna_Fatehpur_80-8_25-66_median', 'Yamuna_Fatehpur_81-44_25-31_median', 'Yamuna_Banda_80-5_25-81_median']


## Create Labelbox Datasets for each folder

In [26]:
# Fail early, and name the offending entries, if a requested location
# has no folder in the bucket (e.g. a typo in the manual override list).
missing_locations = set(label_locations_to_upload) - set(all_label_locations)
assert not missing_locations, f"Locations not found in bucket: {sorted(missing_locations)}"

# Bucket handle used by the upload functions below.
bucket = gcp_client.bucket(BUCKET_NAME)

def create_labelbox_dataset_for_location(location, folder_name=None, key_postfix=""):
    """
    Create a new Labelbox dataset for one location and fill it with its RGB images.

    location: string, name of the location folder in the GCP bucket; also used as
        the name of the new Labelbox dataset
    folder_name: string or None, optional parent folder in the bucket; images are
        looked up under "<folder_name>/<location>/rgb", or "<location>/rgb" when None
    key_postfix: string, appended to every global key as "_<key_postfix>" when non-empty

    Uses the notebook-level `bucket`, `BUCKET_NAME` and `labelbox_client`.
    Returns None; progress and any upload errors are printed.
    If no image is found, nothing is created in Labelbox.
    """
    if folder_name is None:
        folder_name = f"{location}/rgb"
    else:
        folder_name = f"{folder_name}/{location}/rgb"

    print("--- Folder:", folder_name)

    # The trailing slash keeps sibling folders that merely share the prefix
    # (e.g. "rgb_old") out of the listing.
    rgb_image_blobs = bucket.list_blobs(prefix=f"{folder_name}/")

    data_rows = []
    for rgb_image_blob in rgb_image_blobs:
        # Skip "directory" placeholder objects; they would produce an empty global key.
        if rgb_image_blob.name.endswith('/'):
            continue

        public_image_url = gcp_utils.get_public_url(rgb_image_blob.name, bucket_name=BUCKET_NAME)

        # Global key = file name without its extension (everything before the first dot).
        global_key = rgb_image_blob.name.split('/')[-1]
        global_key = global_key.split('.')[0]

        if key_postfix != "":
            global_key = global_key + "_" + key_postfix

        print(global_key)

        data_row = create_data_row_dict(
            img_url=public_image_url,
            global_key=global_key,
            location=location
        )
        data_rows.append(data_row)

    # Do not leave an empty dataset behind: it would make the location look
    # "already uploaded" the next time the notebook is run.
    if not data_rows:
        print(f"No images found under {folder_name}; no dataset created.")
        return

    print ("Generating dataset in LabelBox")

    dataset: Dataset = create_new_dataset(labelbox_client, dataset_name=location)
    print(f"Creating {len(data_rows)} data rows in dataset {dataset.name}")
    task = dataset.create_data_rows(data_rows)
    # The upload is asynchronous; wait so that errors can be reported here.
    task.wait_till_done()
    if task.errors:
        print(f"Finished with error: {task.errors}")
    else:
        print("Finished without error.")

# Create one Labelbox dataset per selected location. Images are read from
# "<SUBFOLDER>/<location>/rgb" and every global key gets the "_median" suffix.
for location in label_locations_to_upload:
    print(f"--- Location: {location}")
    create_labelbox_dataset_for_location(location, folder_name=SUBFOLDER, key_postfix="median")

--- Location: Sutlej_Rupnagar_76-41_30-98_median
--- Folder: labels/Sutlej_Rupnagar_76-41_30-98_median/rgb
Sutlej_Rupnagar_76-41_30-98_2022-11-01_rgb_median
Generating dataset in LabelBox
Creating 1 data rows in dataset Sutlej_Rupnagar_76-41_30-98_median
Finished without error.
--- Location: Yamuna_Fatehpur_81-1_25-48_median
--- Folder: labels/Yamuna_Fatehpur_81-1_25-48_median/rgb
Yamuna_Fatehpur_81-1_25-48_2022-05-01_rgb_median
Generating dataset in LabelBox
Creating 1 data rows in dataset Yamuna_Fatehpur_81-1_25-48_median
Finished without error.
--- Location: Yamuna_Fatehpur_80-8_25-66_median
--- Folder: labels/Yamuna_Fatehpur_80-8_25-66_median/rgb
Yamuna_Fatehpur_80-8_25-66_2021-04-01_rgb_median
Yamuna_Fatehpur_80-8_25-66_2023-02-01_rgb_median
Generating dataset in LabelBox
Creating 2 data rows in dataset Yamuna_Fatehpur_80-8_25-66_median
Finished without error.
--- Location: Yamuna_Fatehpur_81-44_25-31_median
--- Folder: labels/Yamuna_Fatehpur_81-44_25-31_median/rgb
Yamuna_Fatehpur

## Deleting unused datasets
Use this section to selectively delete datasets that are no longer needed.

**!USE WITH CAUTION!**

In [9]:
# Safety switch: leave False for a dry run that only lists the matching datasets.
# Set to True and re-run this cell to actually delete them.
CONFIRM_DELETE = False

# Select every dataset whose name contains 'median' or 'test'.
# NOTE: the datasets created above are named after locations ending in '_median',
# so this filter matches them too - narrow it before enabling deletion.
datasets = [
    ds for ds in labelbox_client.get_datasets()
    if 'median' in ds.name or 'test' in ds.name
]

for ds in datasets:
    if CONFIRM_DELETE:
        print("Deleting: ", ds.name, ds.uid)
        ds.delete()
    else:
        print("Would delete (dry run): ", ds.name, ds.uid)

Deleting:  Damodar_PurbaBardhaman_86-93_22-88_median clmjimlij0fwo07ya4xpo6tsa


## Adding datarow to an existing dataset in Labelbox

In [44]:
# Bucket handle for BUCKET_NAME; the function below reads it as a notebook-level global.
bucket = gcp_client.bucket(BUCKET_NAME)

def create_labelbox_datarow_for_location(location, dataset_name, key, folder_name=None):
    """
    Append a single image from the GCP bucket to an existing Labelbox dataset.

    location: string, name of the location folder in the GCP bucket
    dataset_name: string, name of the existing Labelbox dataset to append to
    key: string, file name (without extension) of the image to insert
    folder_name: string or None, optional parent folder in the bucket; images are
        looked up under "<folder_name>/<location>/rgb", or "<location>/rgb" when None

    Uses the notebook-level `bucket`, `BUCKET_NAME` and `labelbox_client`.
    Raises Exception if no dataset named `dataset_name` exists.
    Returns None. Only the first image whose file name matches `key` is uploaded,
    with global key "<key>_median"; a message is printed if no image matches.
    """
    if not check_if_dataset_exists(labelbox_client, dataset_name):
        raise Exception("Dataset does not exist")

    dataset = labelbox_client.get_datasets(where=Dataset.name == dataset_name).get_one()
    print ("Found dataset: ", dataset.name, dataset.uid)

    if folder_name is None:
        folder_name = f"{location}/rgb"
    else:
        folder_name = f"{folder_name}/{location}/rgb"

    print("--- Folder:", folder_name)

    # The trailing slash keeps sibling folders that merely share the prefix
    # (e.g. "rgb_old") out of the listing.
    rgb_image_blobs = bucket.list_blobs(prefix=f"{folder_name}/")

    for rgb_image_blob in rgb_image_blobs:
        # File name without its extension (everything before the first dot).
        global_key = rgb_image_blob.name.split('/')[-1]
        global_key = global_key.split('.')[0]

        if key != global_key:
            continue

        print("Found!: ", global_key)

        # The URL is only needed for the matching image.
        public_image_url = gcp_utils.get_public_url(rgb_image_blob.name, bucket_name=BUCKET_NAME)
        data_row = create_data_row_dict(
            img_url=public_image_url,
            global_key=global_key + '_median'
        )

        print ("Generating datarow in dataset")
        task = dataset.create_data_rows([data_row])
        # The upload is asynchronous; wait so that failures are reported here,
        # in the same way as create_labelbox_dataset_for_location does.
        task.wait_till_done()
        if task.errors:
            print(f"Finished with error: {task.errors}")
        else:
            print("Finished without error.")

        return

    # Reaching this point means the loop never matched `key`.
    print(f"No image with key '{key}' found under {folder_name}; nothing uploaded.")


--- Location: Betwa_Jalaun_79-79_25-89_median
Found dataset:  Betwa_Jalaun_79-79_25-89_median clmgrlbej0l55073h640f6fsc
--- Folder: labels/Betwa_Jalaun_79-79_25-89_median/rgb
Found!:  Betwa_Jalaun_79-79_25-89_2021-12-01_rgb
[{'row_data': {'tile_layer_url': 'https://storage.googleapis.com/sand_mining_median/labels/Betwa_Jalaun_79-79_25-89_median/rgb/Betwa_Jalaun_79-79_25-89_2021-12-01_rgb.tif', 'epsg': 'EPSG4326', 'name': 'RGB', 'min_zoom': 4, 'max_zoom': 20, 'alternative_layers': [{'tile_layer_url': 'https://api.mapbox.com/styles/v1/mapbox/satellite-v9/tiles/{z}/{x}/{y}?access_token=pk.eyJ1IjoiYW5kby1zaGFoIiwiYSI6ImNsanFmNDJiYzA1ZHYzaW5nazY4YWU3cDEifQ.mABfTStcXV1cMQyAna8_gQ', 'name': 'Hi-res Guidance'}]}, 'global_key': 'Betwa_Jalaun_79-79_25-89_2021-12-01_rgb_median', 'media_type': 'TMS_GEO', 'metadata_fields': [{'name': 'imageDateS2', 'value': datetime.datetime(2021, 12, 1, 0, 0)}]}]
Generating dataset in LabelBox


In [51]:
# Each dict maps an existing dataset/location name to the image key to append to it.
# A separate name is used so that label_locations_to_upload (a list of strings that
# the dataset-creation cells depend on) is not overwritten with a list of dicts.
datarows_to_append = [{"Sone_Patna_84-0_24-62_median":"Sone_Patna_84-0_24-62_2020-04-01_rgb"}]
for location in datarows_to_append:
    print(f"--- Location: {location}")
    # Handle every (location, key) pair in the dict, not only the first one.
    for location_name, datarow in location.items():
        print (location_name, datarow)
        create_labelbox_datarow_for_location(location=location_name, dataset_name=location_name, key=datarow, folder_name=SUBFOLDER)

--- Location: {'Sone_Patna_84-0_24-62_median': 'Sone_Patna_84-0_24-62_2020-04-01_rgb'}
Sone_Patna_84-0_24-62_median Sone_Patna_84-0_24-62_2020-04-01_rgb
Found dataset:  Sone_Patna_84-0_24-62_median clmjimu9d03mo08wygx5saddn
--- Folder: labels/Sone_Patna_84-0_24-62_median/rgb
Found!:  Sone_Patna_84-0_24-62_2020-04-01_rgb
[{'row_data': {'tile_layer_url': 'https://storage.googleapis.com/sand_mining_median/labels/Sone_Patna_84-0_24-62_median/rgb/Sone_Patna_84-0_24-62_2020-04-01_rgb.tif', 'epsg': 'EPSG4326', 'name': 'RGB', 'min_zoom': 4, 'max_zoom': 20, 'alternative_layers': [{'tile_layer_url': 'https://api.mapbox.com/styles/v1/mapbox/satellite-v9/tiles/{z}/{x}/{y}?access_token=pk.eyJ1IjoiYW5kby1zaGFoIiwiYSI6ImNsanFmNDJiYzA1ZHYzaW5nazY4YWU3cDEifQ.mABfTStcXV1cMQyAna8_gQ', 'name': 'Hi-res Guidance'}]}, 'global_key': 'Sone_Patna_84-0_24-62_2020-04-01_rgb_median', 'media_type': 'TMS_GEO', 'metadata_fields': [{'name': 'imageDateS2', 'value': datetime.datetime(2020, 4, 1, 0, 0)}]}]
Generating dat