In [1]:
from google.cloud import firestore
from google.cloud import storage, aiplatform
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from yolov5 import detect
import sys
import yaml
import os
import shutil
import reverse_geocoder as rg
import random
from tqdm import tqdm
import collections
from datetime import datetime

#### Connect to Gcloud

In [2]:
# Initialize connections to cloud storage and database
# !gcloud auth login
# !gcloud config set account <>
# !gcloud config set project bsos-geog-harvest1
# !gcloud auth application-default login
client = storage.Client()
db = firestore.Client()
gcloud_labeling_bucket_str = 'street2sat-gcloud-labeling'
gcloud_uploaded_bucket_str = 'street2sat-uploaded'
coll = db.collection("street2sat")

In [3]:
region_to_folder = {'KE': 'KENYA', 'US': 'USA', 'UG': 'Uganda', '':''}

gcloud_uploaded_bucket = client.bucket(gcloud_uploaded_bucket_str)


classes_dict = {}
with open('../street2sat_utils/crop_info/classes.txt') as classes_file: 
    for i, line in enumerate(classes_file):
        classes_dict[line.strip()] = i
classes_dict


{'tobacco': 0,
 'coffee': 1,
 'banana': 2,
 'tea': 3,
 'beans': 4,
 'maize': 5,
 'sorghum': 6,
 'millet': 7,
 'sweet_potatoes': 8,
 'cassava': 9,
 'rice': 10,
 'sugarcane': 11,
 'soybean': 12}

### Steps: 
1. Select if you want to query for crop, region, or both
    -  If querying for region, pull random images from the specified folder in street2sat uploaded bucket
    -  If querying for crop, pull random images from all of street2sat uploaded 
2. Check if image is already in street2sat data labeling bucket 
3. Save images to local directory 
4. Run detect.py on the images --> change conf_thres, and iou_thres to get a good number of detections
5. Analyze those dections and if they look good then create data labeling task with those images 

*** Note *** if rerunning to get new images, make sure you delete the dataset folder and the runs/detect/exp folder. 

In [4]:
query_for_crop = 'banana' # choose from any of the available crops
query_for_region = 'UG' # choose from country code find in split_labels.ipynb 
num_of_images = 250 # num images to search 


# yaml file describing data for trained model
folder = '/gpfs/data1/cmongp1/mpaliyam/street2sat/data'
dataset_name = 'DATASET_2022-02-28_01:13:06_474346'
yaml_file = os.path.join(folder, dataset_name, 'data_info.yaml')

weights= '/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'   # model.pt path(s)
source= '/gpfs/data1/cmongp1/mpaliyam/street2sat/data' # file/dir/URL/glob, 0 for webcam
data = yaml_file # dataset.yaml path
imgsz= 640  # inference size (height, width)
conf_thres = 0.05  # confidence threshold
iou_thres = 0.05  # NMS IOU threshold
max_det = 1000  # maximum detections per image
project = 'runs/detect'  # save results to project/name
name ='exp'  # save results to project/name



save_dir = os.path.join(source, f'crop_{query_for_crop}_region_{query_for_region}') # will be saved to save dir 

# Load in all available paths for given prefix, this will take 3 minutes or so
all_paths = [blob.name for blob in tqdm(client.list_blobs(gcloud_uploaded_bucket_str, prefix=f"{region_to_folder[query_for_region]}"))]  
random.shuffle(all_paths)



1278791it [04:16, 4982.19it/s] 


In [5]:
def get_images_already_being_labelled():
    """Gets images already labelled"""
    images_already_being_labelled = []
    csv_names = [blob.name for blob in client.list_blobs(gcloud_labeling_bucket_str, prefix="") if blob.name.endswith(".csv")]

    for csv_name in tqdm(csv_names, desc="Get already labelled"):
        uris = pd.read_csv(f"gs://{gcloud_labeling_bucket_str}/{csv_name}", header=None, sep="\n")[0]
        images_already_being_labelled += uris.to_list()

    # Ensure there are no duplicates in images already being labelled
    dupes = [item for item, count in collections.Counter(images_already_being_labelled).items() if count > 1]
    dupes.remove('0') # An index of 0 was erroneously output in previous csv
    assert len(dupes) == 0, "Found duplicates in images being labelled. One of the labeling tasks needs to be removed."
    return set(images_already_being_labelled)

# randomly sample images from all paths available
potential_images = random.sample(all_paths, num_of_images)
# filter by already labeled images 
already_labeled = get_images_already_being_labelled()
for image in potential_images: 
    if image in already_labeled: 
        potential_images.remove(image)
        print(f"{image} already being labeled, removing.")
        
str(len(potential_images)) + ' images left'

Get already labelled: 100%|██████████| 23/23 [00:08<00:00,  2.65it/s]


'250 images left'

In [6]:
# create directory to save images to 
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
# download images 
for image in potential_images: 
    blob = gcloud_uploaded_bucket.blob(image)
    blob.download_to_filename(os.path.join(save_dir, image.replace('/', '*$')))


#### Run detect.py 

Configure conf_thres and iou_thres here to adjust the amount/quality of bounding boxes returned. 

In [7]:
conf_thres = conf_thres
iou_thres = iou_thres

# runs the detect script from yolov5 libary
to_parse = f"detect.py " \
            + f"--weights {weights} " \
            + f"--source {save_dir} " \
            + f"--imgsz {imgsz} " \
            + f"--conf-thres {conf_thres} " \
            + f"--iou-thres {iou_thres} " \
            + f"--classes {classes_dict[query_for_crop]} " \
            + f"--max-det {max_det} " \
            + f"--project {project} " \
            + f"--name {name} " \
            + f"--exist-ok " \
            + f"--save-crop " \
            + f"--save-txt " \
            + f"--save-conf " \

to_parse = to_parse.split()
sys.argv = to_parse 
detect.main()

[34m[1mdetect: [0mweights=['/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'], source=/gpfs/data1/cmongp1/mpaliyam/street2sat/data/crop_banana_region_UG, imgsz=[640, 640], conf_thres=0.05, iou_thres=0.05, max_det=1000, device=, view_img=False, save_txt=True, save_conf=True, save_crop=True, nosave=False, classes=[2], agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=True, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False
INFO:yolov5.utils.torch_utils:YOLOv5 🚀 2022-2-16 torch 1.10.2+cu102 CUDA:0 (Tesla V100-PCIE-16GB, 16160.5MB)

INFO:models.yolo:Fusing layers... 
INFO:yolov5.utils.torch_utils:Model Summary: 369 layers, 20919810 parameters, 0 gradients, 48.2 GFLOPs


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


image 1/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/data/crop_banana_region_UG/Uganda*$1829244*$2020-06-21_Edrick_21_24*$104GOPRO*$G0048016.JPG: 480x640 Done. (0.012s)
image 2/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/data/crop_banana_region_UG/Uganda*$1829244*$2020-06-21_Edrick_21_24*$106GOPRO*$G0049536.JPG: 480x640 Done. (0.011s)
image 3/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/data/crop_banana_region_UG/Uganda*$1829244*$2020-06-21_Edrick_21_24*$114GOPRO*$G0077932.JPG: 480x640 3 bananas, Done. (0.012s)
image 4/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/data/crop_banana_region_UG/Uganda*$1829244*$2020-06-21_Edrick_21_24*$120GOPRO*$G0092633.JPG: 480x640 4 bananas, Done. (0.011s)
image 5/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/data/crop_banana_region_UG/Uganda*$1829244*$2020-06-21_Edrick_21_24*$122GOPRO*$G0094404.JPG: 480x640 2 bananas, Done. (0.011s)
image 6/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/data/crop_banana_region_UG/Uganda*$1829244*$2020-06-21_Edrick_21_24*$123GOP

In [8]:
# Generate CSV, find detected images in the directory and convert back to database format 
# these contain the txt files of labels that detect.py found, if the class was not 
# present it will not be in this directory
images_with_labels = [x.replace('.txt', '.JPG').replace('*$', '/') for x in os.listdir(os.path.join(project, name, 'labels'))]
images_with_labels = set(images_with_labels)

# filter the potential images to label by the ones which were detected 
potential_images = [image for image in potential_images if image in images_with_labels]

# fix the format to be google cloud paths
potential_images = [f"gs://{gcloud_uploaded_bucket_str}/" + image for image in potential_images]

print(len(potential_images))

# create csv with each row having the image path from potential images 
amount_of_csvs_to_generate = 1
for i in tqdm(range(amount_of_csvs_to_generate), desc="CSV Generation"):
    images_to_label = potential_images
    csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_") + f'crop_{query_for_crop}_region_{query_for_region}' + '.csv'
    print(f"Saving to {csv_name}")
    df = pd.DataFrame(images_to_label)
    df.to_csv(f"gs://{gcloud_labeling_bucket_str}/{csv_name}", sep="\n", index=False, header=False)

67


CSV Generation:   0%|          | 0/1 [00:00<?, ?it/s]

Saving to 2022-03-15_11-34-02_crop_banana_region_UG.csv


CSV Generation: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]


In [9]:
# location of csv in google cloud 
f"gs://{gcloud_labeling_bucket_str}/{csv_name}"

'gs://street2sat-gcloud-labeling/2022-03-15_11-34-02_crop_banana_region_UG.csv'

In [10]:
# create dataset 
ds = aiplatform.ImageDataset.create(
        display_name=csv_name.split('.')[0],
        gcs_source=f"gs://{gcloud_labeling_bucket_str}/{csv_name}",
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
        sync=False,
    )

INFO:google.cloud.aiplatform.datasets.dataset:Creating ImageDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create ImageDataset backing LRO: projects/1012768714927/locations/us-central1/datasets/9121113057425096704/operations/4717991819122573312


In [11]:
ds._gsa_resource

AttributeError: 'ImageDataset' object has no attribute '_gsa_resource'

INFO:google.cloud.aiplatform.datasets.dataset:ImageDataset created. Resource name: projects/1012768714927/locations/us-central1/datasets/9121113057425096704
INFO:google.cloud.aiplatform.datasets.dataset:To use this ImageDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.ImageDataset('projects/1012768714927/locations/us-central1/datasets/9121113057425096704')
INFO:google.cloud.aiplatform.datasets.dataset:Importing ImageDataset data: projects/1012768714927/locations/us-central1/datasets/9121113057425096704
INFO:google.cloud.aiplatform.datasets.dataset:Import ImageDataset data backing LRO: projects/1012768714927/locations/us-central1/datasets/9121113057425096704/operations/9121386354784075776
INFO:google.cloud.aiplatform.datasets.dataset:ImageDataset data imported. Resource name: projects/1012768714927/locations/us-central1/datasets/9121113057425096704
