In [2]:
from google.cloud import firestore
from google.cloud import storage, aiplatform
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from yolov5 import detect
import sys
import yaml
import os
import shutil
import reverse_geocoder as rg
import random
from tqdm import tqdm
import collections
from datetime import datetime

# might be useful for changing all JPG to jpg
# for x in `du -a | cut -d '/' -f2- | grep JPG`; do mv "$x" "${x%.JPG}.jpg"; done

# Initialize connections to cloud storage and database
# !gcloud auth login
# !gcloud config set account <>
# !gcloud config set project bsos-geog-harvest1
# !gcloud auth application-default login

# Create Dataset Based On Crop And Region

**Author:** Madhava Paliyam (madhavapaliyam@gmail.com)

**Description:** Creates a dataset for a chosen crop and region that we want to improve on.



**Inputs**: Parameters for yolov5 detection, region to query, crop to query

**Outputs**: A CSV of image paths uploaded to the gs://street2sat-gcloud-labeling bucket, and an AI Platform image dataset created from that CSV.

In [22]:
##### SET PARAMETERS HERE #####

# Crop to search for. It is used as a key into classes_dict (built from classes.txt)
# when the detect.py arguments are assembled, so it must match a class name there.
query_for_crop = 'banana' # choose from any of the available crops
# Country code compared against the 'cc' column of the image database.
query_for_region = 'KE' # choose from country code : 'KE', 'UG', 'US'
# Number of images randomly sampled from the region and run through detection.
num_of_images = 250 # num images to search


# yolov5 Parameters:
# yaml file describing data for trained model
# NOTE(review): `folder` and `weights` are absolute paths; adjust them when running elsewhere.
folder = '/gpfs/data1/cmongp1/mpaliyam/street2sat/data'
dataset_name = 'DATASET_2022-02-28_01:13:06_474346'
yaml_file = os.path.join(folder, dataset_name, 'data_info.yaml')
weights= '/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'   # model.pt path(s)

# NOTE(review): `data` is not passed to detect.py in the cells visible in this notebook.
data = yaml_file # dataset.yaml path
imgsz= 640  # inference size (height, width)
conf_thres = 0.05  # confidence threshold
iou_thres = 0.05  # NMS IOU threshold
max_det = 1000  # maximum detections per image
# detect.py output goes to <project>/<name>; the label files are later read from <project>/<name>/labels.
project = 'runs/detect'  # save results to project/name
name ='exp'  # save results to project/name
# os.path.join with a single argument returns it unchanged, so this is a relative
# directory name of the form crop_<crop>_region_<region>.
save_dir = os.path.join(f'crop_{query_for_crop}_region_{query_for_region}') # downloaded images will be saved to save_dir

# Shell escape: fetch DVC-tracked files before the rest of the notebook runs.
# NOTE(review): presumably -q silences output and -f forces overwrites — confirm with `dvc pull --help`.
!dvc pull -q -f 

[0m

In [23]:
# Load the image database CSV and keep only the rows that are neither
# already being labeled nor part of the test set.
all_paths = (
    pd.read_csv('gs://street2sat-database-csv/database-info.csv')
    .loc[lambda db: (db['being_labeled'] == False) & (db['test_set'] == False)]
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [24]:
# Randomly sample candidate images within the requested country.
region_images = all_paths[all_paths['cc'] == query_for_region]
if region_images.empty:
    raise ValueError(f"No candidate images found for region '{query_for_region}'")

# DataFrame.sample (without replacement) raises ValueError when asked for more rows
# than exist, so cap the request at the number of rows available for this region.
sample_size = min(num_of_images, len(region_images))
if sample_size < num_of_images:
    print(f"Only {sample_size} images available for region '{query_for_region}' (requested {num_of_images})")

potential_images = region_images.sample(sample_size)
potential_images.head()

Unnamed: 0.1,Unnamed: 0,input_img,latitude,longitude,being_labeled,country,admin1,admin2,cc,location,test_set,time,focal_length,pixel_height
53235,53235,gs://street2sat-uploaded/KENYA/2021-08-03-T1/G...,-1.084484,35.913396,False,KENYA,Narok,,KE,Narok,False,2021-08-03 15:37:08+00:00,3,2028
106114,106114,gs://street2sat-uploaded/KENYA/2021_07_18_T2/1...,0.676725,34.300967,False,KENYA,Busia,,KE,Malaba,False,2021-07-18 14:34:22+00:00,3,2028
114165,114165,gs://street2sat-uploaded/KENYA/2021_07_19_T2/1...,0.090078,34.639646,False,KENYA,Siaya,,KE,Yala,False,2021-07-19 15:33:00+00:00,3,2028
38475,38475,gs://street2sat-uploaded/KENYA/2021-07-27-T1/G...,-0.593231,34.553229,False,KENYA,Homa Bay,,KE,Homa Bay,False,2021-07-27 14:01:37+00:00,3,2028
89002,89002,gs://street2sat-uploaded/KENYA/2021_07_14_T2/1...,0.831008,35.002379,False,KENYA,Trans Nzoia,,KE,Kitale,False,2021-07-14 10:57:37+00:00,3,2028


In [25]:
# Google Cloud Storage client and the names of the two buckets used below
client = storage.Client()
gcloud_labeling_bucket_str = 'street2sat-gcloud-labeling'
gcloud_uploaded_bucket_str = 'street2sat-uploaded'

# country code -> folder name (presumably the top-level folder in the uploaded bucket — confirm)
region_to_folder = {
    'KE': 'KENYA',
    'US': 'USA',
    'UG': 'Uganda',
    '': '',
}

# handle on the bucket the images are downloaded from
gcloud_uploaded_bucket = client.bucket(gcloud_uploaded_bucket_str)

# class name -> zero-based line number of that name in classes.txt
classes_dict = {}
with open('../street2sat_utils/crop_info/classes.txt') as classes_file:
    classes_dict.update(
        (class_name.strip(), class_idx)
        for class_idx, class_name in enumerate(classes_file)
    )

In [26]:
# Create the directory the images are saved to; exist_ok makes a separate
# existence check unnecessary.
os.makedirs(save_dir, exist_ok=True)

# Blob names are object paths inside the bucket, so the "gs://<bucket>/" prefix is
# stripped from each URI. The prefix is derived from gcloud_uploaded_bucket_str so it
# cannot drift from the bucket that gcloud_uploaded_bucket was created for.
uploaded_prefix = f"gs://{gcloud_uploaded_bucket_str}/"

# Download every sampled image. Each file is named after the row's DataFrame index,
# which is how label files are matched back to rows of potential_images later on.
for i, item in tqdm(potential_images.iterrows(), total=len(potential_images), desc="Downloading images"):
    blob = gcloud_uploaded_bucket.blob(item['input_img'].replace(uploaded_prefix, ''))
    blob.download_to_filename(os.path.join(save_dir, f"{i}.jpg"))
    

#### Run detect.py 

Configure conf_thres and iou_thres to adjust the amount/quality of bounding boxes returned. 

In [27]:
# Per-run overrides: edit the right-hand sides to tune detection from this cell.
# Left unchanged, they keep the values from the parameter cell.
conf_thres = conf_thres
iou_thres = iou_thres

# Fail early with a readable message when the crop is not a known class.
if query_for_crop not in classes_dict:
    raise KeyError(f"Unknown crop '{query_for_crop}'; choose one of {sorted(classes_dict)}")

# --weights takes one or more paths. A string is split on whitespace (as before);
# a list/tuple is passed through as-is, so individual paths may then contain spaces.
weight_paths = weights.split() if isinstance(weights, str) else [str(w) for w in weights]

# Runs the detect script from the yolov5 library.
# The arguments are built as a list instead of splitting one long string, so values
# such as the source directory or project path survive even if they contain spaces.
# NOTE: --exist-ok reuses <project>/<name>, so label files from earlier runs stay there.
to_parse = [
    "detect.py",
    "--weights", *weight_paths,
    "--source", str(save_dir),
    "--imgsz", str(imgsz),
    "--conf-thres", str(conf_thres),
    "--iou-thres", str(iou_thres),
    "--classes", str(classes_dict[query_for_crop]),  # only detect the queried crop
    "--max-det", str(max_det),
    "--project", str(project),
    "--name", str(name),
    "--exist-ok",
    "--save-crop",
    "--save-txt",
    "--save-conf",
]

# detect.main() takes its options from sys.argv, so sys.argv is swapped in for the
# call and restored afterwards; otherwise the kernel keeps detect.py's arguments.
original_argv = sys.argv
sys.argv = to_parse
try:
    detect.main()
finally:
    sys.argv = original_argv

[34m[1mdetect: [0mweights=['/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'], source=crop_banana_region_KE, imgsz=[640, 640], conf_thres=0.05, iou_thres=0.05, max_det=1000, device=, view_img=False, save_txt=True, save_conf=True, save_crop=True, nosave=False, classes=[2], agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=True, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False
INFO:yolov5.utils.torch_utils:YOLOv5 🚀 2022-2-16 torch 1.10.2+cu102 CUDA:0 (Tesla V100-PCIE-16GB, 16160.5MB)

INFO:models.yolo:Fusing layers... 
INFO:yolov5.utils.torch_utils:Model Summary: 369 layers, 20919810 parameters, 0 gradients, 48.2 GFLOPs
image 1/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/100073.jpg: 480x640 Done. (0.011s)
image 2/250 /gpfs/data1/cmongp1/mpaliyam/street2sat/street2sat/notebooks/crop_banana_region_KE/100652.jpg: 480x640 Done. (0.01

In [29]:
# Find images that have detections: detect.py writes "<index>.txt" into the labels folder.
# Entries that are not "<number>.txt" are skipped, and so are label files whose index is
# not in the current sample (the folder is reused across runs because of --exist-ok);
# either would otherwise make int() or .loc raise.
label_dir = os.path.join(project, name, 'labels')
images_with_labels = []
for label_file in os.listdir(label_dir):
    stem, ext = os.path.splitext(label_file)
    if ext == '.txt' and stem.isdigit() and int(stem) in potential_images.index:
        images_with_labels.append(int(stem))
potential_images.loc[images_with_labels]

Unnamed: 0.1,Unnamed: 0,input_img,latitude,longitude,being_labeled,country,admin1,admin2,cc,location,test_set,time,focal_length,pixel_height
61385,61385,gs://street2sat-uploaded/KENYA/2021_07_07_T2/G...,0.625876,35.312018,False,KENYA,Uasin Gishu,,KE,Eldoret,False,2021-07-07 09:03:03+00:00,3,2028
21251,21251,gs://street2sat-uploaded/KENYA/2021-07-17-T1/G...,-0.679134,34.723568,False,KENYA,Kisii,,KE,Kisii,False,2021-07-17 13:36:12+00:00,3,2028
26107,26107,gs://street2sat-uploaded/KENYA/2021-07-21-T1/G...,-0.756776,34.875893,False,KENYA,Nyamira District,,KE,Keroka,False,2021-07-21 14:01:31+00:00,3,2028
25418,25418,gs://street2sat-uploaded/KENYA/2021-07-21-T1/G...,-0.759381,34.87841,False,KENYA,Nyamira District,,KE,Keroka,False,2021-07-21 13:15:04+00:00,3,2028
79276,79276,gs://street2sat-uploaded/KENYA/2021_07_13_T2/1...,1.19097,34.997915,False,KENYA,West Pokot,,KE,Kapenguria,False,2021-07-13 10:45:13+00:00,3,2028
24019,24019,gs://street2sat-uploaded/KENYA/2021-07-18-T1/G...,-0.748092,34.782035,False,KENYA,Kisii,,KE,Kisii,False,2021-07-18 15:19:51+00:00,3,2028
64191,64191,gs://street2sat-uploaded/KENYA/2021_07_10_T2/1...,0.781988,34.565132,False,KENYA,,,KE,Malikisi,False,2021-07-10 11:46:00+00:00,3,2028
21068,21068,gs://street2sat-uploaded/KENYA/2021-07-17-T1/G...,-0.679199,34.731849,False,KENYA,Kisii,,KE,Kisii,False,2021-07-17 13:31:16+00:00,3,2028


In [8]:
# Generate CSV, find detected images in the directory
# these contain the txt files of labels that detect.py found, if the class was not
# present it will not be in this directory.
# Entries that are not "<number>.txt" are skipped, and so are label files whose index is
# not in the current sample (the folder is reused across runs because of --exist-ok);
# either would otherwise make int() or .loc raise.
label_dir = os.path.join(project, name, 'labels')
images_with_labels = []
for label_file in os.listdir(label_dir):
    stem, ext = os.path.splitext(label_file)
    if ext == '.txt' and stem.isdigit() and int(stem) in potential_images.index:
        images_with_labels.append(int(stem))

# filter the potential images to label by the ones which were detected
images_of_interest = potential_images.loc[images_with_labels]

print(f"Found {len(images_of_interest)} images")

# create csv with each row having the image path from potential images
amount_of_csvs_to_generate = 1
for i in tqdm(range(amount_of_csvs_to_generate), desc="CSV Generation"):
    images_to_label = images_of_interest
    csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_") + f'crop_{query_for_crop}_region_{query_for_region}' + '.csv'
    print(f"Saving to {csv_name}")
    # Write ONLY the image path column, one gs:// URI per line. Writing the whole
    # frame with sep="\n" put every other column value (latitude, country, ...) on
    # its own line as if it were an image path.
    df = images_to_label[['input_img']]
    df.to_csv(f"gs://{gcloud_labeling_bucket_str}/{csv_name}", index=False, header=False)

67


CSV Generation:   0%|          | 0/1 [00:00<?, ?it/s]

Saving to 2022-03-15_11-34-02_crop_banana_region_UG.csv


CSV Generation: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]


In [9]:
# Display the Cloud Storage URI of the CSV written by the CSV generation cell
# (the same URI is passed as gcs_source when the dataset is created).
f"gs://{gcloud_labeling_bucket_str}/{csv_name}"

'gs://street2sat-gcloud-labeling/2022-03-15_11-34-02_crop_banana_region_UG.csv'

In [10]:
# Create an AI Platform image dataset from the uploaded CSV, using the
# bounding-box import schema.
dataset_display_name = csv_name.split('.')[0]  # CSV file name up to its first '.'
dataset_csv_uri = f"gs://{gcloud_labeling_bucket_str}/{csv_name}"

# NOTE(review): sync=False presumably returns before the import finishes — confirm
# before relying on `ds` immediately after this cell.
ds = aiplatform.ImageDataset.create(
    display_name=dataset_display_name,
    gcs_source=dataset_csv_uri,
    import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
    sync=False,
)

INFO:google.cloud.aiplatform.datasets.dataset:Creating ImageDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create ImageDataset backing LRO: projects/1012768714927/locations/us-central1/datasets/9121113057425096704/operations/4717991819122573312
