In [2]:
from google.cloud import storage
import pandas as pd
from yolov5 import val
import sys
import yaml
import os
import shutil
import reverse_geocoder as rg

# Generate Object Detection Dataset

**Author:** Madhava Paliyam (madhavapaliyam@gmail.com)

**Description:** Analyzes the model output using the YOLOv5 validation script. Breaks down model performance by crop type and region. 



**Inputs**: 

**Outputs**: 

This is done through the following steps: 

1. Get list of images that were used by using yaml config file
2. Run YOLOV5 validation script on train/validation splits store to txt file
3. From txt extract images and fix paths
4. Access original images and get locations of images 
5. Split results by region, and then run the validation script on each region. 

In [1]:
#### SET PARAMETERS HERE #####

# save output images here
SAVE_FOLDER = 'runs/val'

# exp name 
EXP_NAME = 'all'

# yaml file describing data for trained model
folder = os.path.abspath('dataset/')
yaml_file = os.path.join(folder, 'data_info.yaml')

# path to model weights 
model_weights_path = '/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'  

# confidence threshold 
conf_thresh = .0001

# IOU threshold
iou_thresh = .1


### Runs validation script with parameters

https://github.com/ultralytics/yolov5/blob/63ddb6f0d06f6309aa42bababd08c859197a27af/val.py#L319


In [4]:
# runs the validate script from yolov5 libary
to_parse = f"val.py --data {yaml_file} " \
            + f"--weights {model_weights_path} " \
            + f"--batch-size {8} " \
            + f"--imgsz {800} " \
            + f"--conf-thres {conf_thresh} " \
            + f"--iou-thres {iou_thresh} " \
            + f"--verbose " \
            + f"--exist-ok " \
            + f"--task val"\

to_parse = to_parse.split()
sys.argv = to_parse 
val.main()

YOLOv5 🚀 2022-2-16 torch 1.10.2+cu102 CUDA:0 (Tesla V100-PCIE-16GB, 16160.5MB)



[34m[1mval: [0mdata=/gpfs/data1/cmongp1/mpaliyam/street2sat/data/DATASET_2022-02-28_01:13:06_474346/data_info.yaml, weights=['/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'], batch_size=25, imgsz=1000, conf_thres=0.0001, iou_thres=0.1, task=val, device=, single_cls=False, augment=False, verbose=True, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=runs/val, name=all, exist_ok=True, half=False


Fusing layers... 
Model Summary: 369 layers, 20919810 parameters, 0 gradients, 48.2 GFLOPs




[34m[1mval: [0mScanning '/gpfs/data1/cmongp1/mpaliyam/street2sat/data/DATASET_2022-02-28_01:13:06_474346/val/labels.cache' images and labels... 72 found, 0 missing, 35 empty, 0 corrupted: 100%|██████████| 72/72 [00:00<?, ?it/s]
               Class     Images     Labels          P          R     mAP@.5 mAP@.5:.95: 100%|██████████| 3/3 [00:09<00:00,  3.24s/it]


                 all         72        120      0.444      0.777      0.632        0.3
              banana         72         10      0.631        0.8      0.791      0.389
               maize         72         99      0.426      0.737      0.563      0.225
           sugarcane         72          4      0.413          1      0.995      0.534
             soybean         72          7      0.305      0.571      0.177     0.0521
Speed: 0.3ms pre-process, 12.0ms inference, 2.0ms NMS per image at shape (25, 3, 1024, 1024)
Results saved to [1mruns/val/all[0m


####

In [None]:
train_set = pd.read_csv('../data/train.csv')
val_set = pd.read_csv('../data/val.csv')

##### Goes through each country and copies images from validation set that match the country into a new folder

The new folders will be in the same directory as the dataset and will have the format [Country code] _ ANALYSIS _ [Dataset name]. There will also be a new yaml file associated with each new dataset. 

In [6]:
# currently supported countries 
country_list = ['UG', 'KE', 'US']
new_country_yaml_files = []

def ignore_files(dir, files):
    return [f for f in files if os.path.isfile(os.path.join(dir, f))]

# goes through each country and makes a yolov5 compatible dataset by copying the correct files
for country in country_list:
    # selects only images from validation in the country 
    df_country = df[df['cc'] == country]
    print(df_country.size)
    # new dataset name for each country 
    new_dataset_name = country + '_ANALYSIS_' + dataset_name

    config_val = config['val']
    new_config_val = config_val.replace(dataset_name, new_dataset_name)

    # copy directory structure 
    shutil.copytree(os.path.join(folder, dataset_name), os.path.join(folder,new_dataset_name), ignore=ignore_files)
    
    # for each validation image in the dataframe 
    for i,f in df_country.iterrows():
        old_path_img = os.path.join(config_val, f['path'].replace("/", "*$"))
        new_path_img = os.path.join(new_config_val, f['path'].replace("/", "*$"))
        shutil.copy(old_path_img, new_path_img)

        old_path_lab = os.path.join(config_val.replace('images', 'labels'), f['path'].replace("/", "*$").replace('.JPG', '.txt'))
        new_path_lab = os.path.join(new_config_val.replace('images', 'labels'), f['path'].replace("/", "*$").replace('.JPG', '.txt'))
        shutil.copy(old_path_lab, new_path_lab)

    # create new yaml file for the new country dataset 
    new_yaml_file = yaml_file.replace('data_info', f'data_info_{country}').replace(dataset_name, new_dataset_name)
    new_config = config.copy()
    # train is not used at all since there will be no training images copied, only validation 
    new_config['train'] = os.path.join(folder, new_dataset_name, 'train', 'images')
    new_config['val'] = os.path.join(folder, new_dataset_name, 'val', 'images')
    with open(new_yaml_file, 'w') as outfile:
        yaml.dump(new_config, outfile, default_flow_style=False)

    # keep track of new yaml files 
    new_country_yaml_files.append(new_yaml_file)


280
126
98


In [7]:
new_country_yaml_files

['/gpfs/data1/cmongp1/mpaliyam/street2sat/data/UG_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_UG.yaml',
 '/gpfs/data1/cmongp1/mpaliyam/street2sat/data/KE_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_KE.yaml',
 '/gpfs/data1/cmongp1/mpaliyam/street2sat/data/US_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_US.yaml']

##### For each new dataset created above, it runs the validation script. This way the per country detection scores can be seen. 

In [8]:
# runs validation script on each of the country
for country_yaml_file in new_country_yaml_files:
    print(f"\n\n\n\nRUNING VALIDATION SCRIPT: {country_yaml_file}\n\n\n")
    # gets the country code
    exp_name = country_yaml_file.split('_')[0][-2:]
    # runs the validate script from yolov5 libary
    to_parse = f"val.py --data {country_yaml_file} " \
                + f"--weights {model_weights_path} " \
                + f"--batch-size {batch_size} " \
                + f"--imgsz {img_size} " \
                + f"--conf-thres {conf_thresh} " \
                + f"--iou-thres {iou_thresh} " \
                + f"--project {save_folder} "\
                + f"--name {exp_name} "\
                + f"--verbose " \
                + f"--exist-ok " \
                + f"--task val"\

    to_parse = to_parse.split()
    sys.argv = to_parse 
    val.main()

    print("------------------------------------------------------------------------")


YOLOv5 🚀 2022-2-16 torch 1.10.2+cu102 CUDA:0 (Tesla V100-PCIE-16GB, 16160.5MB)







RUNING VALIDATION SCRIPT: /gpfs/data1/cmongp1/mpaliyam/street2sat/data/UG_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_UG.yaml



[34m[1mval: [0mdata=/gpfs/data1/cmongp1/mpaliyam/street2sat/data/UG_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_UG.yaml, weights=['/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'], batch_size=25, imgsz=1000, conf_thres=0.0001, iou_thres=0.1, task=val, device=, single_cls=False, augment=False, verbose=True, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=runs/val, name=UG, exist_ok=True, half=False


Fusing layers... 
Model Summary: 369 layers, 20919810 parameters, 0 gradients, 48.2 GFLOPs




[34m[1mval: [0mScanning '/gpfs/data1/cmongp1/mpaliyam/street2sat/data/UG_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/val/labels' images and labels...40 found, 0 missing, 22 empty, 0 corrupted: 100%|██████████| 40/40 [00:00<00:00, 767.87it/s]
[34m[1mval: [0mNew cache created: /gpfs/data1/cmongp1/mpaliyam/street2sat/data/UG_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/val/labels.cache
               Class     Images     Labels          P          R     mAP@.5 mAP@.5:.95: 100%|██████████| 2/2 [00:08<00:00,  4.02s/it]


                 all         40         60      0.702       0.78      0.744      0.332
              banana         40         10      0.738        0.8      0.792       0.39
               maize         40         50      0.665       0.76      0.696      0.274
Speed: 0.3ms pre-process, 12.0ms inference, 1.7ms NMS per image at shape (25, 3, 1024, 1024)
Results saved to [1mruns/val/UG[0m
------------------------------------------------------------------------




RUNING VALIDATION SCRIPT: /gpfs/data1/cmongp1/mpaliyam/street2sat/data/KE_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_KE.yaml



[34m[1mval: [0mdata=/gpfs/data1/cmongp1/mpaliyam/street2sat/data/KE_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_KE.yaml, weights=['/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'], batch_size=25, imgsz=1000, conf_thres=0.0001, iou_thres=0.1, task=val, device=, single_cls=False, augment=False, verbose=True, save_txt=False, save_hybrid=False, save

YOLOv5 🚀 2022-2-16 torch 1.10.2+cu102 CUDA:0 (Tesla V100-PCIE-16GB, 16160.5MB)

Fusing layers... 
Model Summary: 369 layers, 20919810 parameters, 0 gradients, 48.2 GFLOPs




[34m[1mval: [0mScanning '/gpfs/data1/cmongp1/mpaliyam/street2sat/data/KE_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/val/labels' images and labels...18 found, 0 missing, 5 empty, 0 corrupted: 100%|██████████| 18/18 [00:00<00:00, 1537.44it/s]
[34m[1mval: [0mNew cache created: /gpfs/data1/cmongp1/mpaliyam/street2sat/data/KE_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/val/labels.cache
               Class     Images     Labels          P          R     mAP@.5 mAP@.5:.95: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it]


                 all         18         35       0.41      0.656      0.411      0.164
               maize         18         35       0.41      0.656      0.411      0.164
Speed: 0.3ms pre-process, 12.2ms inference, 1.7ms NMS per image at shape (25, 3, 1024, 1024)
Results saved to [1mruns/val/KE[0m


YOLOv5 🚀 2022-2-16 torch 1.10.2+cu102 CUDA:0 (Tesla V100-PCIE-16GB, 16160.5MB)



------------------------------------------------------------------------




RUNING VALIDATION SCRIPT: /gpfs/data1/cmongp1/mpaliyam/street2sat/data/US_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_US.yaml



[34m[1mval: [0mdata=/gpfs/data1/cmongp1/mpaliyam/street2sat/data/US_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/data_info_US.yaml, weights=['/gpfs/data1/cmongp1/mpaliyam/street2sat/yolov5/runs/train/exp18/weights/best.pt'], batch_size=25, imgsz=1000, conf_thres=0.0001, iou_thres=0.1, task=val, device=, single_cls=False, augment=False, verbose=True, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=runs/val, name=US, exist_ok=True, half=False


Fusing layers... 
Model Summary: 369 layers, 20919810 parameters, 0 gradients, 48.2 GFLOPs




[34m[1mval: [0mScanning '/gpfs/data1/cmongp1/mpaliyam/street2sat/data/US_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/val/labels' images and labels...14 found, 0 missing, 8 empty, 0 corrupted: 100%|██████████| 14/14 [00:00<00:00, 67.64it/s]
[34m[1mval: [0mNew cache created: /gpfs/data1/cmongp1/mpaliyam/street2sat/data/US_ANALYSIS_DATASET_2022-02-28_01:13:06_474346/val/labels.cache
               Class     Images     Labels          P          R     mAP@.5 mAP@.5:.95: 100%|██████████| 1/1 [00:03<00:00,  3.92s/it]


                 all         14         25      0.434      0.786      0.499      0.255
               maize         14         14      0.303      0.786      0.322      0.179
           sugarcane         14          4      0.694          1      0.995      0.534
             soybean         14          7      0.305      0.571      0.178     0.0526
Speed: 0.3ms pre-process, 13.1ms inference, 4.2ms NMS per image at shape (25, 3, 1024, 1024)
Results saved to [1mruns/val/US[0m
------------------------------------------------------------------------
