# Setup
This Jupyter notebook sets up the environment for YOLO and downloads the dataset(s) from Labelbox.

In [1]:
# Mount Google Drive at /content/drive so the files stored on the drive can be reached from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/Theia
!pip install -r requirements.txt
import getpass
import io
import json
import os
import random
import subprocess
from datetime import datetime
from pathlib import Path

import labelbox as lb
import matplotlib.pyplot as plt
import requests
import yaml
from PIL import Image
from tqdm import tqdm

In [None]:
%cd drive/MyDrive/Theia
if not os.path.exists("yolov7"):
    !pip install -r requirements.txt
    !git clone https://github.com/WongKinYiu/yolov7
    %cd yolov7
    !pip install -r requirements.txt
    %cd ..

In [4]:
# Labelbox credentials and dataset configuration.
#
# SECURITY: the API key must never be stored in the notebook. It is read from the
# LABELBOX_API_KEY environment variable; when that is unset or empty you are prompted
# for it (the input is not echoed). Any key that was previously committed in this
# file is exposed to everyone with access to the file or its history and should be
# revoked in Labelbox.
LABELBOX_API_KEY = os.environ.get("LABELBOX_API_KEY") or getpass.getpass("Labelbox API key: ")
# Id of the project (dataset) you want to get from Labelbox.
LABELBOX_PROJECT_ID = ""
# Class names; the position of a name in this list is the class index written to the
# YOLO label files. Note that labelbox_to_yolo rewrites 'tow_truck' and 'dump_truck'
# to 'construction' before looking up the index.
CLASSES = ['car', 'pickup_truck', 'bus', 'van', 'tow_truck', 'semi_truck', 'box_truck', 'dump_truck', 'construction']

In [9]:
def create_yolo_dataset_structure(root : Path):
    """
    Builds the directory skeleton used for a yolo dataset underneath root.

    root : base directory of the dataset. images/ and labels/ are created inside it,
           each with train/, val/ and test/ subdirectories. Missing parents are
           created and directories that already exist are left untouched.
    """
    for subset in ("train", "val", "test"):
        for kind in ("images", "labels"):
            Path(f"{root}/{kind}/{subset}").mkdir(parents=True, exist_ok=True)


def split_indices(number : int, split = [0.7, 0.15, 0.15]):
    """
    Creates a rough, reproducible train/val/test assignment for the datarows.

    number : number of data points
    split : the desired train/val/test split as fractions. Only the first two entries
            are used: the train and val counts are truncated to integers and test
            receives whatever remains.

    returns : list of length number containing the strings "train", "val" and "test"
              in shuffled order. The order is the same for every call with the same
              arguments.
    """
    num_train = int(number * split[0])
    num_val = int(number * split[1])
    num_test = number - num_train - num_val
    result = ["train"] * num_train + ["val"] * num_val + ["test"] * num_test

    # Shuffle with a private generator seeded with 0. This yields the same order as
    # seeding and using the module-level generator, but does not reset the global
    # random state for the rest of the notebook.
    random.Random(0).shuffle(result)

    return result


def write_yaml(name : str, location : Path, data : dict):
    """
    Dumps data as yaml into the file "<name>.yaml" inside location

    name : file name without the ".yaml" extension
    location : directory (Path) the file is written to
    data : content to serialise
    """
    yaml_name = name + ".yaml"
    target = location / yaml_name
    with open(target, 'w') as handle:
        yaml.dump(data, handle)
    print(f"Data was written to file {yaml_name}")


def load_labelbox_dataset(key : str, id : str):
    """
    Fetches the exported labels of one labelbox project through the labelbox API

    key : labelbox API key
    id : project ID

    returns : the result of the project's export_labels(download=True) call; the
              notebook passes it on to labelbox_to_yolo as the list of datarows
    """
    project = lb.Client(api_key=key).get_project(id)
    return project.export_labels(download = True)



def _read_image_bytes(source : str, timeout : float = 30):
    """Returns the raw bytes of an image given either as an http(s) URL or as a local file path."""
    if source.startswith('http'):
        # A timeout keeps a stalled download from hanging the conversion forever and
        # raise_for_status turns an HTTP error page into an explicit error instead of
        # handing it to the image decoder.
        response = requests.get(source, timeout=timeout)
        response.raise_for_status()
        return response.content
    with open(source, 'rb') as f:
        return f.read()


def labelbox_to_yolo(classes : list, root_dir : Path, file, split = [0.7, 0.15, 0.15], zip=False):
    """
    Converts labelbox json (images, labels) to yolo format and places them in pre-existing yolo file structure located at root

    classes : list of class names e.g. ['car', 'truck', 'bus']; the position of a name is the class index written to the label file
    root_dir : path to directory containing the yolo images/ and labels/ directories (each with train/ val/ and test/)
    file : list of labelbox datarows. Each row is a dict read through the keys
           'Labeled Data' (image URL or local path), 'External ID' (image file name)
           and 'Label' -> 'objects' (each object with a 'bbox' of top/left/height/width
           and a classification answer holding the class name)
    split : desired train/val/test split, passed on to split_indices
    zip : if True, root_dir is zipped into "<root_dir>.zip" once the conversion is done

    Per image:
      * only JPEG and PNG sources are converted, anything else is skipped
      * the image is saved to images/<subset>/ and, if it has objects, one line per
        object "<class index> <x center> <y center> <width> <height>" (normalised to
        the image size) is written to labels/<subset>/. The label file is written in
        one go, so converting the same image again replaces its labels instead of
        appending duplicates.
      * 'tow_truck' and 'dump_truck' are stored as 'construction'
      * if any object cannot be converted (missing bbox or classification, unknown
        class) the image is moved to <root_dir>/errors and no labels are kept for it
        in the dataset

    raises : requests.RequestException when an image cannot be downloaded
    """
    root_dir = Path(root_dir)
    error_path = root_dir / "errors"
    error_path.mkdir(parents=True, exist_ok=True)
    data = file
    # One destination ("train"/"val"/"test") per datarow, in a fixed pseudo-random order.
    destinations = split_indices(len(data), split)
    # Classes that are merged into another class before the index lookup.
    merged_classes = {"tow_truck": "construction", "dump_truck": "construction"}

    # NOTE: the builtin zip() is shadowed by the `zip` parameter, hence enumerate.
    for i, img in enumerate(tqdm(data, desc='Converting')):
        destination = destinations[i]

        im = Image.open(io.BytesIO(_read_image_bytes(img['Labeled Data'])))
        # Only JPEG and PNG sources are converted; other formats are skipped.
        if (im.format or "").lower() not in ("jpeg", "png"):
            continue
        if im.mode in ("RGBA", "P"):
            im = im.convert("RGB")
        width, height = im.size

        # Use the bare file name for both files so image and label always end up
        # side by side, even if the External ID contains a directory part.
        image_name = Path(img['External ID']).name
        label_name = Path(image_name).with_suffix('.txt').name
        label_path = root_dir / 'labels' / destination / label_name
        image_path = root_dir / 'images' / destination / image_name

        try:
            im.save(image_path, quality=95, subsampling=0)

            lines = []
            labels_ok = True
            for label in img['Label']['objects']:
                try:
                    # Read the box by key instead of relying on the order of the dict.
                    bbox = label['bbox']
                    top, left, h, w = bbox['top'], bbox['left'], bbox['height'], bbox['width']
                    xywh = [(left + w / 2) / width, (top + h / 2) / height, w / width, h / height]  # xywh normalized

                    cls = label['classifications'][0]["answer"]["value"]
                    cls = merged_classes.get(cls, cls)
                    line = classes.index(cls), *xywh  # YOLO format (class_index, xywh)
                    lines.append(' '.join('%g' % value for value in line) + '\n')
                except (KeyError, IndexError, TypeError, ValueError):
                    # Malformed object or unknown class: give up on this image. Stopping
                    # here keeps later objects from creating a label file for an image
                    # that is no longer in the dataset.
                    labels_ok = False
                    break

            if not labels_ok:
                print(f"Moving image to error folder")
                if image_path.is_file():
                    os.replace(image_path, error_path / image_name)
                # A label file can only exist here if it is left over from an earlier run.
                if label_path.is_file():
                    os.replace(label_path, error_path / label_name)
            elif lines:
                with open(label_path, 'w') as f:
                    f.writelines(lines)

        except ValueError:
            # e.g. the image could not be saved under the extension of its External ID
            print(f"Unknown error, skipping image")

    if zip:
        print(f'Zipping as {root_dir}.zip...')
        # Argument list instead of a shell string so paths with spaces or shell
        # metacharacters are passed through unchanged.
        subprocess.run(['zip', '-qr', f'{root_dir}.zip', str(root_dir)], check=False)


def plot_classes(classes, p):
    """
    creates a nice histogram for classes, just to verify that everything looks okay

    classes : list of class names; the class index at the start of every label line
              refers to a position in this list
    p : path to the labels directory containing train/, val/ and test/

    Counts the label lines per class over all .txt files of the three subsets, prints
    the counts and draws them as a bar chart on the current matplotlib figure.
    """
    date = datetime.now().strftime("%m/%d/%Y")
    class_dict = {c: 0 for c in classes}

    # now go through all directories and count each label
    for subset in ("train", "val", "test"):
        folder = Path(p) / subset
        # Label files are written with a .txt suffix; anything else is not a label file.
        for label_file in sorted(folder.glob("*.txt")):
            with open(label_file, "r") as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # blank line, e.g. at the end of a hand-edited file
                        continue
                    class_dict[classes[int(fields[0])]] += 1

    names = list(class_dict.keys())
    values = list(class_dict.values())
    print(class_dict)
    plt.ylabel("Frequency")
    # For very unbalanced classes a log scale can be enabled with plt.yscale("log")
    plt.bar(range(len(names)), values, tick_label=names)
    plt.xticks(rotation=45)
    plt.title(f"Vehicle counts in data {date}")

def _remap_label_line(line : str, new_id : int, old_ids : list):
    """Returns line with its leading class id replaced by new_id if that id is in old_ids, else line unchanged."""
    stripped = line.lstrip()
    fields = stripped.split(maxsplit=1)
    if not fields:
        # blank line: nothing to remap
        return line
    token = fields[0]
    try:
        class_id = int(token)
    except ValueError:
        # not a label line; leave it untouched
        return line
    if class_id not in old_ids:
        return line
    # Replace the whole id token (it may have more than one digit) and keep the rest
    # of the line, including its line ending, exactly as it was.
    return str(new_id) + stripped[len(token):]


def convert_labels(path : Path, new_id : int, old_ids : list):
    """
    remaps any label in old_ids to have the label new_id

    path : labels directory containing train/, val/ and test/
    new_id : class index to write
    old_ids : class indices that are replaced by new_id

    Every file in the three subset directories is rewritten in place. The class index
    is the complete first field of a line, so indices with several digits are handled
    correctly; blank lines are kept as they are.
    """
    for subset in ("train", "val", "test"):
        folder = path / subset
        for file in os.listdir(folder):
            label_file = folder / file
            with open(label_file, "r") as f:
                lines = f.readlines()

            modified_lines = [_remap_label_line(line, new_id, old_ids) for line in lines]
            with open(label_file, 'w') as f:
                f.writelines(modified_lines)


In [None]:
# This cell creates a dataset and configuration to train yolo.
# It has to run from the project folder because all paths below are relative.
assert os.getcwd() == "/content/drive/MyDrive/Theia"

# Create file structure
DATASET = "final_dataset"
if not os.path.exists(DATASET):
    os.mkdir(DATASET)
yolo_dataset = Path(f"./{DATASET}")
create_yolo_dataset_structure(yolo_dataset)

# "clnxifdc90l7m070a4pbfd3lg" was problematic, removed from the dataset
non_highway_projects = [
    "clp8tmwow000u072e44vc6wgy",
    "clp8r7xgk05yw07yc9fxy6k7j",
    "clpbdh8uk07b30728alhnaw8k",
    "clo3f3ipx00ih07wt4ldy2lil",
]
highway_projects = [
    "clo0ma2wv09fl07wt2z19cwk0",
    "clm9p2zp7000r07zicrgv5fvc",
    "clmxqu49m0lc60708g90tc727",
]

# Each group of projects is downloaded and converted to YOLO format with its own split:
#   non-highway projects -> all data goes into train
#   highway projects     -> 70% train, 20% val, 10% test
_project_groups = (
    (non_highway_projects, [1.0, 0.0, 0.0]),
    (highway_projects, [0.7, 0.2, 0.1]),
)
for _group_projects, _group_split in _project_groups:
    for project in _group_projects:
        dataset = load_labelbox_dataset(LABELBOX_API_KEY, project)
        labelbox_to_yolo(CLASSES, yolo_dataset, dataset, split=_group_split)

# Sanity check: show how often each class occurs in the generated labels
plot_classes(CLASSES, Path(f"./{DATASET}/labels"))