# Heridal Image Processing for Search And Rescue dataset

Follow this notebook to prepare the HERIDAL dataset.

**Description:**
- The HERIDAL database contains approximately 500 labelled, full-size 4,000 x 3,000 pixel real-world images. At the moment, 101 images have been selected for testing purposes. Furthermore, the dataset contains over 68,750 image patches of wilderness acquired from an aerial perspective: 29,050 positive samples containing a person and 39,700 negative samples.

**Annotations:**
- in xml format:
```xml
<object>
  <name>human</name>
  <pose>unspecified</pose>
  <truncated>0</truncated>
  <difficult>0</difficult>
  <bndbox>
     <xmin>3471</xmin>
     <xmax>3540</xmax>
     <ymin>1195</ymin>
     <ymax>1275</ymax>
  </bndbox>
</object>
```
- bounding box in annotation in xyxy format

**Table of contents:**

0. Init - imports and data download
1. Data annotation cleaning
2. Data transformation
3. Data visualization


## 0. Init - imports and data download
Download the dataset .zip files and extract them to the `data/source/Heridal` directory. After extraction, the data directory should look like this:
```
data
└───source
    └───Heridal
        ├───patches
        │   ├───negative
        │   └───positive
        ├───testImages
        │   └───labels
        └───trainImages
            └───labels
```

Currently the `Heridal/patches` subdirectory is not used.

In [None]:
# Uncomment below two lines to reload imported packages (in case of modifying them)
# %load_ext autoreload
# %autoreload 2

# Imports
import os
import random
import numpy as np
import pandas as pd
import shutil
import xmltodict
import json
import cv2
import pybboxes as pbx
from pathlib import Path

from prj_utils.consts import ROOT_DIR
from data_processing.image_processing import plot_xywhn_annotated_image_from_file, get_brightness_stats, copy_annotated_images, get_number_of_objects_stats

# Consts
# Raw dataset location (as extracted from the downloaded archives).
_HERIDAL_SOURCE_DIR = f'{ROOT_DIR}/data/source/Heridal'
TRAIN_DIR = f'{_HERIDAL_SOURCE_DIR}/trainImages'
TEST_DIR = f'{_HERIDAL_SOURCE_DIR}/testImages'

# Output location for the converted dataset, one sub-directory per split.
_HERIDAL_PROCESSED_DIR = f'{ROOT_DIR}/data/processed/Heridal'
TRAIN_PROCESSED_DIR = f'{_HERIDAL_PROCESSED_DIR}/train'
VAL_PROCESSED_DIR = f'{_HERIDAL_PROCESSED_DIR}/validate'
TEST_PROCESSED_DIR = f'{_HERIDAL_PROCESSED_DIR}/test'

## 1. Data transformation
- Transform labels from xml format to yolo .txt files
- Split train data into train and validate dataset

After this step, the processed data directory should look like this:
```
data
└───processed
    └───Heridal
        ├───test
        │   ├───images
        │   └───labels
        ├───train
        │   ├───images
        │   └───labels
        └───validate
            ├───images
            └───labels
```


## 1.1 Transform labels from xml format to yolo .txt files

Yolo format:
- One *.txt file per image (if no objects in image, no *.txt file is required).
- One row per object.
- Each row has the format `class x_center y_center scaled_width scaled_height`, with values separated by spaces.
- Box coordinates must be normalized to the range 0 to 1. If your boxes are in pixels, divide x_center and width by the image width, and y_center and height by the image height.
- Bounding box in annotation in xywhn format.
- Class numbers are zero-indexed (start from 0).
- Files are saved into the `images` and `labels` subdirectories of `data/processed/Heridal/test` and `data/processed/Heridal/train`.


In [None]:
def process_directory(input_directory, output_directory):
    """Convert one Heridal split from XML labels to the YOLO directory layout.

    Every regular file directly inside ``input_directory`` is treated as an
    image whose annotations are expected at
    ``{input_directory}/labels/{stem}.xml``. For each image that is converted,
    the image is copied to ``{output_directory}/images`` and a ``{stem}.txt``
    file is written to ``{output_directory}/labels`` with one
    ``0 x_center y_center width height`` row per ``human`` object.

    A file is skipped (with a printed warning) when:
      * it has no matching XML label file,
      * its annotation contains no ``<object>`` entries,
      * it cannot be decoded as an image.

    Objects whose name is not ``human`` are reported and left out of the
    label file.

    Args:
        input_directory: Directory holding the images and a ``labels``
            sub-directory with the XML annotations.
        output_directory: Directory that receives the ``images`` and
            ``labels`` sub-directories; created if missing.
    """
    Path(f'{output_directory}/images').mkdir(parents=True, exist_ok=True)
    Path(f'{output_directory}/labels').mkdir(parents=True, exist_ok=True)

    files = [f for f in os.listdir(input_directory) if os.path.isfile(os.path.join(input_directory, f))]
    for image_file in files:
        image_filename = Path(image_file).stem
        image_filepath = os.path.join(input_directory, image_file)
        xml_filepath = f'{input_directory}/labels/{image_filename}.xml'

        # Check the cheap preconditions first so that full-size images are
        # only decoded when they are actually going to be converted.
        if not os.path.exists(xml_filepath):
            print(f'Warning: file {image_file} does not have corresponding labels file - skipping file')
            continue

        with open(xml_filepath) as xml:
            xml_dict = xmltodict.parse(xml.read())

        # An empty <annotation/> parses to None and an annotation without any
        # <object> child has no 'object' key; both mean "nothing to convert".
        annotation = xml_dict.get('annotation')
        objects = annotation.get('object') if isinstance(annotation, dict) else None
        if objects is None:
            print(f'Warning: file {image_file} does not contain any objects - skipping file')
            continue

        # xmltodict yields a dict for a single <object> and a list for several.
        labels = objects if isinstance(objects, list) else [objects]

        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(image_filepath)
        if image is None:
            print(f'Warning: file {image_file} could not be read as an image - skipping file')
            continue
        height, width = image.shape[:2]
        image_size = (width, height)

        yolo_labels = []
        for label in labels:
            if label['name'] == 'human':
                bndbox = label['bndbox']
                bbox = (int(bndbox['xmin']), int(bndbox['ymin']), int(bndbox['xmax']), int(bndbox['ymax']))
                yolo_bbox = pbx.convert_bbox(bbox, image_size=image_size, from_type="voc", to_type="yolo")
                # Class index 0 is the only class ("human") in this dataset.
                yolo_labels.append((0,) + yolo_bbox)
            else:
                print("Warning: unknown object name")

        output_image_filepath = f'{output_directory}/images/{image_file}'
        output_labels_filepath = f'{output_directory}/labels/{image_filename}.txt'

        shutil.copyfile(image_filepath, output_image_filepath)

        with open(output_labels_filepath, 'w') as f:
            for label in yolo_labels:
                line = ' '.join(str(value) for value in label)
                f.write(f'{line}\n')

        # Uncomment to visually verify the converted annotations:
        #plot_xywhn_annotated_image_from_file(output_image_filepath, output_labels_filepath)

# Convert the train split first, then the test split.
for source_dir, processed_dir in (
    (TRAIN_DIR, TRAIN_PROCESSED_DIR),
    (TEST_DIR, TEST_PROCESSED_DIR),
):
    process_directory(source_dir, processed_dir)

## 1.2 Split train data into train and validate dataset

Move a random sample of images (with their labels) from `data/processed/Heridal/train` to `data/processed/Heridal/validate`.

In [None]:
# Fix the RNG seeds so the same images end up in the validation set each run.
random.seed(1)
np.random.seed(1)

images_dir = f'{TRAIN_PROCESSED_DIR}/images'
# os.listdir returns entries in arbitrary order, so sort before shuffling;
# otherwise the fixed seed does not guarantee a reproducible split.
filenames = sorted(f for f in os.listdir(images_dir) if os.path.isfile(os.path.join(images_dir, f)))
split = int(0.8 * len(filenames))  # 80% train / 20% validate

np.random.shuffle(filenames)
train_filenames = filenames[:split]
val_filenames = filenames[split:]

Path(f'{VAL_PROCESSED_DIR}/images').mkdir(parents=True, exist_ok=True)
Path(f'{VAL_PROCESSED_DIR}/labels').mkdir(parents=True, exist_ok=True)

# Move each validation image together with its label file out of the train
# directory; the files listed in train_filenames stay where they are.
for file in val_filenames:
    filename = Path(file).stem

    image_filepath = f'{TRAIN_PROCESSED_DIR}/images/{file}'
    label_filepath = f'{TRAIN_PROCESSED_DIR}/labels/{filename}.txt'

    output_image_filepath = f'{VAL_PROCESSED_DIR}/images/{file}'
    output_label_filepath = f'{VAL_PROCESSED_DIR}/labels/{filename}.txt'

    shutil.move(image_filepath, output_image_filepath)
    # YOLO allows an image without a label file (no objects), so a missing
    # label must not abort the split after the image has already been moved.
    if os.path.exists(label_filepath):
        shutil.move(label_filepath, output_label_filepath)
