# Data Preprocessing

## Move some images

### Guns

In [3]:
import os
import random
import shutil

# define input and output directories
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/train'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/output'

# define number of images to select and move
num_images = 1300

# create output directory (and any missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)

# get list of all image files in input directory
image_files = [filename for filename in os.listdir(input_dir) if filename.endswith('.jpg')]

# fail early, with a readable message, when there are not enough images to sample
# (random.sample would raise the same exception type with a generic message)
if num_images > len(image_files):
    raise ValueError(
        f'requested {num_images} images but only {len(image_files)} .jpg files found in {input_dir}'
    )

# randomly select `num_images` image files
selected_images = random.sample(image_files, num_images)

# verify every selected image has its annotation BEFORE moving anything, so a
# missing .xml cannot leave images stranded in the output directory without labels
missing_annotations = [
    filename for filename in selected_images
    if not os.path.isfile(os.path.join(input_dir, os.path.splitext(filename)[0] + '.xml'))
]
if missing_annotations:
    raise FileNotFoundError(
        f'{len(missing_annotations)} selected images have no .xml annotation in {input_dir}, '
        f'e.g. {missing_annotations[0]}'
    )

# iterate over selected images, move each image together with its annotation
for filename in selected_images:
    # move image file to output directory
    src_path = os.path.join(input_dir, filename)
    dst_path = os.path.join(output_dir, filename)
    shutil.move(src_path, dst_path)

    # the annotation shares the image's base name; splitext swaps only the
    # extension (str.replace would also rewrite '.jpg' inside the base name)
    annotation_filename = os.path.splitext(filename)[0] + '.xml'
    annotation_src_path = os.path.join(input_dir, annotation_filename)
    annotation_dst_path = os.path.join(output_dir, annotation_filename)

    # move annotation file to output directory
    shutil.move(annotation_src_path, annotation_dst_path)


### Knives

In [2]:
import os
import random
import shutil

# define input and output directories
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/train'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/output'

# define number of images to select and move
num_images = 1540

# create output directory (and any missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)

# get list of all image files in input directory
image_files = [filename for filename in os.listdir(input_dir) if filename.endswith('.jpg')]

# fail early, with a readable message, when there are not enough images to sample
# (random.sample would raise the same exception type with a generic message)
if num_images > len(image_files):
    raise ValueError(
        f'requested {num_images} images but only {len(image_files)} .jpg files found in {input_dir}'
    )

# randomly select `num_images` image files
selected_images = random.sample(image_files, num_images)

# verify every selected image has its annotation BEFORE moving anything, so a
# missing .xml cannot leave images stranded in the output directory without labels
missing_annotations = [
    filename for filename in selected_images
    if not os.path.isfile(os.path.join(input_dir, os.path.splitext(filename)[0] + '.xml'))
]
if missing_annotations:
    raise FileNotFoundError(
        f'{len(missing_annotations)} selected images have no .xml annotation in {input_dir}, '
        f'e.g. {missing_annotations[0]}'
    )

# iterate over selected images, move each image together with its annotation
for filename in selected_images:
    # move image file to output directory
    src_path = os.path.join(input_dir, filename)
    dst_path = os.path.join(output_dir, filename)
    shutil.move(src_path, dst_path)

    # the annotation shares the image's base name; splitext swaps only the
    # extension (str.replace would also rewrite '.jpg' inside the base name)
    annotation_filename = os.path.splitext(filename)[0] + '.xml'
    annotation_src_path = os.path.join(input_dir, annotation_filename)
    annotation_dst_path = os.path.join(output_dir, annotation_filename)

    # move annotation file to output directory
    shutil.move(annotation_src_path, annotation_dst_path)


## Split data into folders

### Guns

In [4]:
import os
import shutil
from tqdm import tqdm

# source folder (images + annotations), destination folder for the batches,
# and how many images go into each batch folder
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/output'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/output2'
batch_size = 4

# create the output directory (and any missing parents); exist_ok avoids the
# check-then-create race and makes this a no-op when the folder is already there
os.makedirs(output_dir, exist_ok=True)

# define a generator that yields batches of files
def batch_generator(file_list, batch_size):
    """Yield consecutive slices of `file_list` holding at most `batch_size` items each.

    The final slice is shorter when the list length is not a multiple of
    `batch_size`; an empty list yields nothing.
    """
    batch_starts = range(0, len(file_list), batch_size)
    yield from (file_list[start:start + batch_size] for start in batch_starts)

# get a sorted list of all image files in the input directory; os.listdir
# returns entries in arbitrary order, so sorting makes the batch membership
# reproducible from one run to the next
image_files = sorted(filename for filename in os.listdir(input_dir) if filename.endswith('.jpg'))

# number of batches the generator will produce (ceiling division), passed to
# tqdm so it can render a real progress bar instead of a bare iteration counter
num_batches = -(-len(image_files) // batch_size)

# use the batch generator to iterate over batches of files
for i, file_batch in enumerate(tqdm(batch_generator(image_files, batch_size), total=num_batches)):
    # create a new folder for the batch; os.mkdir deliberately fails if the
    # folder is left over from an earlier run, so stale files are never mixed in
    batch_dir = os.path.join(output_dir, f'batch_{i}')
    os.mkdir(batch_dir)

    # iterate over the image files in the batch
    for filename in file_batch:
        # copy the image file to the batch directory
        src_path = os.path.join(input_dir, filename)
        dst_path = os.path.join(batch_dir, filename)
        shutil.copy(src_path, dst_path)

        # the annotation shares the image's base name; splitext swaps only the
        # extension (str.replace would also rewrite '.jpg' inside the base name)
        annotation_filename = os.path.splitext(filename)[0] + '.xml'
        annotation_src_path = os.path.join(input_dir, annotation_filename)
        annotation_dst_path = os.path.join(batch_dir, annotation_filename)

        # copy the annotation file to the batch directory
        shutil.copy(annotation_src_path, annotation_dst_path)

325it [00:56,  5.74it/s]


### Knives

In [5]:
import os
import shutil
from tqdm import tqdm

# source folder (images + annotations), destination folder for the batches,
# and how many images go into each batch folder
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/output'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/output2'
batch_size = 4

# create the output directory (and any missing parents); exist_ok avoids the
# check-then-create race and makes this a no-op when the folder is already there
os.makedirs(output_dir, exist_ok=True)

# define a generator that yields batches of files
def batch_generator(file_list, batch_size):
    """Split `file_list` into successive chunks of up to `batch_size` entries.

    Chunks are produced lazily in list order; the last one may be shorter, and
    an empty list produces no chunks.
    """
    chunk_starts = range(0, len(file_list), batch_size)
    yield from (file_list[begin:begin + batch_size] for begin in chunk_starts)

# get a sorted list of all image files in the input directory; os.listdir
# returns entries in arbitrary order, so sorting makes the batch membership
# reproducible from one run to the next
image_files = sorted(filename for filename in os.listdir(input_dir) if filename.endswith('.jpg'))

# number of batches the generator will produce (ceiling division), passed to
# tqdm so it can render a real progress bar instead of a bare iteration counter
num_batches = -(-len(image_files) // batch_size)

# use the batch generator to iterate over batches of files
for i, file_batch in enumerate(tqdm(batch_generator(image_files, batch_size), total=num_batches)):
    # create a new folder for the batch; os.mkdir deliberately fails if the
    # folder is left over from an earlier run, so stale files are never mixed in
    batch_dir = os.path.join(output_dir, f'batch_{i}')
    os.mkdir(batch_dir)

    # iterate over the image files in the batch
    for filename in file_batch:
        # copy the image file to the batch directory
        src_path = os.path.join(input_dir, filename)
        dst_path = os.path.join(batch_dir, filename)
        shutil.copy(src_path, dst_path)

        # the annotation shares the image's base name; splitext swaps only the
        # extension (str.replace would also rewrite '.jpg' inside the base name)
        annotation_filename = os.path.splitext(filename)[0] + '.xml'
        annotation_src_path = os.path.join(input_dir, annotation_filename)
        annotation_dst_path = os.path.join(batch_dir, annotation_filename)

        # copy the annotation file to the batch directory
        shutil.copy(annotation_src_path, annotation_dst_path)

385it [01:10,  5.50it/s]


## Merge images

### Guns

In [15]:
import cv2
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np

input_dir = "/media/rodri/Files/Datasets/Final_Dataset/guns/output2"
output_dir = "/media/rodri/Files/Datasets/Final_Dataset/guns/output3"
size = 640  # side length, in pixels, of the square merged image

def merge_images(folder_path, output_path):
    """Tile the images of one batch folder into a 2x2 mosaic and merge their annotations.

    Every ``.jpg`` in ``folder_path`` (at most four, taken in sorted name order)
    is resized to ``size/2`` x ``size/2`` and pasted into its own quadrant of a
    white ``size`` x ``size`` canvas. For each image the same-named ``.xml`` file,
    when present, is parsed; its ``object/bndbox`` coordinates are rescaled by
    the image's real resize factor, shifted to the image's quadrant, and all
    objects are collected into a single annotation tree.

    Writes ``<folder name>.jpg`` and ``<folder name>.xml`` into ``output_path``.

    Args:
        folder_path: directory holding the images and annotations of one batch.
        output_path: existing directory that receives the merged pair.

    Raises:
        ValueError: if the folder holds more than four images or an image
            cannot be decoded.
    """
    half = size // 2
    # (x_offset, y_offset) of each quadrant: left-to-right, then top-to-bottom
    quadrants = [(0, 0), (half, 0), (0, half), (half, half)]

    # white canvas; quadrants that receive no image stay white
    merged_image = np.full((size, size, 3), 255, dtype=np.uint8)

    # sorted so the image -> quadrant assignment does not depend on listdir order
    image_names = sorted(name for name in os.listdir(folder_path) if name.endswith(".jpg"))
    if len(image_names) > len(quadrants):
        raise ValueError(
            f"expected at most {len(quadrants)} images in {folder_path}, found {len(image_names)}"
        )

    merged_name = os.path.basename(os.path.normpath(folder_path))
    merged_annotation = None

    for (x_offset, y_offset), file_name in zip(quadrants, image_names):
        # load the image and paste its resized copy into this quadrant
        image_path = os.path.join(folder_path, file_name)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError(f"could not read image {image_path}")
        src_height, src_width = image.shape[:2]
        merged_image[y_offset:y_offset + half, x_offset:x_offset + half] = cv2.resize(image, (half, half))

        # the annotation is paired with the image by base name, so its boxes
        # always land in the same quadrant as the pixels they describe
        annotation_path = os.path.join(folder_path, os.path.splitext(file_name)[0] + ".xml")
        if not os.path.isfile(annotation_path):
            continue  # image without annotation contributes pixels only
        annotation = ET.parse(annotation_path).getroot()

        for obj in annotation.findall('object'):
            bndbox = obj.find('bndbox')
            # tolerate boxes stored with min/max swapped
            x_low, x_high = sorted((int(bndbox.find('xmin').text), int(bndbox.find('xmax').text)))
            y_low, y_high = sorted((int(bndbox.find('ymin').text), int(bndbox.find('ymax').text)))
            # scale by the real resize factor (half / source dimension), then
            # shift into the quadrant; for a size x size source this is value / 2
            bndbox.find('xmin').text = str(int(x_low * half / src_width) + x_offset)
            bndbox.find('ymin').text = str(int(y_low * half / src_height) + y_offset)
            bndbox.find('xmax').text = str(int(x_high * half / src_width) + x_offset)
            bndbox.find('ymax').text = str(int(y_high * half / src_height) + y_offset)

        if merged_annotation is None:
            # the first annotation supplies the root and its metadata
            merged_annotation = annotation
        else:
            # later annotations only contribute their objects
            merged_annotation.extend(annotation.findall('object'))

    if merged_annotation is None:
        # no annotation in the folder: still emit a (bare) xml next to the image
        merged_annotation = ET.Element('annotation')

    # NOTE(review): assumes Pascal-VOC-style <filename> and <size> elements; when
    # present they are updated to describe the mosaic rather than the first image
    filename_el = merged_annotation.find('filename')
    if filename_el is not None:
        filename_el.text = merged_name + ".jpg"
    size_el = merged_annotation.find('size')
    if size_el is not None:
        for tag in ('width', 'height'):
            dim_el = size_el.find(tag)
            if dim_el is not None:
                dim_el.text = str(size)

    # save the merged image and annotation
    output_image_path = os.path.join(output_path, merged_name + ".jpg")
    output_annotation_path = os.path.join(output_path, merged_name + ".xml")
    cv2.imwrite(output_image_path, merged_image)
    ET.ElementTree(merged_annotation).write(output_annotation_path)

# make sure the destination folder for the merged files is present
if os.path.exists(output_dir) is False:
    os.makedirs(output_dir)

# walk every entry of the input directory (tqdm shows progress) and merge
# each sub-folder; anything that is not a directory is skipped
for entry_name in tqdm(os.listdir(input_dir)):
    folder_path = os.path.join(input_dir, entry_name)
    if not os.path.isdir(folder_path):
        continue
    merge_images(folder_path, output_dir)


100%|██████████| 325/325 [00:37<00:00,  8.75it/s]


### Knives

In [19]:
import cv2
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np

input_dir = "/media/rodri/Files/Datasets/Final_Dataset/knives/output2"
output_dir = "/media/rodri/Files/Datasets/Final_Dataset/knives/output3"
size = 640  # side length, in pixels, of the square merged image

def merge_images(folder_path, output_path):
    """Tile the images of one batch folder into a 2x2 mosaic and merge their annotations.

    Every ``.jpg`` in ``folder_path`` (at most four, taken in sorted name order)
    is resized to ``size/2`` x ``size/2`` and pasted into its own quadrant of a
    white ``size`` x ``size`` canvas. For each image the same-named ``.xml`` file,
    when present, is parsed; its ``object/bndbox`` coordinates are rescaled by
    the image's real resize factor, shifted to the image's quadrant, and all
    objects are collected into a single annotation tree.

    Writes ``<folder name>.jpg`` and ``<folder name>.xml`` into ``output_path``.

    Args:
        folder_path: directory holding the images and annotations of one batch.
        output_path: existing directory that receives the merged pair.

    Raises:
        ValueError: if the folder holds more than four images or an image
            cannot be decoded.
    """
    half = size // 2
    # (x_offset, y_offset) of each quadrant: left-to-right, then top-to-bottom
    quadrants = [(0, 0), (half, 0), (0, half), (half, half)]

    # white canvas; quadrants that receive no image stay white
    merged_image = np.full((size, size, 3), 255, dtype=np.uint8)

    # sorted so the image -> quadrant assignment does not depend on listdir order
    image_names = sorted(name for name in os.listdir(folder_path) if name.endswith(".jpg"))
    if len(image_names) > len(quadrants):
        raise ValueError(
            f"expected at most {len(quadrants)} images in {folder_path}, found {len(image_names)}"
        )

    merged_name = os.path.basename(os.path.normpath(folder_path))
    merged_annotation = None

    for (x_offset, y_offset), file_name in zip(quadrants, image_names):
        # load the image and paste its resized copy into this quadrant
        image_path = os.path.join(folder_path, file_name)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError(f"could not read image {image_path}")
        src_height, src_width = image.shape[:2]
        merged_image[y_offset:y_offset + half, x_offset:x_offset + half] = cv2.resize(image, (half, half))

        # the annotation is paired with the image by base name, so its boxes
        # always land in the same quadrant as the pixels they describe
        annotation_path = os.path.join(folder_path, os.path.splitext(file_name)[0] + ".xml")
        if not os.path.isfile(annotation_path):
            continue  # image without annotation contributes pixels only
        annotation = ET.parse(annotation_path).getroot()

        for obj in annotation.findall('object'):
            bndbox = obj.find('bndbox')
            # tolerate boxes stored with min/max swapped
            x_low, x_high = sorted((int(bndbox.find('xmin').text), int(bndbox.find('xmax').text)))
            y_low, y_high = sorted((int(bndbox.find('ymin').text), int(bndbox.find('ymax').text)))
            # scale by the real resize factor (half / source dimension), then
            # shift into the quadrant; for a size x size source this is value / 2
            bndbox.find('xmin').text = str(int(x_low * half / src_width) + x_offset)
            bndbox.find('ymin').text = str(int(y_low * half / src_height) + y_offset)
            bndbox.find('xmax').text = str(int(x_high * half / src_width) + x_offset)
            bndbox.find('ymax').text = str(int(y_high * half / src_height) + y_offset)

        if merged_annotation is None:
            # the first annotation supplies the root and its metadata
            merged_annotation = annotation
        else:
            # later annotations only contribute their objects
            merged_annotation.extend(annotation.findall('object'))

    if merged_annotation is None:
        # no annotation in the folder: still emit a (bare) xml next to the image
        merged_annotation = ET.Element('annotation')

    # NOTE(review): assumes Pascal-VOC-style <filename> and <size> elements; when
    # present they are updated to describe the mosaic rather than the first image
    filename_el = merged_annotation.find('filename')
    if filename_el is not None:
        filename_el.text = merged_name + ".jpg"
    size_el = merged_annotation.find('size')
    if size_el is not None:
        for tag in ('width', 'height'):
            dim_el = size_el.find(tag)
            if dim_el is not None:
                dim_el.text = str(size)

    # save the merged image and annotation
    output_image_path = os.path.join(output_path, merged_name + ".jpg")
    output_annotation_path = os.path.join(output_path, merged_name + ".xml")
    cv2.imwrite(output_image_path, merged_image)
    ET.ElementTree(merged_annotation).write(output_annotation_path)

# make sure the destination folder for the merged files is present
if os.path.exists(output_dir) is False:
    os.makedirs(output_dir)

# walk every entry of the input directory (tqdm shows progress) and merge
# each sub-folder; anything that is not a directory is skipped
for entry_name in tqdm(os.listdir(input_dir)):
    folder_path = os.path.join(input_dir, entry_name)
    if not os.path.isdir(folder_path):
        continue
    merge_images(folder_path, output_dir)


 91%|█████████ | 349/385 [00:29<00:03, 11.75it/s]


KeyboardInterrupt: 

## Merge results and clean