# Data Preprocessing

## Move some images

### Guns

In [28]:
import os
import random
import shutil

# define input and output directories
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/train'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/output'

# define number of images to select and move
num_images = 4000

# create output directory (and any missing parents); safe to run again
os.makedirs(output_dir, exist_ok=True)

# list the images that have a matching annotation next to them; an image
# without annotation is never selected, so no pair is moved half-way.
# The listing is sorted so that seeding `random` beforehand gives a
# reproducible selection.
image_files = sorted(
    filename for filename in os.listdir(input_dir)
    if filename.endswith('.jpg')
    and os.path.isfile(os.path.join(input_dir, os.path.splitext(filename)[0] + '.xml'))
)

# randomly select up to `num_images` image files; clamping avoids the
# ValueError raised by random.sample when fewer images are available
selected_images = random.sample(image_files, min(num_images, len(image_files)))

# iterate over selected images and move each image together with its annotation
for filename in selected_images:
    # move image file to output directory
    src_path = os.path.join(input_dir, filename)
    dst_path = os.path.join(output_dir, filename)
    shutil.move(src_path, dst_path)

    # derive the annotation name from the stem; str.replace would also
    # rewrite a '.jpg' that appears in the middle of the name
    annotation_filename = os.path.splitext(filename)[0] + '.xml'
    annotation_src_path = os.path.join(input_dir, annotation_filename)
    annotation_dst_path = os.path.join(output_dir, annotation_filename)

    # move annotation file to output directory
    shutil.move(annotation_src_path, annotation_dst_path)


### Knives

In [2]:
import os
import random
import shutil

# define input and output directories
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/train'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/output'

# define number of images to select and move
num_images = 1540

# create output directory (and any missing parents); safe to run again
os.makedirs(output_dir, exist_ok=True)

# list the images that have a matching annotation next to them; an image
# without annotation is never selected, so no pair is moved half-way.
# The listing is sorted so that seeding `random` beforehand gives a
# reproducible selection.
image_files = sorted(
    filename for filename in os.listdir(input_dir)
    if filename.endswith('.jpg')
    and os.path.isfile(os.path.join(input_dir, os.path.splitext(filename)[0] + '.xml'))
)

# randomly select up to `num_images` image files; clamping avoids the
# ValueError raised by random.sample when fewer images are available
selected_images = random.sample(image_files, min(num_images, len(image_files)))

# iterate over selected images and move each image together with its annotation
for filename in selected_images:
    # move image file to output directory
    src_path = os.path.join(input_dir, filename)
    dst_path = os.path.join(output_dir, filename)
    shutil.move(src_path, dst_path)

    # derive the annotation name from the stem; str.replace would also
    # rewrite a '.jpg' that appears in the middle of the name
    annotation_filename = os.path.splitext(filename)[0] + '.xml'
    annotation_src_path = os.path.join(input_dir, annotation_filename)
    annotation_dst_path = os.path.join(output_dir, annotation_filename)

    # move annotation file to output directory
    shutil.move(annotation_src_path, annotation_dst_path)


## Split data into folders

### Guns

In [29]:
import os
import shutil
from tqdm import tqdm

# folder filled by the "Move some images" step, and the folder that will
# receive one batch_<n> sub-folder per group of images
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/output'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/guns/output2'
# number of images placed in each batch folder
batch_size = 4

# create the output directory (and any missing parents); unlike a
# check-then-mkdir this cannot fail when the folder appears in between
os.makedirs(output_dir, exist_ok=True)

# define a generator that yields batches of files
def batch_generator(file_list, batch_size):
    """Yield consecutive slices of `file_list` holding at most `batch_size` items.

    The final slice is shorter when the length of `file_list` is not a
    multiple of `batch_size`; an empty `file_list` yields nothing.
    """
    starts = range(0, len(file_list), batch_size)
    yield from (file_list[start:start + batch_size] for start in starts)

# get a sorted list of all image files in the input directory; os.listdir
# returns entries in arbitrary order, sorting makes the batches reproducible
image_files = sorted(filename for filename in os.listdir(input_dir) if filename.endswith('.jpg'))

# number of batches, so that tqdm can show a total and an ETA
num_batches = (len(image_files) + batch_size - 1) // batch_size

# use the batch generator to iterate over batches of files
for i, file_batch in enumerate(tqdm(batch_generator(image_files, batch_size), total=num_batches)):
    # create a new folder for the batch; exist_ok lets the cell be run again
    batch_dir = os.path.join(output_dir, f'batch_{i}')
    os.makedirs(batch_dir, exist_ok=True)

    # iterate over the image files in the batch
    for filename in file_batch:
        # copy the image file to the batch directory
        src_path = os.path.join(input_dir, filename)
        dst_path = os.path.join(batch_dir, filename)
        shutil.copy(src_path, dst_path)

        # derive the annotation name from the stem; str.replace would also
        # rewrite a '.jpg' that appears in the middle of the name
        annotation_filename = os.path.splitext(filename)[0] + '.xml'
        annotation_src_path = os.path.join(input_dir, annotation_filename)
        annotation_dst_path = os.path.join(batch_dir, annotation_filename)

        # copy the annotation file to the batch directory
        shutil.copy(annotation_src_path, annotation_dst_path)

1000it [03:08,  5.31it/s]


### Knives

In [5]:
import os
import shutil
from tqdm import tqdm

# folder filled by the "Move some images" step, and the folder that will
# receive one batch_<n> sub-folder per group of images
input_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/output'
output_dir = '/media/rodri/Files/Datasets/Final_Dataset/knives/output2'
# number of images placed in each batch folder
batch_size = 4

# create the output directory (and any missing parents); unlike a
# check-then-mkdir this cannot fail when the folder appears in between
os.makedirs(output_dir, exist_ok=True)

# define a generator that yields batches of files
def batch_generator(file_list, batch_size):
    """Yield consecutive slices of `file_list` holding at most `batch_size` items.

    The final slice is shorter when the length of `file_list` is not a
    multiple of `batch_size`; an empty `file_list` yields nothing.
    """
    starts = range(0, len(file_list), batch_size)
    yield from (file_list[start:start + batch_size] for start in starts)

# get a sorted list of all image files in the input directory; os.listdir
# returns entries in arbitrary order, sorting makes the batches reproducible
image_files = sorted(filename for filename in os.listdir(input_dir) if filename.endswith('.jpg'))

# number of batches, so that tqdm can show a total and an ETA
num_batches = (len(image_files) + batch_size - 1) // batch_size

# use the batch generator to iterate over batches of files
for i, file_batch in enumerate(tqdm(batch_generator(image_files, batch_size), total=num_batches)):
    # create a new folder for the batch; exist_ok lets the cell be run again
    batch_dir = os.path.join(output_dir, f'batch_{i}')
    os.makedirs(batch_dir, exist_ok=True)

    # iterate over the image files in the batch
    for filename in file_batch:
        # copy the image file to the batch directory
        src_path = os.path.join(input_dir, filename)
        dst_path = os.path.join(batch_dir, filename)
        shutil.copy(src_path, dst_path)

        # derive the annotation name from the stem; str.replace would also
        # rewrite a '.jpg' that appears in the middle of the name
        annotation_filename = os.path.splitext(filename)[0] + '.xml'
        annotation_src_path = os.path.join(input_dir, annotation_filename)
        annotation_dst_path = os.path.join(batch_dir, annotation_filename)

        # copy the annotation file to the batch directory
        shutil.copy(annotation_src_path, annotation_dst_path)

385it [01:10,  5.50it/s]


## Merge images

### Guns

In [31]:
import cv2
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np

# Folder holding the batch_<n> sub-folders written by the split step
input_dir = "/media/rodri/Files/Datasets/Final_Dataset/guns/output2"
# Folder that receives one merged .jpg and one merged .xml per batch folder
output_dir = "/media/rodri/Files/Datasets/Final_Dataset/guns/output3"
# Side length in pixels of the square merged image; each tile is size/2 wide
size = 640

def merge_image(folder_path, file_name, merged_image, x_offset, y_offset, idx):
    """Resize one image to a quarter tile and paste it into the merged image.

    Args:
        folder_path: directory that contains the image.
        file_name: name of the image file inside `folder_path`.
        merged_image: array of the merged image; it is modified in place.
        x_offset: column at which the left edge of the tile is pasted.
        y_offset: row at which the top edge of the tile is pasted.
        idx: tile index; kept for call compatibility, not used here.

    Returns:
        Tuple `(merged_image, x_offset, y_offset)` where the offsets point to
        the next free tile (left to right, then one row down).

    Raises:
        ValueError: if the image file cannot be read.
    """
    tile = int(size/2)

    # load the image; cv2.imread signals failure by returning None, not by raising
    image_path = os.path.join(folder_path, file_name)
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")
    resized_image = cv2.resize(image, (tile, tile))

    # place the resized image on the merged image
    merged_image[y_offset:y_offset+tile, x_offset:x_offset+tile] = resized_image

    # advance to the next tile; wrap as soon as another tile would not fit in
    # the row (an equality test against `size` never fires when size is odd)
    x_offset += tile
    if x_offset + tile > size:
        x_offset = 0
        y_offset += tile

    return merged_image, x_offset, y_offset

def generate_bounding_boxes_as_xml(folder_path, file_name, idx):
    """Return the `<object>` entries of one annotation, remapped to its tile.

    Each box is scaled from the source image to the `size/2` square tile and
    shifted by the offset of tile `idx` (0 top-left, 1 top-right, 2
    bottom-left, 3 bottom-right). The source dimensions are read from the
    annotation's `<size>` element; when they are missing the coordinates are
    halved, which is exact for `size` x `size` sources.

    Args:
        folder_path: directory that contains the annotation.
        file_name: name of the XML annotation inside `folder_path`.
        idx: index of the tile the matching image was pasted into (0-3).

    Returns:
        A string with one tab-indented `<object>` element per object, or an
        empty string when the annotation has no object.
    """
    # local import so that the cell's import list does not need to change
    from xml.sax.saxutils import escape

    root = ET.parse(os.path.join(folder_path, file_name)).getroot()

    tile = int(size/2)
    offsets = [
        {'x': 0, 'y': 0},
        {'x': tile, 'y': 0},
        {'x': 0, 'y': tile},
        {'x': tile, 'y': tile}
    ]

    def source_dimension(tag):
        """Read <size>/<tag> as a positive int, or None when unavailable."""
        try:
            value = int(float(root.findtext(f'size/{tag}')))
        except (TypeError, ValueError):
            return None
        return value if value > 0 else None

    def to_tile(text, source_dim, offset):
        """Scale one coordinate to the tile and shift it by the tile offset."""
        coord = int(float(text))
        if source_dim is None:
            scaled = coord // 2
        else:
            # the tile is a fixed-size resize, so the factor depends on the source size
            scaled = coord * tile // source_dim
        return scaled + offset

    src_width = source_dimension('width')
    src_height = source_dimension('height')
    x_shift = offsets[idx]['x']
    y_shift = offsets[idx]['y']

    objects = []
    for obj in root.findall('object'):
        bbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text,
            # optional fields fall back to 'Unspecified' when the tag is absent
            'pose': obj.findtext('pose', default='Unspecified'),
            'truncated': obj.findtext('truncated', default='Unspecified'),
            'difficult': obj.findtext('difficult', default='Unspecified'),
            'bbox': {
                'xmin': to_tile(bbox.find('xmin').text, src_width, x_shift),
                'xmax': to_tile(bbox.find('xmax').text, src_width, x_shift),
                'ymin': to_tile(bbox.find('ymin').text, src_height, y_shift),
                'ymax': to_tile(bbox.find('ymax').text, src_height, y_shift),
            }
        })

    str_objects = ""
    for obj in objects:
        obj_str = "\t<object>\n"
        for key, value in obj.items():
            if key == 'bbox':
                obj_str += "\t\t<bndbox>\n"
                for bbox_key, bbox_value in value.items():
                    obj_str += f"\t\t\t<{bbox_key}>{bbox_value}</{bbox_key}>\n"
                obj_str += "\t\t</bndbox>\n"
            else:
                # escape &, < and > so that the merged file stays well-formed
                obj_str += f"\t\t<{key}>{escape(str(value))}</{key}>\n"
        obj_str += "\t</object>\n"
        str_objects += obj_str

    return str_objects
    
def generate_string_xml_annotation_beginning(folder_path, file_name):
    """Return the head of the merged annotation, up to the first object.

    The annotation `file_name` is used as a template: `<filename>` (and
    `<path>` when present) are replaced by `<folder name>.jpg` and the
    `<size>` width/height are set to `size`, the side of the merged image.

    Args:
        folder_path: batch folder; its last component names the merged image.
        file_name: name of the template XML annotation inside `folder_path`.

    Returns:
        The serialized XML before the first `<object>`. When the template has
        no object, everything before the closing root tag, so that objects
        can still be appended after it.
    """
    root = ET.parse(os.path.join(folder_path, file_name)).getroot()

    # basename/normpath also copes with a trailing slash, unlike split("/")[-1]
    merged_name = os.path.basename(os.path.normpath(folder_path)) + ".jpg"

    # Change filename and path by folder name (only the tags that exist)
    for tag in ('filename', 'path'):
        node = root.find(tag)
        if node is not None:
            node.text = merged_name

    # the merged image is always size x size, whatever the template image was
    for tag in ('width', 'height'):
        node = root.find(f'size/{tag}')
        if node is not None:
            node.text = str(size)

    xml_string = ET.tostring(root).decode()

    head, marker, _ = xml_string.partition("<object>")
    if marker:
        return head
    # no object in the template: cut just before the closing root tag
    return xml_string.rsplit(f"</{root.tag}>", 1)[0]

def generate_string_xml_annotation_end(folder_path, file_name):
    """Return the tail of the merged annotation, after the last object.

    Args:
        folder_path: directory that contains the template annotation.
        file_name: name of the template XML annotation inside `folder_path`.

    Returns:
        The serialized XML that follows the last `</object>`. When the
        template has no object, only the closing root tag is returned;
        splitting on a missing marker would return the whole document.
    """
    root = ET.parse(os.path.join(folder_path, file_name)).getroot()

    xml_string = ET.tostring(root).decode()

    _, marker, tail = xml_string.rpartition("</object>")
    if marker:
        return tail
    return f"</{root.tag}>"
    

def merge_images(folder_path, output_path):
    """Tile the images of one batch folder into a 2x2 image and merge their annotations.

    Images are placed in sorted file-name order, left to right then top to
    bottom. Each image is paired with the annotation that shares its stem, so
    the result does not depend on the order returned by os.listdir. The first
    annotation (in sorted order) provides the header and footer of the merged
    XML.

    Args:
        folder_path: batch folder with up to four .jpg files and their .xml files.
        output_path: existing folder that receives `<folder name>.jpg` and
            `<folder name>.xml`.

    Raises:
        ValueError: if the folder has no annotation or more than four images.
        OSError: if the merged image cannot be written.
    """
    # create a white merged image (255 in every channel)
    merged_image = 255 * np.ones(shape=[size, size, 3], dtype=np.uint8)
    x_offset = 0
    y_offset = 0

    # os.listdir order is arbitrary: sort, then split by extension
    entries = sorted(os.listdir(folder_path))
    image_files = [entry for entry in entries if entry.endswith(".jpg")]
    annotation_files = [entry for entry in entries if entry.endswith(".xml")]

    if not annotation_files:
        raise ValueError(f"No .xml annotation found in {folder_path}")
    if len(image_files) > 4:
        raise ValueError(f"Expected at most 4 images in {folder_path}, found {len(image_files)}")

    # basename/normpath also copes with a trailing slash, unlike split("/")[-1]
    batch_name = os.path.basename(os.path.normpath(folder_path))

    template_file = annotation_files[0]
    annotation = generate_string_xml_annotation_beginning(folder_path, template_file)

    # paste every image and append its boxes using the same tile index
    for tile_idx, image_name in enumerate(image_files):
        merged_image, x_offset, y_offset = merge_image(folder_path, image_name, merged_image, x_offset, y_offset, tile_idx)

        annotation_name = os.path.splitext(image_name)[0] + ".xml"
        if annotation_name in annotation_files:
            annotation += generate_bounding_boxes_as_xml(folder_path, annotation_name, tile_idx)

    # save the merged image; cv2.imwrite reports failure through its return value
    output_image_path = os.path.join(output_path, batch_name + ".jpg")
    if not cv2.imwrite(output_image_path, merged_image):
        raise OSError(f"Could not write merged image: {output_image_path}")

    # save the merged annotations
    annotation += generate_string_xml_annotation_end(folder_path, template_file)
    with open(os.path.join(output_path, batch_name + ".xml"), "w", encoding="utf-8") as f:
        f.write(annotation)
    

# create the output directory (and any missing parents); safe to run again
os.makedirs(output_dir, exist_ok=True)

# call merge_images for every batch folder of the input directory, in sorted
# order so that runs are reproducible; plain files are skipped
for entry_name in tqdm(sorted(os.listdir(input_dir))):
    input_path = os.path.join(input_dir, entry_name)
    if os.path.isdir(input_path):
        # merge the images in the folder
        merge_images(input_path, output_dir)


100%|██████████| 1000/1000 [03:01<00:00,  5.52it/s]


### Knives

In [33]:
import cv2
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np

# Folder holding the batch_<n> sub-folders written by the split step
input_dir = "/media/rodri/Files/Datasets/Final_Dataset/knives/output2"
# Folder that receives one merged .jpg and one merged .xml per batch folder
output_dir = "/media/rodri/Files/Datasets/Final_Dataset/knives/output3"
# Side length in pixels of the square merged image; each tile is size/2 wide
size = 640

def merge_image(folder_path, file_name, merged_image, x_offset, y_offset, idx):
    """Resize one image to a quarter tile and paste it into the merged image.

    Args:
        folder_path: directory that contains the image.
        file_name: name of the image file inside `folder_path`.
        merged_image: array of the merged image; it is modified in place.
        x_offset: column at which the left edge of the tile is pasted.
        y_offset: row at which the top edge of the tile is pasted.
        idx: tile index; kept for call compatibility, not used here.

    Returns:
        Tuple `(merged_image, x_offset, y_offset)` where the offsets point to
        the next free tile (left to right, then one row down).

    Raises:
        ValueError: if the image file cannot be read.
    """
    tile = int(size/2)

    # load the image; cv2.imread signals failure by returning None, not by raising
    image_path = os.path.join(folder_path, file_name)
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")
    resized_image = cv2.resize(image, (tile, tile))

    # place the resized image on the merged image
    merged_image[y_offset:y_offset+tile, x_offset:x_offset+tile] = resized_image

    # advance to the next tile; wrap as soon as another tile would not fit in
    # the row (an equality test against `size` never fires when size is odd)
    x_offset += tile
    if x_offset + tile > size:
        x_offset = 0
        y_offset += tile

    return merged_image, x_offset, y_offset

def generate_bounding_boxes_as_xml(folder_path, file_name, idx):
    """Return the `<object>` entries of one annotation, remapped to its tile.

    Each box is scaled from the source image to the `size/2` square tile and
    shifted by the offset of tile `idx` (0 top-left, 1 top-right, 2
    bottom-left, 3 bottom-right). The source dimensions are read from the
    annotation's `<size>` element; when they are missing the coordinates are
    halved, which is exact for `size` x `size` sources.

    Args:
        folder_path: directory that contains the annotation.
        file_name: name of the XML annotation inside `folder_path`.
        idx: index of the tile the matching image was pasted into (0-3).

    Returns:
        A string with one tab-indented `<object>` element per object, or an
        empty string when the annotation has no object.
    """
    # local import so that the cell's import list does not need to change
    from xml.sax.saxutils import escape

    root = ET.parse(os.path.join(folder_path, file_name)).getroot()

    tile = int(size/2)
    offsets = [
        {'x': 0, 'y': 0},
        {'x': tile, 'y': 0},
        {'x': 0, 'y': tile},
        {'x': tile, 'y': tile}
    ]

    def source_dimension(tag):
        """Read <size>/<tag> as a positive int, or None when unavailable."""
        try:
            value = int(float(root.findtext(f'size/{tag}')))
        except (TypeError, ValueError):
            return None
        return value if value > 0 else None

    def to_tile(text, source_dim, offset):
        """Scale one coordinate to the tile and shift it by the tile offset."""
        coord = int(float(text))
        if source_dim is None:
            scaled = coord // 2
        else:
            # the tile is a fixed-size resize, so the factor depends on the source size
            scaled = coord * tile // source_dim
        return scaled + offset

    src_width = source_dimension('width')
    src_height = source_dimension('height')
    x_shift = offsets[idx]['x']
    y_shift = offsets[idx]['y']

    objects = []
    for obj in root.findall('object'):
        bbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text,
            # optional fields fall back to 'Unspecified' when the tag is
            # absent instead of failing with AttributeError
            'pose': obj.findtext('pose', default='Unspecified'),
            'truncated': obj.findtext('truncated', default='Unspecified'),
            'difficult': obj.findtext('difficult', default='Unspecified'),
            'bbox': {
                'xmin': to_tile(bbox.find('xmin').text, src_width, x_shift),
                'xmax': to_tile(bbox.find('xmax').text, src_width, x_shift),
                'ymin': to_tile(bbox.find('ymin').text, src_height, y_shift),
                'ymax': to_tile(bbox.find('ymax').text, src_height, y_shift),
            }
        })

    str_objects = ""
    for obj in objects:
        obj_str = "\t<object>\n"
        for key, value in obj.items():
            if key == 'bbox':
                obj_str += "\t\t<bndbox>\n"
                for bbox_key, bbox_value in value.items():
                    obj_str += f"\t\t\t<{bbox_key}>{bbox_value}</{bbox_key}>\n"
                obj_str += "\t\t</bndbox>\n"
            else:
                # escape &, < and > so that the merged file stays well-formed
                obj_str += f"\t\t<{key}>{escape(str(value))}</{key}>\n"
        obj_str += "\t</object>\n"
        str_objects += obj_str

    return str_objects
    
def generate_string_xml_annotation_beginning(folder_path, file_name):
    """Return the head of the merged annotation, up to the first object.

    The annotation `file_name` is used as a template: `<filename>` and
    `<path>` (each only when present) are replaced by `<folder name>.jpg` and
    the `<size>` width/height are set to `size`, the side of the merged image.

    Args:
        folder_path: batch folder; its last component names the merged image.
        file_name: name of the template XML annotation inside `folder_path`.

    Returns:
        The serialized XML before the first `<object>`. When the template has
        no object, everything before the closing root tag, so that objects
        can still be appended after it.
    """
    root = ET.parse(os.path.join(folder_path, file_name)).getroot()

    # basename/normpath also copes with a trailing slash, unlike split("/")[-1]
    merged_name = os.path.basename(os.path.normpath(folder_path)) + ".jpg"

    # Change filename and path by folder name; a template without <path>
    # no longer fails with AttributeError
    for tag in ('filename', 'path'):
        node = root.find(tag)
        if node is not None:
            node.text = merged_name

    # the merged image is always size x size, whatever the template image was
    for tag in ('width', 'height'):
        node = root.find(f'size/{tag}')
        if node is not None:
            node.text = str(size)

    xml_string = ET.tostring(root).decode()

    head, marker, _ = xml_string.partition("<object>")
    if marker:
        return head
    # no object in the template: cut just before the closing root tag
    return xml_string.rsplit(f"</{root.tag}>", 1)[0]

def generate_string_xml_annotation_end(folder_path, file_name):
    """Return the tail of the merged annotation, after the last object.

    Args:
        folder_path: directory that contains the template annotation.
        file_name: name of the template XML annotation inside `folder_path`.

    Returns:
        The serialized XML that follows the last `</object>`. When the
        template has no object, only the closing root tag is returned;
        splitting on a missing marker would return the whole document.
    """
    root = ET.parse(os.path.join(folder_path, file_name)).getroot()

    xml_string = ET.tostring(root).decode()

    _, marker, tail = xml_string.rpartition("</object>")
    if marker:
        return tail
    return f"</{root.tag}>"
    

def merge_images(folder_path, output_path):
    """Tile the images of one batch folder into a 2x2 image and merge their annotations.

    Images are placed in sorted file-name order, left to right then top to
    bottom. Each image is paired with the annotation that shares its stem, so
    the result does not depend on the order returned by os.listdir. The first
    annotation (in sorted order) provides the header and footer of the merged
    XML.

    Args:
        folder_path: batch folder with up to four .jpg files and their .xml files.
        output_path: existing folder that receives `<folder name>.jpg` and
            `<folder name>.xml`.

    Raises:
        ValueError: if the folder has no annotation or more than four images.
        OSError: if the merged image cannot be written.
    """
    # create a white merged image (255 in every channel)
    merged_image = 255 * np.ones(shape=[size, size, 3], dtype=np.uint8)
    x_offset = 0
    y_offset = 0

    # os.listdir order is arbitrary: sort, then split by extension
    entries = sorted(os.listdir(folder_path))
    image_files = [entry for entry in entries if entry.endswith(".jpg")]
    annotation_files = [entry for entry in entries if entry.endswith(".xml")]

    if not annotation_files:
        raise ValueError(f"No .xml annotation found in {folder_path}")
    if len(image_files) > 4:
        raise ValueError(f"Expected at most 4 images in {folder_path}, found {len(image_files)}")

    # basename/normpath also copes with a trailing slash, unlike split("/")[-1]
    batch_name = os.path.basename(os.path.normpath(folder_path))

    template_file = annotation_files[0]
    annotation = generate_string_xml_annotation_beginning(folder_path, template_file)

    # paste every image and append its boxes using the same tile index
    for tile_idx, image_name in enumerate(image_files):
        merged_image, x_offset, y_offset = merge_image(folder_path, image_name, merged_image, x_offset, y_offset, tile_idx)

        annotation_name = os.path.splitext(image_name)[0] + ".xml"
        if annotation_name in annotation_files:
            annotation += generate_bounding_boxes_as_xml(folder_path, annotation_name, tile_idx)

    # save the merged image; cv2.imwrite reports failure through its return value
    output_image_path = os.path.join(output_path, batch_name + ".jpg")
    if not cv2.imwrite(output_image_path, merged_image):
        raise OSError(f"Could not write merged image: {output_image_path}")

    # save the merged annotations
    annotation += generate_string_xml_annotation_end(folder_path, template_file)
    with open(os.path.join(output_path, batch_name + ".xml"), "w", encoding="utf-8") as f:
        f.write(annotation)
    

# create the output directory (and any missing parents); safe to run again
os.makedirs(output_dir, exist_ok=True)

# call merge_images for every batch folder of the input directory, in sorted
# order so that runs are reproducible; plain files are skipped
for entry_name in tqdm(sorted(os.listdir(input_dir))):
    input_path = os.path.join(input_dir, entry_name)
    if os.path.isdir(input_path):
        # merge the images in the folder
        merge_images(input_path, output_dir)


100%|██████████| 385/385 [00:31<00:00, 12.37it/s]


## Merge results and clean

### Guns

In [32]:
import os
import shutil
from tqdm import tqdm

# Root folder of the guns dataset; the stage folders are read from below it
input_path = '/media/rodri/Files/Datasets/Final_Dataset/guns'

# Make sure the upload folder exists before anything is copied into it
os.makedirs(input_path + "/upload", exist_ok=True)

# Folder generator
def file_generator(folder_path):
    """Yield the name of every entry of `folder_path`, in os.listdir order."""
    yield from os.listdir(folder_path)

upload_dir = f"{input_path}/upload"

# Copy all content of train and output3 to upload
for source_name in ("train", "output3"):
    source_dir = f"{input_path}/{source_name}"
    for file in tqdm(file_generator(source_dir), desc=f"Copying {source_name}"):
        source_file = f"{source_dir}/{file}"
        # shutil.copy cannot copy a directory; only regular files are uploaded
        if os.path.isfile(source_file):
            shutil.copy(source_file, upload_dir)

# The next step deletes the sources for good, so first confirm that every
# file really is in the upload folder
uploaded_names = set(os.listdir(upload_dir))
for source_name in ("train", "output3"):
    source_dir = f"{input_path}/{source_name}"
    missing = [
        file for file in os.listdir(source_dir)
        if os.path.isfile(f"{source_dir}/{file}") and file not in uploaded_names
    ]
    if missing:
        raise RuntimeError(f"{len(missing)} files of {source_dir} are missing from {upload_dir}")

# Remove all generated folders (a folder that is already gone is skipped)
for generated_name in ("train", "output", "output2", "output3"):
    generated_dir = f"{input_path}/{generated_name}"
    if os.path.isdir(generated_dir):
        shutil.rmtree(generated_dir)

# Print the number of files in the upload folder
print(f"Number of files in upload folder: {len(os.listdir(f'{input_path}/upload'))}")

Copying train: 8000it [05:32, 24.02it/s]
Copying output3: 2000it [01:11, 27.85it/s]


Number of files in upload folder: 10000


### Knives

In [6]:
import os
import shutil
from tqdm import tqdm

# Root folder of the knives dataset; the stage folders are read from below it
input_path = '/media/rodri/Files/Datasets/Final_Dataset/knives'

# Make sure the upload folder exists before anything is copied into it
os.makedirs(input_path + "/upload", exist_ok=True)

# Folder generator
def file_generator(folder_path):
    """Yield the name of every entry of `folder_path`, in os.listdir order."""
    yield from os.listdir(folder_path)

upload_dir = f"{input_path}/upload"

# Copy all content of train and output3 to upload
for source_name in ("train", "output3"):
    source_dir = f"{input_path}/{source_name}"
    for file in tqdm(file_generator(source_dir), desc=f"Copying {source_name}"):
        source_file = f"{source_dir}/{file}"
        # shutil.copy cannot copy a directory; only regular files are uploaded
        if os.path.isfile(source_file):
            shutil.copy(source_file, upload_dir)

# The next step deletes the sources for good, so first confirm that every
# file really is in the upload folder
uploaded_names = set(os.listdir(upload_dir))
for source_name in ("train", "output3"):
    source_dir = f"{input_path}/{source_name}"
    missing = [
        file for file in os.listdir(source_dir)
        if os.path.isfile(f"{source_dir}/{file}") and file not in uploaded_names
    ]
    if missing:
        raise RuntimeError(f"{len(missing)} files of {source_dir} are missing from {upload_dir}")

# Remove all generated folders (a folder that is already gone is skipped)
for generated_name in ("train", "output", "output2", "output3"):
    generated_dir = f"{input_path}/{generated_name}"
    if os.path.isdir(generated_dir):
        shutil.rmtree(generated_dir)

# Print the number of files in the upload folder
print(f"Number of files in upload folder: {len(os.listdir(f'{input_path}/upload'))}")

Copying train: 9226it [04:03, 37.86it/s] 
Copying output3: 770it [00:30, 25.11it/s]


Number of files in upload folder: 9996


## Split upload folder into four batches

### Guns

In [1]:
import os
import shutil
from tqdm import tqdm

# Folder to split and the folder that receives the batch_<n> sub-folders
# NOTE(review): the "Merge results and clean" cell writes to guns/upload,
# while this cell reads guns/new_upload - confirm this is intended
input_path = '/media/rodri/Files/Datasets/Final_Dataset/guns/new_upload'
output_path = '/media/rodri/Files/Datasets/Final_Dataset/guns/upload_split'
# Listing index at which the next batch folder is opened
threshold = 0

# Create upload_split folder
os.makedirs(output_path, exist_ok=True)

# Folder generator
def file_generator(folder_path):
    """Yield `(index, name)` for every entry of `folder_path`, in os.listdir order."""
    yield from enumerate(os.listdir(folder_path))

# Split upload folder into batch folders of about 2500 files each.
# The files are sorted by (stem, extension) and a new batch is only opened
# when the stem changes, so an image and its annotation always end up in the
# same batch; os.listdir order alone gives no such guarantee.
files_per_batch = 2500
batch_number = 0
previous_stem = None
sorted_files = sorted((file for _, file in file_generator(f"{input_path}")), key=os.path.splitext)
for idx, file in enumerate(tqdm(sorted_files, desc="Splitting upload")):
    stem = os.path.splitext(file)[0]
    if batch_number == 0 or (idx >= threshold and stem != previous_stem):
        threshold = idx + files_per_batch
        batch_number += 1
        os.makedirs(f"{output_path}/batch_{batch_number}/", exist_ok=True)
    shutil.copy(f"{input_path}/{file}", f"{output_path}/batch_{batch_number}")
    previous_stem = stem

Splitting upload: 10000it [03:14, 51.45it/s]


### Knives

In [24]:
import os
import shutil
from tqdm import tqdm

# Folder to split and the folder that receives the batch_<n> sub-folders
input_path = '/media/rodri/Files/Datasets/Final_Dataset/knives/upload'
output_path = '/media/rodri/Files/Datasets/Final_Dataset/knives/upload_split'
# Listing index at which the next batch folder is opened
threshold = 0

# Create upload_split folder
os.makedirs(output_path, exist_ok=True)

# Folder generator
def file_generator(folder_path):
    """Yield `(index, name)` for every entry of `folder_path`, in os.listdir order."""
    yield from enumerate(os.listdir(folder_path))

# Split upload folder into batch folders of about 2500 files each.
# The files are sorted by (stem, extension) and a new batch is only opened
# when the stem changes, so an image and its annotation always end up in the
# same batch; os.listdir order alone gives no such guarantee.
files_per_batch = 2500
batch_number = 0
previous_stem = None
sorted_files = sorted((file for _, file in file_generator(f"{input_path}")), key=os.path.splitext)
for idx, file in enumerate(tqdm(sorted_files, desc="Splitting upload")):
    stem = os.path.splitext(file)[0]
    if batch_number == 0 or (idx >= threshold and stem != previous_stem):
        threshold = idx + files_per_batch
        batch_number += 1
        os.makedirs(f"{output_path}/batch_{batch_number}/", exist_ok=True)
    shutil.copy(f"{input_path}/{file}", f"{output_path}/batch_{batch_number}")
    previous_stem = stem

Splitting upload: 9996it [04:15, 39.17it/s] 
