In [None]:
import os
import shutil
import random
import csv
import json
from sklearn.model_selection import train_test_split

In [None]:
# Splitting the dataset into train, test, and unmatched 

# Define the directory paths
dataset_path = 'dataset'
train_path = 'train'
test_path = 'test'
unmatched_path = 'unmatched'

# Create the train, test, and unmatched directories (no-op when they already exist)
for path in [train_path, test_path, unmatched_path]:
    os.makedirs(path, exist_ok=True)

# Collect all image and annotation filenames from a single directory listing.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so both
# lists are sorted: without that, the seeded split below would still differ
# from machine to machine because its input order would differ.
dataset_files = os.listdir(dataset_path)
images = sorted(f for f in dataset_files if f.endswith('.jpeg'))
annotations = sorted(f for f in dataset_files if f.endswith('.json'))

# Set copy of the annotation names so each lookup below is O(1) instead of a list scan
annotation_names = set(annotations)

# Lists for matched (image, annotation) pairs and for images without an annotation.
# Assumption: every annotation has a corresponding image, but not every image has an annotation.
# Assumption: a match is the same file name with a different file extension.
matched = []
unmatched = []

# Match each image with its annotation
for image in images:
    base_file_name = image.rsplit('.', 1)[0]
    annotation = base_file_name + '.json'
    if annotation in annotation_names:
        matched.append((image, annotation))
    else:
        unmatched.append(image)

# 80:20 Train/Test Split
# Random State 42 (reproducible because `matched` is built from a sorted listing)
train_pairs, test_pairs = train_test_split(matched, test_size=0.2, random_state=42)

# Helper func to move files
def move_files(file_pairs, destination):
    """Move (image, annotation) filename pairs into ``destination``.

    Args:
        file_pairs: Iterable of 2-item sequences ``(image_name, annotation_name)``.
            Both names are resolved relative to the module-level ``dataset_path``.
        destination: Directory that receives both files of every pair.
    """
    for pair in file_pairs:
        img_file, annotation = pair
        # Image first, then its annotation, each keeping its original file name.
        for name in (img_file, annotation):
            shutil.move(os.path.join(dataset_path, name), os.path.join(destination, name))

# Relocate the matched pairs of each split into that split's directory
for split_pairs, split_dir in ((train_pairs, train_path), (test_pairs, test_path)):
    move_files(split_pairs, split_dir)

# Move unmatched images to a separate directory
# TODO: can these images still be used for training, or do they need annotations first?
for img_file in unmatched:
    source_file = os.path.join(dataset_path, img_file)
    shutil.move(source_file, os.path.join(unmatched_path, img_file))

# Every pair contributes two files: the image and its annotation
train_file_count = len(train_pairs) * 2
test_file_count = len(test_pairs) * 2
print(f'Training Files: {train_file_count}')
print(f'Testing Files: {test_file_count}')
print(f'Unmatched Images: {len(unmatched)}')

In [None]:
# After splitting, spot-check that a randomly chosen image or annotation in train/test has its matching counterpart file
# Also check that there are no json files in the unmatched directory

def check_matching_files(directory):
    """Spot-check that one randomly chosen file in ``directory`` has its counterpart.

    A file type ('json' or 'jpeg') is picked at random, then a random file of
    that type; the check passes when a file with the same name but the other
    extension is present in the same directory.

    Args:
        directory: Directory containing ``.jpeg`` images and ``.json`` annotations.

    Returns:
        ``"Empty"`` when the directory has no ``.json`` files or no ``.jpeg`` files,
        ``"Success"`` when the counterpart exists, otherwise
        ``"Big problem!!!! <chosen file>"``.
    """
    # List all files
    files = os.listdir(directory)
    # Separate images and annotations
    json_files = [f for f in files if f.endswith('.json')]
    jpeg_files = [f for f in files if f.endswith('.jpeg')]

    # Check directory not empty
    if not json_files or not jpeg_files:
        return "Empty"

    # Randomly choose a file type, then a file of that type
    chosen_file_type = random.choice(['json', 'jpeg'])
    if chosen_file_type == 'json':
        candidates, counterparts = json_files, jpeg_files
        own_ext, other_ext = '.json', '.jpeg'
    else:
        candidates, counterparts = jpeg_files, json_files
        own_ext, other_ext = '.jpeg', '.json'
    chosen_file = random.choice(candidates)

    # Swap only the trailing extension. str.replace() would also rewrite an
    # extension-like fragment earlier in the name (e.g. 'x.json_v2.json') and
    # report a false mismatch.
    corresponding_file = chosen_file[:-len(own_ext)] + other_ext

    # Check if the corresponding file exists
    if corresponding_file in set(counterparts):
        return "Success"
    return f"Big problem!!!! {chosen_file}"

def check_unmatched_directory(directory):
    """Report whether ``directory`` contains any ``.json`` files.

    Args:
        directory: Directory expected to hold only images without annotations.

    Returns:
        ``"Success"`` when no ``.json`` file is present, otherwise a message
        listing the ``.json`` file names separated by ``", "``.
    """
    stray_annotations = [name for name in os.listdir(directory) if name.endswith('.json')]
    if not stray_annotations:
        return "Success"
    return f"There are json files: {', '.join(stray_annotations)}"

train_dir, test_dir, unmatched_dir = 'train', 'test', 'unmatched'

# The matching check samples one random file per call, so repeat it N times
N = 10
for i in range(N):
    # Run all three checks first, then report them together
    train_check = check_matching_files(train_dir)
    test_check = check_matching_files(test_dir)
    unmatched_check = check_unmatched_directory(unmatched_dir)

    for label, outcome in (
        ("Train Check:", train_check),
        ("Test Check:", test_check),
        ("Unmatched Check:", unmatched_check),
    ):
        print(label, outcome)

In [None]:
# Extracting the names from the spareit labelling csv
def extract_names(file_path):
    """Return the first-column value of every non-empty data row in a CSV file.

    Args:
        file_path: Path to a UTF-8 encoded CSV file whose first row is a header.

    Returns:
        List of first-column strings, in file order. The header row and
        completely empty rows are skipped.
    """
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # discard the header row; the default keeps an empty file from raising
        return [record[0] for record in rows if record]

# Labelling reference CSV; extract_names() skips its header row and collects
# the first column of every remaining non-empty row.
file_path = 'Spare-It Labelling Reference for 3d Party.csv'
names = extract_names(file_path)

# print(names)
# len(names)

In [None]:
# Split json and jpeg 
def organize_dataset(base_dir):
    """Sort the files of the train and test splits into per-type subdirectories.

    For each of ``<base_dir>/train`` and ``<base_dir>/test``:

    * ``.jpeg`` files are moved into ``<split>/images``
    * ``.json`` files are moved into ``<split>/annotations``
    * ``<split>/labels`` is created and left empty

    The JSON files go to ``annotations`` rather than ``labels`` because the
    conversion cells in this notebook read JSON from ``<split>/annotations``
    and write the generated YOLO ``.txt`` files to ``<split>/labels``.

    Args:
        base_dir: Directory containing the ``train`` and ``test`` directories.
            An empty string resolves them relative to the current working
            directory.
    """
    for dataset_type in ['train', 'test']:
        dataset_path = os.path.join(base_dir, dataset_type)

        images_dir = os.path.join(dataset_path, 'images')
        annotations_dir = os.path.join(dataset_path, 'annotations')
        labels_dir = os.path.join(dataset_path, 'labels')
        for subdir in (images_dir, annotations_dir, labels_dir):
            os.makedirs(subdir, exist_ok=True)

        # os.listdir() returns a full list up front, so moving files while
        # iterating does not disturb the loop.
        for filename in os.listdir(dataset_path):
            source = os.path.join(dataset_path, filename)
            if filename.endswith('.jpeg'):
                shutil.move(source, images_dir)
            elif filename.endswith('.json'):
                shutil.move(source, annotations_dir)

# With an empty base_dir, os.path.join() resolves 'train' and 'test' relative
# to the current working directory.
base_dir = ''  
organize_dataset(base_dir)

In [None]:
# Convert JSON to YOLO format (txt)
def extract_categories(json_file):
    """Read the ``categories`` list of an annotation JSON file.

    The raw ``categories`` value is printed, then reduced to id/name pairs.

    Args:
        json_file: Path to a JSON file with a top-level ``categories`` list
            whose entries have ``id`` and ``name`` keys.

    Returns:
        List of ``{"id": ..., "name": ...}`` dictionaries, in file order.
    """
    with open(json_file) as handle:
        payload = json.load(handle)
    categories = payload['categories']
    print(categories)

    summary = []
    for entry in categories:
        summary.append({"id": entry["id"], "name": entry["name"]})
    return summary

def convert_json_to_yolo(json_file, output_dir, categories):
    """Write one YOLO-style label file for a single annotation JSON file.

    Each annotation becomes a line ``<class> <x_center> <y_center> <width> <height>``
    where the box values are divided by the width/height of the first entry in
    ``images``. The output file is ``<output_dir>/<json base name>.txt``.

    Args:
        json_file: Path to a JSON file with ``annotations`` (each having
            ``category_id`` and ``bbox``) and ``images`` entries.
        output_dir: Directory for the ``.txt`` file; created if missing.
        categories: List of dictionaries with an ``id`` key; a category's
            position in this list is the class index written to the file.

    NOTE(review): ``bbox`` is treated as [x_min, y_min, width, height] and all
    annotations are scaled by ``images[0]`` - this assumes one image per JSON
    file; confirm against the dataset.
    """
    with open(json_file) as src:
        data = json.load(src)

    # Position of each category in `categories` becomes its class index.
    class_index_by_id = {}
    for position, category in enumerate(categories):
        class_index_by_id[category["id"]] = position

    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(json_file))[0]
    txt_path = os.path.join(output_dir, stem + '.txt')

    with open(txt_path, 'w') as out:
        for ann in data['annotations']:
            if ann['category_id'] in class_index_by_id:
                class_index = class_index_by_id[ann['category_id']]
                bbox = ann['bbox']
                image = data['images'][0]
                # Box centre and size, normalised by the image dimensions.
                x_center = (bbox[0] + bbox[2] / 2) / image['width']
                y_center = (bbox[1] + bbox[3] / 2) / image['height']
                box_width = bbox[2] / image['width']
                box_height = bbox[3] / image['height']
                out.write(f"{class_index} {x_center} {y_center} {box_width} {box_height}" + '\n')
            else:
                # Annotations with a category outside `categories` are reported and skipped.
                print(f"Unknown category_id {ann['category_id']} in {json_file}")

def process_all_json_files(directory, output_dir, categories=None):
    """Convert every ``.json`` file in ``directory`` to a YOLO label file.

    Args:
        directory: Directory containing the annotation ``.json`` files.
        output_dir: Directory that receives one ``.txt`` file per JSON file.
        categories: Optional list of ``{"id": ..., "name": ...}`` dictionaries
            defining the class-index order. When omitted, the categories are
            read from the alphabetically first JSON file in ``directory``.
            Pass the same list for every split to guarantee identical class
            indices across splits.

    Returns:
        None. Prints a message and returns early when ``directory`` holds no
        ``.json`` files.
    """
    # os.listdir() order is arbitrary; sort one listing so the file that
    # supplies the category order is the same on every run and machine.
    json_filenames = sorted(f for f in os.listdir(directory) if f.endswith('.json'))
    if not json_filenames:
        print("No JSON files found in the directory.")
        return

    if categories is None:
        categories = extract_categories(os.path.join(directory, json_filenames[0]))

    for filename in json_filenames:
        json_file = os.path.join(directory, filename)
        convert_json_to_yolo(json_file, output_dir, categories)

# Convert every annotation JSON in train/annotations into a .txt label file in train/labels
annotations_dir = 'train/annotations' 
output_dir = 'train/labels'
process_all_json_files(annotations_dir, output_dir)

In [None]:
# Convert every annotation JSON in test/annotations into a .txt label file in test/labels
annotations_dir = 'test/annotations' 
output_dir = 'test/labels'
process_all_json_files(annotations_dir, output_dir)

In [None]:
# Extract the category names for the data.yaml file
def extract_categories(json_file):
    """Print the category names of an annotation JSON file and return its categories.

    This definition rebinds the name ``extract_categories``, which
    ``process_all_json_files`` looks up at call time. It therefore returns the
    same id/name dictionaries as the earlier definition, so re-running a
    conversion cell after this one keeps working instead of receiving ``None``.

    Args:
        json_file: Path to a JSON file with a top-level ``categories`` list
            whose entries have ``id`` and ``name`` keys.

    Returns:
        List of ``{"id": ..., "name": ...}`` dictionaries, in file order.
    """
    with open(json_file) as f:
        data = json.load(f)
    categories = data['categories']
    # The printed list of names is what gets copied into data.yaml
    print([cat["name"] for cat in categories])
    return [{"id": cat["id"], "name": cat["name"]} for cat in categories]

directory='train/annotations'
# os.listdir() order is arbitrary; sort so the same file is inspected on every run
first_json_file = next((f for f in sorted(os.listdir(directory)) if f.endswith('.json')), None)
if first_json_file is None:
    # Without this guard os.path.join(directory, None) raises TypeError
    print("No JSON files found in the directory.")
else:
    extract_categories(os.path.join(directory, first_json_file))