## Pre Processing

### Collating & Organizing Roboflow Datasets

In [31]:
import os
import shutil
from dotenv import load_dotenv

load_dotenv()
DATA_ROOT = os.getenv("DATA_ROOT")

docs_path = os.path.join(DATA_ROOT, "docs")
os.makedirs(docs_path, exist_ok=True)

In [32]:
# For each dataset folder
for directory in os.listdir(DATA_ROOT):
    dataset_path = os.path.join(DATA_ROOT, directory)
    if os.path.isdir(dataset_path):
        print(f"Directory: {directory}")

    # for each item in the dataset folder
    for content in os.listdir(dataset_path):
        if content.endswith(".yaml"):
            # Delete any existing YAML files
            file_path = os.path.join(dataset_path, content)
            try: 
                os.remove(file_path)
            except FileNotFoundError:
                pass
        elif content.startswith("README"):
            # move this file to the docs folder folder
            source = os.path.join(dataset_path, content)
            destination = os.path.join(docs_path, content)
            try:
                shutil.move(source, destination)
            except FileNotFoundError:
                pass
        
    for cont in os.listdir(dataset_path):
        print(f"Item: {cont}")
    
    print("\n")

Directory: abnormal-wood
Item: train


Directory: abnormal-wood-d2
Item: test
Item: train
Item: valid


Directory: bantalan
Item: test
Item: train
Item: valid


Directory: docs
Item: README.dataset.txt
Item: README.roboflow.txt


Directory: minejv9fu-concrete
Item: test
Item: train
Item: valid


Directory: rail-sleepers-final


Directory: sleeper3-7data
Item: test
Item: train
Item: valid


Directory: standardrail-sleeper
Item: test
Item: train
Item: valid


Directory: trilhos-wood
Item: test
Item: train
Item: valid




In [33]:
'''
for each dataset folder:
    if there exists subdirectories
        for each subdir:
            delete the labels folder
'''

for directory in os.listdir(DATA_ROOT):
    print(f"Directory name: {directory}")
    dir_path = os.path.join(DATA_ROOT, directory)

    # train, test or valid
    folder_list = ["test", "train", "valid"]
    for folder in os.listdir(dir_path):
        if folder in folder_list:
            subdir_path = os.path.join(dir_path, folder)
            for subfolder in os.listdir(subdir_path):
                if subfolder == "labels":
                    try:
                        labels_path = os.path.join(subdir_path, subfolder)
                        shutil.rmtree(labels_path)
                        print(f"Successfully removed labels from: {subdir_path}!")
                    except FileNotFoundError:
                        pass
                elif subfolder == "images":
                    try:
                        images_path = os.path.join(subdir_path, subfolder)
                        for image in os.listdir(images_path):
                            if image.lower().endswith(('.jpg', '.jpeg', '.png')):
                                src = os.path.join(images_path, image)
                                dst = os.path.join(subdir_path, image)
                                shutil.move(src, dst)
                                print(f"Moved: {image}")
                        
                        shutil.rmtree(images_path)
                        print(f"Deleted empty folder: {images_path}")
                    
                    except Exception as e:
                        print(f"Error moving from {images_path}: {e}")
   
    print("\n")

Directory name: abnormal-wood


Directory name: abnormal-wood-d2


Directory name: bantalan


Directory name: docs


Directory name: minejv9fu-concrete


Directory name: rail-sleepers-final


Directory name: sleeper3-7data


Directory name: standardrail-sleeper


Directory name: trilhos-wood




In [35]:
# Final Dataset folder
final_dataset_path = os.path.join(DATA_ROOT, "rail-sleepers-final")
os.makedirs(final_dataset_path, exist_ok=True)

# For each dataset folder --> take all the images from the subfolder and move all images in the subfolder to the dataset folder
for directory in os.listdir(DATA_ROOT):
    print(f"Directory name: {directory}")
    dir_path = os.path.join(DATA_ROOT, directory)
    
    for subfolder in os.listdir(dir_path):
        print(f"Subfolder name: {subfolder}")
        subfolder_path = os.path.join(dir_path, subfolder)

        if os.path.isdir(subfolder_path):
            for image in os.listdir(subfolder_path):
                print(f"Image file: {image}")
                img_path = os.path.join(subfolder_path, image) 
                if image.lower().endswith(('.jpg', '.jpeg', '.png')):
                    dst = os.path.join(dir_path, image)
                    shutil.move(img_path, dst)
                    print(f"Moved {image} from {img_path} to {dst}")
    print("\n")


Directory name: abnormal-wood
Subfolder name: train
Image file: Rail-11_000001_jpg.rf.1b157a47106a2f4790f8f1ec4312ce1a.jpg
Moved Rail-11_000001_jpg.rf.1b157a47106a2f4790f8f1ec4312ce1a.jpg from C:\Users\advay\CertificationProjects\NSF-URDA-REU-RailroadFaultDetection\data\abnormal-wood\train\Rail-11_000001_jpg.rf.1b157a47106a2f4790f8f1ec4312ce1a.jpg to C:\Users\advay\CertificationProjects\NSF-URDA-REU-RailroadFaultDetection\data\abnormal-wood\Rail-11_000001_jpg.rf.1b157a47106a2f4790f8f1ec4312ce1a.jpg
Image file: Rail-11_000002_jpg.rf.c200c353a5731a667132e91c2f78d2e7.jpg
Moved Rail-11_000002_jpg.rf.c200c353a5731a667132e91c2f78d2e7.jpg from C:\Users\advay\CertificationProjects\NSF-URDA-REU-RailroadFaultDetection\data\abnormal-wood\train\Rail-11_000002_jpg.rf.c200c353a5731a667132e91c2f78d2e7.jpg to C:\Users\advay\CertificationProjects\NSF-URDA-REU-RailroadFaultDetection\data\abnormal-wood\Rail-11_000002_jpg.rf.c200c353a5731a667132e91c2f78d2e7.jpg
Image file: Rail-11_000003_jpg.rf.8f21f92095

In [37]:
for directory in os.listdir(DATA_ROOT):
    print(f"Directory name: {directory}")
    dir_path = os.path.join(DATA_ROOT, directory)

    # train, test or valid
    folder_list = ["test", "train", "valid"]
    for folder in os.listdir(dir_path):
        folder_path = os.path.join(dir_path, folder)

        if (folder in folder_list) and (os.path.isdir(folder_path)):
            # Delete the train, test, folder
            try:
                shutil.rmtree(folder_path)# This removes folder + all contents
                print(f"Successfully deleted: {folder_path}")
            except OSError as e:
                print(f"Could not delete {folder_path}: {e}")

Directory name: abnormal-wood
Directory name: abnormal-wood-d2
Directory name: bantalan
Directory name: docs
Directory name: minejv9fu-concrete
Directory name: rail-sleepers-final
Directory name: sleeper3-7data
Directory name: standardrail-sleeper
Directory name: trilhos-wood


In [51]:
for directory in os.listdir(DATA_ROOT):
    dir_path = os.path.join(DATA_ROOT, directory)
    file_count = 0
    for _, _, files in os.walk(dir_path):
        file_count += len(files)
    print(f"Dataset Name: {directory.ljust(25)}|   # of Files: {str(file_count).ljust(5)}")


Dataset Name: abnormal-wood            |   # of Files: 1647 
Dataset Name: abnormal-wood-d2         |   # of Files: 2512 
Dataset Name: bantalan                 |   # of Files: 2183 
Dataset Name: docs                     |   # of Files: 2    
Dataset Name: minejv9fu-concrete       |   # of Files: 1004 
Dataset Name: rail-sleepers-final      |   # of Files: 0    
Dataset Name: sleeper3-7data           |   # of Files: 4305 
Dataset Name: standardrail-sleeper     |   # of Files: 1792 
Dataset Name: trilhos-wood             |   # of Files: 3032 
