<a href="https://colab.research.google.com/github/87tana/YOLOv8-Bone-Fracture-Detection-Model/blob/main/EDA_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I assess the label distribution, bounding-box statistics, image dimensions, and data quality of the dataset.

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)


# Navigate to the project directory
%cd '/content/drive/MyDrive/Project_Experiments/Bone_Fraction_Detection/'


# Define dataset path
dataset_path = '/content/drive/MyDrive/Project_Experiments/Bone_Fraction_Detection/Fraction_Detection_Dataset'

Mounted at /content/drive/
/content/drive/MyDrive/Project_Experiments/Bone_Fraction_Detection


In [2]:
# Install necessary libraries
!pip install -q ultralytics torch torchvision opencv-python pillow matplotlib tqdm

In [3]:
# Import required libraries
import os
import random
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt


from ultralytics import YOLO

In [4]:
# Train Dataset
train_images_dir = os.path.join(dataset_path, 'train/images')
train_labels_dir = os.path.join(dataset_path, 'train/labels')

# Sorted file names of the train images and of their label files.
# NOTE(review): only '.jpg' images are listed here, while the test split also
# accepts '.png' - confirm this is intended.
train_image_files = sorted(f for f in os.listdir(train_images_dir) if f.endswith('.jpg'))
train_label_files = sorted(f for f in os.listdir(train_labels_dir) if f.endswith('.txt'))

# Base names (file name without extension) are what pair an image with its label file
train_image_stems = {os.path.splitext(f)[0] for f in train_image_files}
train_label_stems = {os.path.splitext(f)[0] for f in train_label_files}

# Check that neither list is empty, that the counts match, and that every
# image has a label file with the same base name (and vice versa).
if not train_image_files or not train_label_files:
    print("Warning: No images or no labels found.")
elif len(train_image_files) != len(train_label_files):
    print("Warning: Number of images and labels do not match.")
elif train_image_stems != train_label_stems:
    unpaired = sorted(train_image_stems ^ train_label_stems)
    print(f"Warning: {len(unpaired)} file names have no matching image/label partner, e.g. {unpaired[:5]}")
else:
    print("Train Dataset is consistent.")

# Quick summary of train set
print(f"Number of train images: {len(train_image_files)}")
print(f"Number of train labels: {len(train_label_files)}")

Train Dataset is consistent.
Number of train images: 3779
Number of train labels: 3779


In [5]:
# Valid Dataset
validation_images_dir = os.path.join(dataset_path, 'valid/images')
validation_labels_dir = os.path.join(dataset_path, 'valid/labels')

# Sorted file names of the validation images and of their label files.
# NOTE(review): only '.jpg' images are listed here, while the test split also
# accepts '.png' - confirm this is intended.
validation_image_files = sorted(f for f in os.listdir(validation_images_dir) if f.endswith('.jpg'))
validation_label_files = sorted(f for f in os.listdir(validation_labels_dir) if f.endswith('.txt'))

# Base names (file name without extension) are what pair an image with its label file
validation_image_stems = {os.path.splitext(f)[0] for f in validation_image_files}
validation_label_stems = {os.path.splitext(f)[0] for f in validation_label_files}

# Check that neither list is empty, that the counts match, and that every
# image has a label file with the same base name (and vice versa).
if not validation_image_files or not validation_label_files:
    print("Warning: No images or no labels found.")
elif len(validation_image_files) != len(validation_label_files):
    print("Warning: Number of images and labels do not match.")
elif validation_image_stems != validation_label_stems:
    unpaired = sorted(validation_image_stems ^ validation_label_stems)
    print(f"Warning: {len(unpaired)} file names have no matching image/label partner, e.g. {unpaired[:5]}")
else:
    print("Validation Dataset is consistent.")

# Quick summary of validation set
print(f"Number of validation images: {len(validation_image_files)}")
print(f"Number of validation labels: {len(validation_label_files)}")

Validation Dataset is consistent.
Number of validation images: 835
Number of validation labels: 835


In [7]:
# Test Dataset
test_images_dir = os.path.join(dataset_path, 'test/images')
test_labels_dir = os.path.join(dataset_path, 'test/labels')

# Sorted file names of the test images (.jpg or .png) and of their label files
test_image_files = sorted(f for f in os.listdir(test_images_dir) if f.endswith(('.jpg', '.png')))
test_label_files = sorted(f for f in os.listdir(test_labels_dir) if f.endswith('.txt'))

# Base names (file name without extension) are what pair an image with its label file
test_image_stems = {os.path.splitext(f)[0] for f in test_image_files}
test_label_stems = {os.path.splitext(f)[0] for f in test_label_files}

# Check that neither list is empty, that the counts match, and that every
# image has a label file with the same base name (and vice versa).
if not test_image_files or not test_label_files:
    print("Warning: No images or no labels found.")
elif len(test_image_files) != len(test_label_files):
    print("Warning: Number of images and labels do not match.")
elif test_image_stems != test_label_stems:
    unpaired = sorted(test_image_stems ^ test_label_stems)
    print(f"Warning: {len(unpaired)} file names have no matching image/label partner, e.g. {unpaired[:5]}")
else:
    print("test Dataset is consistent.")

# Quick summary of test set
print(f"Number of test images: {len(test_image_files)}")
print(f"Number of test labels: {len(test_label_files)}")

test Dataset is consistent.
Number of test images: 841
Number of test labels: 841


# Comprehensive Dataset Overview

In [12]:
# Function to get the image size and bounding-box details from the annotations

def get_image_and_bboxes_info(image_file, label_file, images_dir, labels_dir):
    """Collect the image size and the bounding-box sizes for one image/label pair.

    Args:
        image_file: File name of the image inside ``images_dir``.
        label_file: File name of the annotation file inside ``labels_dir``.
        images_dir: Directory that contains ``image_file``.
        labels_dir: Directory that contains ``label_file``.

    Returns:
        Tuple ``(image_file, image_width, image_height, bbox_count,
        bbox_widths, bbox_heights)``. The image size comes from the decoded
        image; ``bbox_widths`` / ``bbox_heights`` hold the 4th and 5th value of
        every annotation line as floats, exactly as stored in the label file.

    Raises:
        OSError: If ``cv2.imread`` cannot load the image.
        ValueError: If a non-blank annotation line does not consist of exactly
            five numeric fields.
    """
    # Read the image to get its size (w, h)
    image_path = os.path.join(images_dir, image_file)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise OSError(f"Could not read image: {image_path}")
    image_height, image_width = image.shape[:2]

    bbox_widths = []
    bbox_heights = []

    # Read the annotation file to get the bounding boxes
    label_path = os.path.join(labels_dir, label_file)
    with open(label_path, 'r') as f:
        for line_number, annotation in enumerate(f, start=1):
            fields = annotation.split()
            if not fields:
                # Blank / whitespace-only lines carry no box; skip them
                continue
            if len(fields) != 5:
                raise ValueError(
                    f"{label_path}:{line_number}: expected 5 fields "
                    f"(class x_center y_center width height), got {len(fields)}"
                )
            # Only the box width and height are needed here
            _class_id, _x_center, _y_center, bbox_width, bbox_height = map(float, fields)
            bbox_widths.append(bbox_width)
            bbox_heights.append(bbox_height)

    return image_file, image_width, image_height, len(bbox_widths), bbox_widths, bbox_heights

# Function to create the dataframe for a given subset (train, valid, test)

def create_subset_dataframe(image_files, label_files, images_dir, labels_dir):
    """Build a per-image summary dataframe for one dataset subset.

    Images and labels are paired by position, so both sequences must have the
    same length and the files at each position must share a base name.

    Args:
        image_files: Image file names inside ``images_dir``.
        label_files: Label file names inside ``labels_dir``, in the same order.
        images_dir: Directory that contains the images.
        labels_dir: Directory that contains the label files.

    Returns:
        DataFrame with the columns ``filename``, ``image_width``,
        ``image_height``, ``bbox_count``, ``bbox_widths``, ``bbox_heights``,
        ``avg_bbox_width`` and ``avg_bbox_height`` (averages are 0 for images
        without any bounding box).

    Raises:
        ValueError: If the two sequences differ in length, or if an image and
            the label at the same position have different base names.
    """
    image_files = list(image_files)
    label_files = list(label_files)

    # zip() would silently drop the surplus entries, so check the lengths first
    if len(image_files) != len(label_files):
        raise ValueError(
            f"Got {len(image_files)} images but {len(label_files)} labels"
        )
    # Guard against shifted pairs: a label belongs to the image with the same base name
    for image_file, label_file in zip(image_files, label_files):
        if os.path.splitext(image_file)[0] != os.path.splitext(label_file)[0]:
            raise ValueError(
                f"Image/label mismatch: {image_file!r} is paired with {label_file!r}"
            )

    data = [
        get_image_and_bboxes_info(image_file, label_file, images_dir, labels_dir)
        for image_file, label_file in zip(image_files, label_files)
    ]

    # Create dataframe with columns: filename, image size, bbox count and the per-box width/height lists
    df = pd.DataFrame(data, columns=[
        'filename', 'image_width', 'image_height', 'bbox_count', 'bbox_widths', 'bbox_heights'
    ])

    def _mean_or_zero(values):
        # Images without boxes get 0 instead of a division by zero
        return sum(values) / len(values) if len(values) > 0 else 0

    # Calculate average bounding box size for each image
    df['avg_bbox_width'] = df['bbox_widths'].apply(_mean_or_zero)
    df['avg_bbox_height'] = df['bbox_heights'].apply(_mean_or_zero)

    return df

In [13]:
# Per-image summary (image size, box count, box sizes) for the train split
train_df = create_subset_dataframe(
    image_files=train_image_files,
    label_files=train_label_files,
    images_dir=train_images_dir,
    labels_dir=train_labels_dir,
)


In [14]:
# Number of train image files that were passed to create_subset_dataframe
num_train_images = len(train_image_files)
print(num_train_images)


3779


In [15]:
# Show the first rows of the train dataframe under a short heading
print("Train Dataset:", train_df.head(), sep="\n")

Train Dataset:
                                            filename  image_width  \
0  0_wny3n8ot_jpg.rf.1f2df7789afda614056522ea95e8...          640   
1  0_wny3n8ot_jpg.rf.2e107e048b63fdfabebb0e69def3...          640   
2  0_wny3n8ot_jpg.rf.a686302d18f8466853f7c8cfd830...          640   
3  2021_04_07_19_24_6328_2021_04_07_Ulnar_Fractur...          640   
4  2021_04_07_19_24_6328_2021_04_07_Ulnar_Fractur...          640   

   image_height  bbox_count               bbox_widths            bbox_heights  \
0           640           1              [0.20859375]            [0.12890625]   
1           640           1               [0.1421875]              [0.215625]   
2           640           1               [0.1578125]             [0.2234375]   
3           640           2  [0.16796875, 0.16796875]  [0.2140625, 0.1578125]   
4           640           2        [0.19375, 0.19375]  [0.2328125, 0.1859375]   

   avg_bbox_width  avg_bbox_height  
0        0.208594         0.128906  
1        

In [18]:
from tabulate import tabulate

# Render the last five rows of the train dataframe as a bordered text table
last_rows = train_df.tail(5)
table_text = tabulate(last_rows, headers='keys', tablefmt='pretty')
print("Train Dataset:")
print(table_text)



Train Dataset:
+------+------------------------------------------------------------------------------------------------+-------------+--------------+------------+--------------+--------------+----------------+-----------------+
|      |                                            filename                                            | image_width | image_height | bbox_count | bbox_widths  | bbox_heights | avg_bbox_width | avg_bbox_height |
+------+------------------------------------------------------------------------------------------------+-------------+--------------+------------+--------------+--------------+----------------+-----------------+
| 3774 | xray-of-a-hand-of-a-patient-showing-fractured-bone_jpg.rf.7a9f1d579488d1994d32cce9cfa2bee4.jpg |     640     |     640      |     1      | [0.05859375] | [0.1265625]  |   0.05859375   |    0.1265625    |
| 3775 | xray-of-a-hand-of-a-patient-showing-fractured-bone_jpg.rf.c45694260f8f5b8ad7db7cee4d474d2a.jpg |     640     |     640      