In [None]:
import csv
import os
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
from PIL import Image

In [None]:
def dicom_dimensions(input_folder):
    """
    Gets the dicom image dimensions

    Args:
        input_folder(str): A string containing the dicom folder location

    Returns:
        pandas Dataframe: One row per ``.dcm`` file in ``input_folder`` with the
        columns ``image_id`` (file name without extension), ``dicom_width``
        (the DICOM ``Columns`` attribute) and ``dicom_height`` (the DICOM
        ``Rows`` attribute). Empty, but with the same columns, when the folder
        holds no ``.dcm`` files.
    """
    columns = ['image_id', 'dicom_width', 'dicom_height']

    # Collect plain dicts and build the DataFrame once at the end: growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []

    # sorted() makes the row order independent of the file system listing order
    for imagename in sorted(os.listdir(input_folder)):
        if not imagename.endswith('.dcm'):
            continue

        # Only header attributes are read, so skip loading the pixel data
        dicom_image = pydicom.dcmread(os.path.join(input_folder, imagename),
                                      stop_before_pixels=True)

        records.append({
            'image_id': os.path.splitext(imagename)[0],
            # In DICOM, Columns is the horizontal extent (width) and
            # Rows is the vertical extent (height)
            'dicom_width': dicom_image.Columns,
            'dicom_height': dicom_image.Rows,
        })

    return pd.DataFrame(records, columns=columns)

In [None]:
def dicom_to_png_conversion(source_folder, target_folder, max_side=256):
    """
    Converts DICOM files in the source folder to .png image and saves them in a new folder
    Also creates a dataframe with PNG image dimensions

    Each image is shrunk so that its longer side equals ``max_side`` pixels while
    the aspect ratio is kept.

    Args:
        source_folder (str): DICOM images folder location
        target_folder (str): PNG images storage location (created if missing).
        max_side (int): Length in pixels of the longer side of every PNG.
    Returns:
        pandas Dataframe: One row per converted file with the columns ``image_id``
        (file name without extension), ``png_width`` and ``png_height``
    """
    columns = ['image_id', 'png_width', 'png_height']

    # Collected as dicts and turned into a DataFrame once; DataFrame.append
    # was removed in pandas 2.x and is quadratic when used in a loop.
    records = []

    os.makedirs(target_folder, exist_ok=True)

    # Iterating over all DICOM files in the folder (sorted for a stable row order)
    for imagename in sorted(os.listdir(source_folder)):
        if not imagename.endswith('.dcm'):
            continue

        dicom_image = pydicom.dcmread(os.path.join(source_folder, imagename))

        # In DICOM, Rows is the image height and Columns is the image width
        rows, cols = dicom_image.Rows, dicom_image.Columns

        # Fit the longer side to max_side; max(1, ...) keeps extreme aspect
        # ratios from producing a zero-pixel side
        if cols >= rows:
            width = max_side
            height = max(1, int(max_side * rows / cols))
        else:
            width = max(1, int(max_side * cols / rows))
            height = max_side

        # NOTE(review): pixel values are passed through unchanged (no windowing
        # or bit-depth conversion) - confirm the PNG consumers accept that.
        png_object = Image.fromarray(dicom_image.pixel_array)

        # Image resize - Pillow expects one (width, height) tuple
        png_object = png_object.resize((width, height))

        # Store the image as a PNG file
        image_id = os.path.splitext(imagename)[0]
        png_object.save(os.path.join(target_folder, image_id + '.png'))

        records.append({'image_id': image_id, 'png_width': width, 'png_height': height})

    return pd.DataFrame(records, columns=columns)

In [None]:
def train_test_val_split(df, source_folder_path, target_folder_path_images, target_folder_path_labels,
                         train_fraction=0.8, val_fraction=0.1, random_state=50):
    """
    Splits the images into train, validation and test sets, copies the PNG files
    and creates labels for YOLO model training

    The split is made on unique ``image_id`` values rather than on rows, so all
    bounding boxes of one image always end up in the same split and its label
    file contains every one of them.

    Args:
        df (dataframe): One row per bounding box with the columns ``image_id``,
            ``class_id``, ``x_mid``, ``y_mid``, ``bbox_width`` and ``bbox_height``
        source_folder_path (str): Folder holding the ``<image_id>.png`` files
        target_folder_path_images (list(str)): Image output folders in the order
            [train, validation, test]
        target_folder_path_labels (list(str)): Label output folders in the same
            order; a split without a matching entry gets no label files
        train_fraction (float): Share of the images placed in the train split
        val_fraction (float): Share of the images placed in the validation split;
            whatever remains forms the test split
        random_state (int): Seed used to shuffle the image ids
    """
    # Shuffling the image ids before splitting them
    shuffled_ids = (pd.Series(df['image_id'].unique())
                    .sample(frac=1, random_state=random_state)
                    .tolist())

    train_count = int(train_fraction * len(shuffled_ids))
    val_count = int(val_fraction * len(shuffled_ids))

    # Order matches the folder lists: train, validation, test
    splits = [
        shuffled_ids[:train_count],
        shuffled_ids[train_count:train_count + val_count],
        shuffled_ids[train_count + val_count:],
    ]

    # Copying the images to a new location
    for split_ids, image_folder in zip(splits, target_folder_path_images):
        _copy_split_images(split_ids, source_folder_path, image_folder)

    # Creating labels for every split that has a label folder
    for split_ids, label_folder in zip(splits, target_folder_path_labels):
        _write_split_labels(df, split_ids, label_folder)


def _copy_split_images(image_ids, source_folder, target_folder):
    """Copies ``<image_id>.png`` from source_folder to target_folder for each given id."""
    os.makedirs(target_folder, exist_ok=True)
    for image_id in image_ids:
        filename = f'{image_id}.png'
        source_file = os.path.join(source_folder, filename)
        # Ids without a PNG in the source folder are skipped
        if os.path.isfile(source_file):
            shutil.copy(source_file, os.path.join(target_folder, filename))


def _write_split_labels(df, image_ids, label_folder):
    """Writes one ``<image_id>.txt`` per image with a line per bounding box."""
    os.makedirs(label_folder, exist_ok=True)
    label_columns = ['class_id', 'x_mid', 'y_mid', 'bbox_width', 'bbox_height']

    split_rows = df[df['image_id'].isin(image_ids)]
    for image_id, image_df in split_rows.groupby('image_id', sort=False):
        with open(os.path.join(label_folder, f'{image_id}.txt'), 'w') as file:
            # Iterating column-wise keeps each column's own type, so an integer
            # class_id is written as "3" rather than "3.0"
            for values in zip(*(image_df[column] for column in label_columns)):
                file.write(' '.join(str(value) for value in values) + '\n')


Create the DICOM image dimensions dataframe

In [None]:
# Folder holding the source .dcm files (absolute path specific to this machine)
dicom_folder_path = '/Users/cibhi/AI Assessment/Coding/Dicom'

# One row per .dcm file: image_id, dicom_width, dicom_height
dicom = dicom_dimensions(input_folder=dicom_folder_path)

Create the PNG image dimensions dataframe after DICOM to PNG conversion

In [None]:
# Folder that receives the converted .png files (absolute path specific to this machine)
png_folder_path = '/Users/cibhi/AI Assessment/Coding/PNG'

# Converts every .dcm file and returns one row per image: image_id, png_width, png_height
png = dicom_to_png_conversion(
    source_folder=dicom_folder_path,
    target_folder=png_folder_path,
)

Read the train dataset

In [None]:
# Load the train dataset; the relative path is resolved against the working directory
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Plotting the number of identifications per class

# Count the rows of each class, ordered by class id so bars and labels line up
unique = df['class_id'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))

# Draw the bars once, explicitly on the axes created above
unique.plot(kind='bar', ax=ax)
ax.set_xlabel('Class ID')
ax.set_ylabel('Number of identifications')
ax.set_title('Identifications per class')

# Write each count just above its bar (bar i is drawn at x position i)
for i, j in enumerate(unique.values):
    ax.text(i, j + 1, str(j), ha='center')

plt.show()

Merge DICOM dimensions and PNG dimensions to the train dataframe

In [None]:
# Attach the DICOM and the PNG dimensions to every train row that has both
# (default inner joins on image_id)
merged = (
    df
    .merge(dicom, on='image_id')
    .merge(png, on='image_id')
)

Calculate scaling factor to resize bounding box parameters and calculate the bounding box for YOLO

In [None]:
# Ratio of PNG size to DICOM size along each axis
merged['Scale_y'] = merged['png_height'].div(merged['dicom_height'])
merged['Scale_x'] = merged['png_width'].div(merged['dicom_width'])

In [None]:
# Each entry: (new column, source coordinate, scale column, PNG extent column,
# value used for class_id 14).
# Coordinates are scaled to PNG pixels and then divided by the PNG extent, which
# yields fractions of the image size. Rows with class_id 14 get the fixed corners
# (0, 0) and (1, 1) instead - presumably the "no finding" class; TODO confirm.
_bbox_corner_specs = [
    ('new_x_min', 'x_min', 'Scale_x', 'png_width', 0),
    ('new_y_min', 'y_min', 'Scale_y', 'png_height', 0),
    ('new_x_max', 'x_max', 'Scale_x', 'png_width', 1),
    ('new_y_max', 'y_max', 'Scale_y', 'png_height', 1),
]

for _new_col, _coord_col, _scale_col, _extent_col, _fallback in _bbox_corner_specs:
    # Default arguments bind the current spec to the lambda
    merged[_new_col] = merged.apply(
        lambda row, c=_coord_col, s=_scale_col, e=_extent_col, f=_fallback: (
            f if row.class_id == 14 else ((row[c]) * (row[s])) / (row[e])
        ),
        axis=1,
    )

In [None]:
# Calculate bounding box parameters for YOLO model: box centre and box size.
# Plain column arithmetic replaces the row-by-row apply calls; the values are
# the same but are computed for all rows at once.
merged['x_mid'] = (merged['new_x_min'] + merged['new_x_max']) / 2
merged['y_mid'] = (merged['new_y_min'] + merged['new_y_max']) / 2
merged['bbox_width'] = merged['new_x_max'] - merged['new_x_min']
merged['bbox_height'] = merged['new_y_max'] - merged['new_y_min']

Selecting unique rows (at most 600 per class) for evenly distributed class identifications

In [None]:
def _first_600_distinct(class_rows):
    """Returns the first 600 rows of one class after dropping exact duplicate rows."""
    return class_rows.drop_duplicates().head(600)


# Cap every class at 600 distinct rows, flatten the per-class result back into
# one frame, drop rows that are still exact duplicates and keep at most 9000 rows
_rows_per_class = merged.groupby('class_id').apply(_first_600_distinct)
_rows_flat = _rows_per_class.reset_index(drop=True)
df_unique = _rows_flat.drop_duplicates().head(9000)

In [None]:
# Image output folders, in the order train, validation, test
target_folder_path_images = [
    '/Users/cibhi/AI Assessment/Coding/Yolov8_600/Train/images',
    '/Users/cibhi/AI Assessment/Coding/Yolov8_600/Validation/images',
    '/Users/cibhi/AI Assessment/Coding/Yolov8_600/Test/images',
]

In [None]:
# Label output folders, in the order train, validation (there is no entry for test)
target_folder_path_labels = [
    '/Users/cibhi/AI Assessment/Coding/Yolov8_600/Train/labels',
    '/Users/cibhi/AI Assessment/Coding/Yolov8_600/Validation/labels',
]

Splitting the dataframe into train, test and validation datasets for YOLO model training

In [None]:
# Split the selected rows, copy the PNG files and write the YOLO label files
train_test_val_split(
    df=df_unique,
    source_folder_path=png_folder_path,
    target_folder_path_images=target_folder_path_images,
    target_folder_path_labels=target_folder_path_labels,
)