In [19]:
import os
import nrrd
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import shutil

In [3]:
excel_file_path = "/kaggle/input/lung-tumor-ds/dataset_lung.xlsx" 
train_folder = "/kaggle/input/lung-tumor-ds/Train" 

output_dir = "/kaggle/working/lung-ds"
output_folder_full_slice = os.path.join(output_dir, "Full_slice")
output_folder_nodule = os.path.join(output_dir, "Nodule")

In [43]:
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)  

In [25]:
df = pd.read_excel(excel_file_path)
df['TumorClass'] = df['TumorClass'] - 1

In [26]:
def balance_classes(df, label_column):
    class_counts = df[label_column].value_counts()
    print("Original class distribution:")
    print(class_counts)

    # Determine thresholds
    max_target = int(class_counts.mean())  # Downsample majority classes to mean size
    min_target = int(class_counts.mean())  # Upsample minority classes to mean size

    dfs = []

    for label in df[label_column].unique():
        label_data = df[df[label_column] == label]
        current_count = len(label_data)

        if current_count > max_target:
            # Downsample majority class
            downsampled_data = resample(
                label_data,
                replace=False,  # Sample without replacement
                n_samples=max_target,
                random_state=42
            )
            dfs.append(downsampled_data)

        elif current_count < min_target:
            # Upsample minority class
            upsampled_data = resample(
                label_data,
                replace=True,  # Sample with replacement
                n_samples=min_target,
                random_state=42
            )
            dfs.append(upsampled_data)

        else:
            # Keep as is
            dfs.append(label_data)

    # Combine balanced data
    df_balanced = pd.concat(dfs)

    # Shuffle the dataset
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    # Verify new distribution
    print("\nBalanced class distribution:")
    print(df_balanced[label_column].value_counts())

    return df_balanced


In [42]:
def get_unique_filename(save_path):
    base, ext = os.path.splitext(save_path)
    counter = 0
    unique_path = save_path

    while os.path.exists(unique_path):
        unique_path = f"{base}_{counter}{ext}"
        counter += 1

    return unique_path

def convert_nrrd_to_image(nrrd_path, save_path, resize_to=None):
    data, header = nrrd.read(nrrd_path)

    # Clip and normalize the data
    hu_min, hu_max = -1000, 400
    data_clipped = np.clip(data, hu_min, hu_max)
    normalized = (data_clipped - hu_min) / (hu_max - hu_min)

    # Convert to grayscale and then RGB
    grayscale_image = (normalized * 255).astype(np.uint8)
    rgb_image = np.stack([grayscale_image] * 3, axis=-1)

    # Create a PIL image
    pil_image = Image.fromarray(rgb_image)

    # Resize if needed
    if resize_to is not None:
        pil_image = pil_image.resize(resize_to)

    # Ensure the save path is unique
    unique_save_path = get_unique_filename(save_path)

    # Save the image
    pil_image.save(unique_save_path)

In [28]:
def create_imagefolder_dataset_with_split(df, image_column, label_column, output_folder, source_folder, train_ratio=0.8, resize_to=None):
    train_folder = os.path.join(output_folder, "train")
    val_folder = os.path.join(output_folder, "val")      
    
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)

    # Group by label and split
    for label in df[label_column].unique():
        label_data = df[df[label_column] == label]

        train_data, val_data = train_test_split(label_data, test_size=1-train_ratio, random_state=42)

        # Create label subfolders
        train_label_folder = os.path.join(train_folder, str(label))
        val_label_folder = os.path.join(val_folder, str(label))
        os.makedirs(train_label_folder, exist_ok=True)
        os.makedirs(val_label_folder, exist_ok=True)

        print(train_data.shape)

        counterTrain = 0
        counterVal = 0

        # Save images to respective folders
        for _, row in train_data.iterrows():
            nrrd_name = row[image_column]
            source_path = os.path.join(source_folder, nrrd_name)
            dest_path = os.path.join(train_label_folder, nrrd_name.replace('.nrrd', '.png'))
            convert_nrrd_to_image(source_path, dest_path, resize_to=resize_to)
            counterTrain += 1

        for _, row in val_data.iterrows():
            nrrd_name = row[image_column]
            source_path = os.path.join(source_folder, nrrd_name)
            dest_path = os.path.join(val_label_folder, nrrd_name.replace('.nrrd', '.png'))
            convert_nrrd_to_image(source_path, dest_path, resize_to=resize_to)
            counterVal += 1

        print(counterTrain)
        print(counterVal)

    print(f"Train/val split dataset created in {output_folder}")

In [44]:
df_balanced = balance_classes(df, label_column="TumorClass")

create_imagefolder_dataset_with_split(
    df=df_balanced,
    image_column="Full_slice",
    label_column="TumorClass",
    output_folder=output_folder_full_slice,
    source_folder=train_folder
)

create_imagefolder_dataset_with_split(
    df=df_balanced,
    image_column="Nodule",
    label_column="TumorClass",
    output_folder=output_folder_nodule,
    source_folder=train_folder,
    resize_to=(96, 96)
    
)

Original class distribution:
TumorClass
2    1092
1     457
3     418
0     244
4     152
Name: count, dtype: int64

Balanced class distribution:
TumorClass
1    472
0    472
2    472
4    472
3    472
Name: count, dtype: int64
(377, 3)
377
95
(377, 3)
377
95
(377, 3)
377
95
(377, 3)
377
95
(377, 3)
377
95
Train/val split dataset created in /kaggle/working/lung-ds/Full_slice
(377, 3)
377
95
(377, 3)
377
95
(377, 3)
377
95
(377, 3)
377
95
(377, 3)
377
95
Train/val split dataset created in /kaggle/working/lung-ds/Nodule


In [45]:
def count_elements_in_folder(folder_path):
    try:
        # List all elements in the folder
        elements = os.listdir(folder_path)
        
        # Count the elements
        count = len(elements)
        print(f"The folder '{folder_path}' contains {count} elements.")
        return count
    except FileNotFoundError:
        print(f"The folder '{folder_path}' does not exist.")
        return 0
    except Exception as e:
        print(f"An error occurred: {e}")
        return 0

In [49]:
import shutil
shutil.make_archive("lung-ds", 'zip', output_dir)

'/kaggle/working/lung-ds.zip'