In [6]:
import os
import nrrd
import shutil
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split

excel_file_path = "/kaggle/input/lung-tumor-ds/dataset_lung.xlsx" 
train_folder = "/kaggle/input/lung-tumor-ds/Train" 

output_dir = "/kaggle/working/lung-ds"
output_folder_full_slice = os.path.join(output_dir, "Full_slice")
output_folder_nodule = os.path.join(output_dir, "Nodule")

df = pd.read_excel(excel_file_path)

df['TumorClass'] = df['TumorClass'] - 1

def convert_nrrd_to_image(nrrd_path, save_path):
    # Load the NRRD file
    data, _ = nrrd.read(nrrd_path)
    
    # Normalize and preprocess the data
    data = (data - np.min(data)) / (np.max(data) - np.min(data)) * 255  # Normalize to [0, 255]
    data = data.astype(np.uint8)  # Convert to uint8
    data_rgb = np.stack([data] * 3, axis=-1)  # Convert grayscale to RGB if needed

    # Save as PNG using PIL
    image = Image.fromarray(data_rgb)
    image.save(save_path)

# Function to create ImageFolder-style dataset with train/val split
def create_imagefolder_dataset_with_split(df, image_column, label_column, output_folder, source_folder, train_ratio=0.8):
    # Create train and val subfolders inside the dataset folder
    train_folder = os.path.join(output_folder, "train")
    val_folder = os.path.join(output_folder, "val")
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)
    
    # Group by label and split into train and val sets for each label
    for label in df[label_column].unique():
        label_data = df[df[label_column] == label]
        train_data, val_data = train_test_split(label_data, test_size=1-train_ratio, random_state=42)
        
        # Create label subfolders inside train and val folders
        train_label_folder = os.path.join(train_folder, str(label))
        val_label_folder = os.path.join(val_folder, str(label))
        os.makedirs(train_label_folder, exist_ok=True)
        os.makedirs(val_label_folder, exist_ok=True)
        
        # Copy images into the appropriate train/val label folder
        for _, row in train_data.iterrows():
            nrrd_name = row[image_column]
            source_path = os.path.join(source_folder, nrrd_name)
            dest_path = os.path.join(train_label_folder, nrrd_name.replace('.nrrd', '.png'))  # Convert to PNG
            if os.path.exists(source_path):
                convert_nrrd_to_image(source_path, dest_path)
            else:
                print(f"Warning: {source_path} does not exist.")
        
        for _, row in val_data.iterrows():
            nrrd_name = row[image_column]
            source_path = os.path.join(source_folder, nrrd_name)
            dest_path = os.path.join(val_label_folder, nrrd_name.replace('.nrrd', '.png'))  # Convert to PNG
            if os.path.exists(source_path):
                convert_nrrd_to_image(source_path, dest_path)
            else:
                print(f"Warning: {source_path} does not exist.")
    
    print(f"Train/val split dataset created in {output_folder}")

# Create Full_slice dataset with train/val split
create_imagefolder_dataset_with_split(
    df=df,
    image_column="Full_slice",
    label_column="TumorClass",
    output_folder=output_folder_full_slice,
    source_folder=train_folder
)

# Create Nodule dataset with train/val split
create_imagefolder_dataset_with_split(
    df=df,
    image_column="Nodule",
    label_column="TumorClass",
    output_folder=output_folder_nodule,
    source_folder=train_folder
)

# Display final folder structure for confirmation
print(f"Full_slice dataset saved to: {output_folder_full_slice}")
print(f"Nodule dataset saved to: {output_folder_nodule}")


  data = (data - np.min(data)) / (np.max(data) - np.min(data)) * 255  # Normalize to [0, 255]


Train/val split dataset created in /kaggle/working/lung-ds/Full_slice
Train/val split dataset created in /kaggle/working/lung-ds/Nodule
Full_slice dataset saved to: /kaggle/working/lung-ds/Full_slice
Nodule dataset saved to: /kaggle/working/lung-ds/Nodule


In [7]:
import shutil
shutil.make_archive("lung-ds", 'zip', output_dir)

'/kaggle/working/lung-ds.zip'