数据集的下载地址（注意：下面是带签名的临时链接，签名日期为 2024-05-08，有效期 259200 秒（3 天）；过期后需要重新获取下载链接）：

https://storage.googleapis.com/kaggle-data-sets/767686/1327578/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240508%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240508T153010Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=418abc7b422f6a0b237534f13cae574b50631e0115a4caa9bfa400c6046b500bb9a4c4362aaddfc3683dcd630e56b25517899d1a9b1930127a29555dbb1916f5822b57f69faa9bf2027337dca53938fde059b55e108cc777810632c7983f04f686c63e67ffe1dcbe1f6b43c427d8dfbc1496b33fb8a1722ca48e05e66e782962ae74fc68c6c4074d95d1b5064abde1a285c7da08181142e67b9ad6c1cad2f2d88cfd4a12aa48667a198ae0a0cfa51d145d9d2e149fd9c8a223862ddaec191e545e43f1c5e861c2be09e7d6e4d165a13f00628e75a69c9e6f42924fa0a17fb00f62a897219bacf178cc8aef4cce2d69de61436229317e5c87fd6dbbdc26202efe

In [None]:
import os

# Collect the path of every segmentation (mask) volume.
mask_folder = "./segmentations/"
mask_nii_list = []
for entry in os.listdir(mask_folder):
    if entry.endswith(".nii"):
        mask_nii_list.append(os.path.join(mask_folder, entry))

# Collect the path of every image volume, visiting the dataset parts in the
# order they are listed here.
image_folders = [
    "./volume_pt1/",
    "./volume_pt2/",
    "./volume_pt3/",
    "./volume_pt4/",
    "./volume_pt5/"
]
image_nii_list = []
for folder in image_folders:
    for entry in os.listdir(folder):
        if entry.endswith(".nii"):
            image_nii_list.append(os.path.join(folder, entry))

# Report how many files of each kind were found.
print("Mask NIfTI files length:", len(mask_nii_list), sep="\n")
print("\nImage NIfTI files length:", len(image_nii_list), sep="\n")

In [None]:
import os

# Pair every image volume with its segmentation mask through the id shared by
# their file names ("volume-<id>.nii" <-> "segmentation-<id>.nii").
# An image whose mask is missing is reported and dropped from BOTH lists, so
# image_nii_list[i] and mask_nii_list[i] always refer to the same scan.
matched_image_nii_list = []
matched_mask_nii_list = []
for image_path in image_nii_list:
    image_name = os.path.basename(image_path).split(".")[0]  # e.g. "volume-0"
    mask_name = "segmentation-" + image_name.split("-")[1] + ".nii"  # e.g. "segmentation-0.nii"
    mask_path = os.path.join(mask_folder, mask_name)
    if os.path.exists(mask_path):
        matched_image_nii_list.append(image_path)
        matched_mask_nii_list.append(mask_path)
    else:
        print(f"Warning: Mask not found for image {image_path}")

# Replace both lists with their matched, index-aligned versions.
image_nii_list = matched_image_nii_list
mask_nii_list = matched_mask_nii_list

# Report the number of matched pairs.
print("\nMatched Mask NIfTI files length:")
print(len(mask_nii_list))

In [None]:
import os

def _volume_sort_key(path):
    """Return a sort key ordering NIfTI paths by the numeric id in the file name.

    The id is the text after the last "-" of the file stem (for example
    "./volume_pt2/volume-11.nii" -> 11). Paths whose suffix is not a plain
    number are placed after all numbered ones and ordered by their path.
    """
    stem = os.path.basename(path).split(".")[0]
    suffix = stem.split("-")[-1]
    if suffix.isdigit():
        return (0, int(suffix), path)
    return (1, 0, path)


# Sort both lists in place by numeric volume id. A plain string sort would
# order the images by their folder prefix and the masks by name only (and put
# "-10" before "-2"), so the two sorted lists would not line up index by index.
mask_nii_list.sort(key=_volume_sort_key)
image_nii_list.sort(key=_volume_sort_key)

# Print the sorted lists.
print("\nSorted Mask NIfTI files:")
print(mask_nii_list)
print("\nSorted Image NIfTI files:")
print(image_nii_list)

In [None]:
import os
import nibabel as nib
import numpy as np
from tqdm import tqdm

# Create the output folders for the per-slice arrays.
output_image_folder = "/kaggle/working/image"
output_mask_folder = "/kaggle/working/mask"
os.makedirs(output_image_folder, exist_ok=True)
os.makedirs(output_mask_folder, exist_ok=True)

# Pair each image volume with its mask through the id shared by their names.
image_mask_pairs = []
for image_path in tqdm(image_nii_list):
    image_name = os.path.basename(image_path).split(".")[0]  # e.g. "volume-0"
    mask_name = "segmentation-" + image_name.split("-")[1] + ".nii"  # e.g. "segmentation-0.nii"
    mask_path = os.path.join(mask_folder, mask_name)
    if os.path.exists(mask_path):
        image_mask_pairs.append((image_path, mask_path))
    else:
        print(f"Warning: Mask not found for image {image_path}")

# Split every image/mask volume into slices along the third axis and store
# each slice as a compressed NPZ file under the key "arr_0".
for image_path, mask_path in tqdm(image_mask_pairs):
    image_name = os.path.basename(image_path).split(".")[0]  # e.g. "volume-0"
    mask_name = os.path.basename(mask_path).split(".")[0]  # e.g. "segmentation-0"

    # Load both volumes as floating-point arrays.
    image_data = nib.load(image_path).get_fdata()
    mask_data = nib.load(mask_path).get_fdata()

    # The slices are written in lockstep, so both volumes must have the same
    # shape; otherwise indexing the mask fails or leaves unpaired slices.
    if image_data.shape != mask_data.shape:
        print(
            f"Warning: Shape mismatch between {image_path} {image_data.shape} "
            f"and {mask_path} {mask_data.shape}; volume skipped"
        )
        continue

    # leave=False removes each per-volume bar once it finishes, so the output
    # is not flooded with one completed bar per volume.
    for i in tqdm(range(image_data.shape[2]), leave=False):
        image_slice_name = f"{image_name}_slice_{i}.npz"
        mask_slice_name = f"{mask_name}_slice_{i}.npz"
        np.savez_compressed(os.path.join(output_image_folder, image_slice_name), arr_0=image_data[:, :, i])
        np.savez_compressed(os.path.join(output_mask_folder, mask_slice_name), arr_0=mask_data[:, :, i])

In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt

# Show one randomly chosen image slice next to its mask and print basic
# statistics for both.
image_folder = "/kaggle/working/image"
mask_folder = "/kaggle/working/mask"
image_files = [f for f in os.listdir(image_folder) if f.endswith(".npz")]
mask_files = [f for f in os.listdir(mask_folder) if f.endswith(".npz")]

# Every image slice is expected to have exactly one mask slice.
if len(image_files) != len(mask_files):
    raise ValueError("Image and mask file counts do not match!")

# Pick a random image slice and derive the name of its mask:
# "volume-<id>_slice_<k>.npz" -> "segmentation-<id>_slice_<k>.npz".
random_image_file = random.choice(image_files)
image_base_name = random_image_file.split("_slice_")[0]  # e.g. "volume-0"
mask_file = f"segmentation-{image_base_name.split('-')[1]}_slice_{random_image_file.split('_slice_')[1]}"
image_path = os.path.join(image_folder, random_image_file)
mask_path = os.path.join(mask_folder, mask_file)

# Stop here if the mask is missing, instead of continuing with an undefined
# (or stale) image_path and failing later on the load.
if not os.path.exists(mask_path):
    raise FileNotFoundError(f"Mask not found for image {random_image_file}: {mask_path}")

print(image_path)
print(mask_path)

image_data = np.load(image_path)["arr_0"]
mask_data = np.load(mask_path)["arr_0"]

# Gather basic statistics.
resolution = image_data.shape
mask_labels = np.unique(mask_data)
image_max = image_data.max()
image_min = image_data.min()
mask_max = mask_data.max()
mask_min = mask_data.min()

# Display the selected slice and its mask side by side.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.imshow(image_data, cmap="gray")
plt.title("Image")
plt.subplot(1, 2, 2)
plt.imshow(mask_data, cmap="gray")
plt.title("Mask")
plt.show()

# Print the statistics.
print("Resolution:", resolution)
print("Image Max/Min:", image_max, image_min)

print("Mask Labels:", mask_labels)
print("Mask Max/Min:", mask_max, mask_min)

In [None]:
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

mask_folder = "/kaggle/working/mask"
mask_files = [name for name in os.listdir(mask_folder) if name.endswith(".npz")]

# Number of mask slices per label combination.
label_counts = {
    "0 (Background)": 0,
    "1 (Liver)": 0,
    "0/1/2 (Plus with Tumor)": 0,
}

# Exact set of labels found in a slice -> the counter it belongs to.
bucket_for_labels = {
    frozenset({0.}): "0 (Background)",
    frozenset({0., 1.}): "1 (Liver)",
    frozenset({0., 1., 2.}): "0/1/2 (Plus with Tumor)",
}

for mask_file in tqdm(mask_files):
    mask_data = np.load(os.path.join(mask_folder, mask_file))["arr_0"]
    unique_labels = np.unique(mask_data)
    bucket = bucket_for_labels.get(frozenset(unique_labels))
    if bucket is None:
        # Any other combination is only reported, not counted.
        print(f"Warning: Unexpected labels in {mask_file}: {unique_labels}")
    else:
        label_counts[bucket] += 1

# Draw the counts as a bar chart with Seaborn.
categories = list(label_counts)
slice_totals = list(label_counts.values())
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=categories, y=slice_totals)
ax.set_title("Distribution of Label Combinations in Mask Slices")
ax.set_ylabel("Number of Slices")
ax.set_xlabel("Label Combinations")

# Write each bar's value just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_height + 3, f"{bar_height}", ha="center")

plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

mask_folder = "/kaggle/working/mask"
target_file = "segmentation-39_slice_12.npz"
target_path = os.path.join(mask_folder, target_file)

if not os.path.exists(target_path):
    print(f"Error: File not found: {target_path}")
else:
    # Load the mask slice stored under the "arr_0" key.
    mask_data = np.load(target_path)["arr_0"]

    # Show the mask.
    plt.figure(figsize=(8, 6))
    plt.imshow(mask_data, cmap="gray")
    plt.title(f"Mask: {target_file}")
    plt.show()

    # List the label values present in this slice.
    unique_labels = np.unique(mask_data)
    print("Unique Labels:", unique_labels)

In [None]:
import os

mask_folder = "/kaggle/working/mask"
image_folder = "/kaggle/working/image"

# Mask slices to remove, together with their image slices.
mask_files_to_delete = [
    "segmentation-39_slice_10.npz",
    "segmentation-39_slice_9.npz",
    "segmentation-39_slice_14.npz",
    "segmentation-1_slice_73.npz",
    "segmentation-39_slice_11.npz",
    "segmentation-39_slice_13.npz",
    "segmentation-39_slice_12.npz",
]

for mask_file in mask_files_to_delete:
    mask_path = os.path.join(mask_folder, mask_file)
    if not os.path.exists(mask_path):
        print(f"Warning: Mask not found: {mask_file}")
        continue

    # Remove the mask, then look for the image slice with the same id and
    # slice index ("segmentation-<id>_slice_<k>.npz" -> "volume-<id>_slice_<k>.npz").
    os.remove(mask_path)
    name_parts = mask_file.split("_slice_")
    image_base_name = name_parts[0]
    image_file = f"volume-{image_base_name.split('-')[1]}_slice_{name_parts[1]}"
    image_path = os.path.join(image_folder, image_file)
    if not os.path.exists(image_path):
        print(f"Warning: Image not found for mask {mask_file}")
    else:
        os.remove(image_path)

In [None]:
import os
import random
import numpy as np
import shutil
from tqdm import tqdm

mask_files = [name for name in os.listdir(mask_folder) if name.endswith(".npz")]

# Collect the mask slices whose only label value is 0 (background).
background_mask_files = []
for mask_file in tqdm(mask_files):
    mask_data = np.load(os.path.join(mask_folder, mask_file))["arr_0"]
    unique_labels = np.unique(mask_data)
    is_background_only = set(unique_labels) == {0.}
    if not is_background_only:
        continue
    background_mask_files.append(mask_file)



In [None]:
# Keep at most `num_to_keep` randomly chosen background-only slices and delete
# the others together with their image slices.
num_to_keep = 2500

# Only consider masks that are still on disk: running this cell again must not
# crash on files it already deleted, nor shrink the kept set any further.
remaining_background_files = [
    name for name in background_mask_files
    if os.path.exists(os.path.join(mask_folder, name))
]

if len(remaining_background_files) > num_to_keep:
    background_mask_files_to_keep = random.sample(remaining_background_files, num_to_keep)
else:
    background_mask_files_to_keep = remaining_background_files

# A set gives constant-time membership tests inside the loop below.
keep_lookup = set(background_mask_files_to_keep)

for mask_file in tqdm(remaining_background_files):
    if mask_file in keep_lookup:
        continue
    mask_path = os.path.join(mask_folder, mask_file)
    os.remove(mask_path)  # delete the surplus mask slice
    # "segmentation-<id>_slice_<k>.npz" -> "volume-<id>_slice_<k>.npz"
    image_base_name = mask_file.split("_slice_")[0]
    image_file = f"volume-{image_base_name.split('-')[1]}_slice_{mask_file.split('_slice_')[1]}"
    image_path = os.path.join(image_folder, image_file)
    if os.path.exists(image_path):
        os.remove(image_path)  # delete the matching image slice
    else:
        print(f"Warning: Image not found for mask {mask_file}")

In [None]:
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

mask_folder = "/kaggle/working/mask"
mask_files = [name for name in os.listdir(mask_folder) if name.endswith(".npz")]

# Number of mask slices per label combination.
label_counts = {
    "0 (Background)": 0,
    "1 (Liver)": 0,
    "0/1/2 (Plus with Tumor)": 0,
}

# Exact set of labels found in a slice -> the counter it belongs to.
bucket_for_labels = {
    frozenset({0.}): "0 (Background)",
    frozenset({0., 1.}): "1 (Liver)",
    frozenset({0., 1., 2.}): "0/1/2 (Plus with Tumor)",
}

for mask_file in tqdm(mask_files):
    mask_data = np.load(os.path.join(mask_folder, mask_file))["arr_0"]
    unique_labels = np.unique(mask_data)
    bucket = bucket_for_labels.get(frozenset(unique_labels))
    if bucket is None:
        # Any other combination is only reported, not counted.
        print(f"Warning: Unexpected labels in {mask_file}: {unique_labels}")
    else:
        label_counts[bucket] += 1

# Draw the counts as a bar chart with Seaborn.
categories = list(label_counts)
slice_totals = list(label_counts.values())
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=categories, y=slice_totals)
ax.set_title("Distribution of Label Combinations in Mask Slices")
ax.set_ylabel("Number of Slices")
ax.set_xlabel("Label Combinations")

# Write each bar's value just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_height + 3, f"{bar_height}", ha="center")

plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt

# List the image and mask slices currently on disk.
image_files = [name for name in os.listdir(image_folder) if name.endswith(".npz")]
mask_files = [name for name in os.listdir(mask_folder) if name.endswith(".npz")]

# Draw random image slices and keep those whose mask file exists.
num_pairs_to_display = 2
image_mask_pairs = []
for _ in range(num_pairs_to_display):
    random_image_file = random.choice(image_files)
    name_parts = random_image_file.split("_slice_")
    image_base_name = name_parts[0]  # e.g. "volume-0"
    # "volume-<id>_slice_<k>.npz" -> "segmentation-<id>_slice_<k>.npz"
    mask_file = f"segmentation-{image_base_name.split('-')[1]}_slice_{name_parts[1]}"
    mask_path = os.path.join(mask_folder, mask_file)
    if not os.path.exists(mask_path):
        print(f"Error: Mask not found for image {random_image_file}")
        continue
    image_path = os.path.join(image_folder, random_image_file)
    image_mask_pairs.append((image_path, mask_path))

# Plot every selected pair and print its statistics.
for image_path, mask_path in image_mask_pairs:
    image_data = np.load(image_path)["arr_0"]
    mask_data = np.load(mask_path)["arr_0"]

    # Gather basic statistics.
    resolution = image_data.shape
    mask_labels = np.unique(mask_data)
    image_max, image_min = image_data.max(), image_data.min()
    mask_max, mask_min = mask_data.max(), mask_data.min()

    # Image in the left panel, mask in the right panel.
    plt.figure(figsize=(10, 5))
    panels = ((image_data, "Image"), (mask_data, "Mask"))
    for position, (panel_data, panel_title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.imshow(panel_data, cmap="gray")
        plt.title(panel_title)
    plt.show()

    # Print the statistics.
    print("Resolution:", resolution)
    print("Image Max/Min:", image_max, image_min)

    print("Mask Labels:", mask_labels)
    print("Mask Max/Min:", mask_max, mask_min)

In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import albumentations as A


# List the image and mask slices currently on disk.
image_files = [f for f in os.listdir(image_folder) if f.endswith(".npz")]
mask_files = [f for f in os.listdir(mask_folder) if f.endswith(".npz")]

# Pick one random image slice and derive the name of its mask:
# "volume-<id>_slice_<k>.npz" -> "segmentation-<id>_slice_<k>.npz".
random_image_file = random.choice(image_files)
image_base_name = random_image_file.split("_slice_")[0]  # e.g. "volume-0"
mask_file = f"segmentation-{image_base_name.split('-')[1]}_slice_{random_image_file.split('_slice_')[1]}"
mask_path = os.path.join(mask_folder, mask_file)

if os.path.exists(mask_path):
    image_path = os.path.join(image_folder, random_image_file)
    image_data = np.load(image_path)["arr_0"]
    mask_data = np.load(mask_path)["arr_0"]

    # Min-max normalise to [0, 1]. A constant slice has zero range; map it to
    # all zeros instead of dividing by zero.
    intensity_floor = image_data.min()
    intensity_range = image_data.max() - intensity_floor
    if intensity_range > 0:
        image_data_normalized = (image_data - intensity_floor) / intensity_range
    else:
        image_data_normalized = np.zeros_like(image_data, dtype=np.float64)

    # Convert to uint8 before applying CLAHE.
    image_data_uint8 = (image_data_normalized * 255).astype(np.uint8)

    # Resolution of the slice used in this cell (printed in the loop below).
    resolution = image_data_uint8.shape

    # Clip limits to compare.
    clip_limits = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

    # One row per clip limit: original image, CLAHE result, mask.
    # squeeze=False keeps `axes` two-dimensional even with a single row.
    fig, axes = plt.subplots(
        len(clip_limits), 3, figsize=(15, 5 * len(clip_limits)), squeeze=False
    )

    for i, clip_limit in enumerate(clip_limits):
        # Apply CLAHE with this clip limit.
        clahe = A.CLAHE(clip_limit=clip_limit, tile_grid_size=(8, 8), p=1.0)

        transformed_image_uint8 = clahe(image=image_data_uint8)["image"]

        # Convert to float without scaling back to the original value range.
        transformed_image = transformed_image_uint8.astype(np.float32)

        image_max = transformed_image.max()
        image_min = transformed_image.min()

        # Print the statistics of the enhanced image.
        print("Resolution:", resolution)
        print("Image Max/Min:", image_max, image_min)

        # Show the original image, the enhanced image and the mask.
        axes[i][0].imshow(image_data, cmap="gray")
        axes[i][0].set_title("Original Image")
        axes[i][0].axis('off')

        axes[i][1].imshow(transformed_image, cmap="gray")
        axes[i][1].set_title(f"CLAHE Enhanced (Clip Limit: {clip_limit})")
        axes[i][1].axis('off')

        axes[i][2].imshow(mask_data, cmap="gray")
        axes[i][2].set_title("Mask")
        axes[i][2].axis('off')

    plt.tight_layout()
    plt.show()
else:
    print(f"Error: Mask not found for image {random_image_file}")


In [None]:
# Show the image and mask paths left in scope by the cells run before this one.
print(f"{image_path} {mask_path}")

In [None]:
import os
import numpy as np
import cv2
import albumentations as A
from tqdm import tqdm

# Source folders with the per-slice images and masks.
image_folder = "/kaggle/working/image"
mask_folder = "/kaggle/working/mask"

# Destination folders for the processed pairs.
clahe_image_folder = "/kaggle/working/clahe_images"
clahe_mask_folder = "/kaggle/working/clahe_masks"

# Make sure the destination folders exist.
os.makedirs(clahe_image_folder, exist_ok=True)
os.makedirs(clahe_mask_folder, exist_ok=True)

# Sorted so that the index given to each slice is the same on every run
# (os.listdir returns entries in arbitrary order).
image_files = sorted(f for f in os.listdir(image_folder) if f.endswith(".npz"))

# CLAHE transform applied to every image.
clip_limit = 6.5
clahe = A.CLAHE(clip_limit=clip_limit, tile_grid_size=(8, 8), p=1.0)

# For every image slice: enhance with CLAHE, rotate, resize to 256x256 and
# save it as image_<idx>.npz; its mask gets the same rotation and resize and
# is saved as mask_<idx>.npz.
for idx, image_file in enumerate(tqdm(image_files)):
    # Locate the mask first, so no image is written without its mask.
    # "volume-<id>_slice_<k>.npz" -> "segmentation-<id>_slice_<k>.npz"
    image_base_name = image_file.split("_slice_")[0]
    mask_file = f"segmentation-{image_base_name.split('-')[1]}_slice_{image_file.split('_slice_')[1]}"
    mask_path = os.path.join(mask_folder, mask_file)
    if not os.path.exists(mask_path):
        print(f"Error: Mask not found for image {image_file}")
        continue

    image_path = os.path.join(image_folder, image_file)
    image_data = np.load(image_path)["arr_0"]

    # Min-max normalise to [0, 1]. A constant slice has zero range; map it to
    # all zeros instead of dividing by zero.
    intensity_floor = image_data.min()
    intensity_range = image_data.max() - intensity_floor
    if intensity_range > 0:
        image_data_normalized = (image_data - intensity_floor) / intensity_range
    else:
        image_data_normalized = np.zeros_like(image_data, dtype=np.float64)
    image_data_uint8 = (image_data_normalized * 255).astype(np.uint8)

    # Apply CLAHE, then rotate with NumPy.
    transformed_image_uint8 = clahe(image=image_data_uint8)["image"]
    transformed_image_uint8 = np.rot90(transformed_image_uint8)

    # Resize to 256x256.
    resized_image = cv2.resize(transformed_image_uint8, (256, 256), interpolation=cv2.INTER_NEAREST)

    # Apply the same rotation and resize to the mask; nearest-neighbour
    # interpolation does not create new label values.
    mask_data = np.load(mask_path)["arr_0"]
    transformed_mask = np.rot90(mask_data)
    resized_mask = cv2.resize(transformed_mask, (256, 256), interpolation=cv2.INTER_NEAREST)

    # Save the pair under the same index.
    new_image_path = os.path.join(clahe_image_folder, f"image_{idx}.npz")
    np.savez_compressed(new_image_path, arr_0=resized_image)
    new_mask_path = os.path.join(clahe_mask_folder, f"mask_{idx}.npz")
    np.savez_compressed(new_mask_path, arr_0=resized_mask)


In [None]:
import os

# Folders holding the processed images and masks.
clahe_image_folder = "/kaggle/working/clahe_images"
clahe_mask_folder = "/kaggle/working/clahe_masks"

# Count the .npz files in each folder.
image_files_count = sum(
    1 for name in os.listdir(clahe_image_folder) if name.endswith(".npz")
)
mask_files_count = sum(
    1 for name in os.listdir(clahe_mask_folder) if name.endswith(".npz")
)

print(f"Number of files in clahe_images folder: {image_files_count}")
print(f"Number of files in clahe_masks folder: {mask_files_count}")


In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

# Folders holding the processed images and masks.
clahe_image_folder = "/kaggle/working/clahe_images"
clahe_mask_folder = "/kaggle/working/clahe_masks"

# Sorted so that "the first file" is the same on every run
# (os.listdir returns entries in arbitrary order).
image_files = sorted(f for f in os.listdir(clahe_image_folder) if f.endswith(".npz"))
mask_files = sorted(f for f in os.listdir(clahe_mask_folder) if f.endswith(".npz"))

# Visualise the first processed image together with its mask.
if image_files and mask_files:
    image_file = image_files[0]
    mask_file = image_file.replace("image", "mask")  # image_<idx>.npz -> mask_<idx>.npz

    image_path = os.path.join(clahe_image_folder, image_file)
    mask_path = os.path.join(clahe_mask_folder, mask_file)

    if os.path.exists(mask_path):
        # Load both arrays.
        image_data = np.load(image_path)["arr_0"]
        mask_data = np.load(mask_path)["arr_0"]

        # Print the statistics once.
        print(f"Image - Resolution: {image_data.shape}, Max: {image_data.max()}, Min: {image_data.min()}")
        print(f"Mask - Unique Values: {np.unique(mask_data)}, Max: {mask_data.max()}, Min: {mask_data.min()}")

        # Show the image and the mask side by side.
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        plt.imshow(image_data, cmap='gray')
        plt.title('CLAHE Enhanced Image')
        plt.axis('off')

        plt.subplot(1, 2, 2)
        plt.imshow(mask_data, cmap='gray')
        plt.title('Corresponding Mask')
        plt.axis('off')

        plt.show()
    else:
        # Report a missing mask instead of failing inside np.load.
        print(f"Error: Mask not found for image {image_file}")
else:
    print("No image or mask files found in the specified directories.")
