In [None]:
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import SimpleITK as sitk
from sklearn.decomposition import PCA
import glob
import cv2
import umap
from sklearn.manifold import TSNE
import tifffile as tiff
from PIL import Image

In [None]:
def readFilePixels(image_folders=None, mask_folders=None):
    """Load masked, resized images from paired folders as flat feature vectors.

    The i-th entry of ``image_folders`` is paired with the i-th entry of
    ``mask_folders`` and every sample read from that pair gets label ``i``.
    Inside a folder pair the ``.jpg``/``.png`` files are sorted by name and
    paired positionally, so an image and its mask must sort to the same
    position in their respective folders.

    Args:
        image_folders: Sequence of directories containing the images
            (one directory per class). ``None`` is treated as empty.
        mask_folders: Sequence of directories containing the masks, in the
            same order as ``image_folders``. ``None`` is treated as empty.

    Returns:
        A tuple ``(data, labels)``. ``data`` is a list with one 1-D NumPy
        array per image (the 224x224 RGB image with pixels outside the mask
        set to zero, transposed and flattened); ``labels`` is the list of the
        corresponding folder indices.
    """
    import warnings

    # None instead of a mutable default list in the signature.
    image_folders = [] if image_folders is None else image_folders
    mask_folders = [] if mask_folders is None else mask_folders

    extensions = (".jpg", ".png")
    target_size = (224, 224)

    # initialise the list of data and labels
    data = []
    labels = []

    # iterate the image folders and mask folders
    for label, (image_folder, mask_folder) in enumerate(zip(image_folders, mask_folders)):
        # os.listdir returns names in arbitrary order; sort both listings so
        # the positional image/mask pairing is deterministic.
        image_files = sorted(f for f in os.listdir(image_folder) if f.endswith(extensions))
        mask_files = sorted(f for f in os.listdir(mask_folder) if f.endswith(extensions))

        if len(image_files) != len(mask_files):
            # zip() below stops at the shorter listing; make that visible.
            warnings.warn(
                "Image/mask count mismatch between {!r} ({}) and {!r} ({}); "
                "unpaired files are ignored.".format(
                    image_folder, len(image_files), mask_folder, len(mask_files)
                ),
                stacklevel=2,
            )

        for image_file, mask_file in zip(image_files, mask_files):
            # Context managers release the file handles as soon as the pixels
            # are copied into NumPy arrays.
            with Image.open(os.path.join(image_folder, image_file)) as image:
                # Force 3 channels so every sample has the same feature length.
                image_array = np.array(image.convert("RGB"))
            with Image.open(os.path.join(mask_folder, mask_file)) as mask:
                # Force a single 8-bit channel so the mask broadcasts over RGB.
                mask_array = np.array(mask.convert("L"))

            resized_image = cv2.resize(image_array, target_size)
            # Nearest-neighbour keeps the mask free of interpolated values.
            resized_mask = cv2.resize(mask_array, target_size, interpolation=cv2.INTER_NEAREST)

            # Binarise to {0, 1}: multiplying uint8 pixels by a 0/255 mask
            # would wrap around modulo 256 instead of masking.
            binary_mask = (resized_mask > 0).astype(resized_image.dtype)[:, :, np.newaxis]

            masked_image = resized_image * binary_mask
            data.append(np.transpose(masked_image).flatten())
            labels.append(label)

    return data, labels

def visualisation2d(reduced_data, labels, colour, fig_path):
    """Scatter-plot the first two embedding components, coloured by class.

    Args:
        reduced_data: Array-like with one row per sample and at least two
            columns; columns 0 and 1 are plotted.
        labels: Sequence of integer class indices, one per row of
            ``reduced_data``.
        colour: Sequence of matplotlib colours, indexed by class label.
        fig_path: Destination passed to ``savefig``; its parent directory is
            created if it does not exist.
    """
    reduced_data = np.asarray(reduced_data)
    labels = np.asarray(labels)

    # Use a dedicated figure so repeated calls do not draw on top of each other.
    fig, ax = plt.subplots()

    # One scatter call per class rather than one per point: far fewer artists
    # to create, and each class gets a legend entry.
    for class_index in np.unique(labels):
        members = labels == class_index
        ax.scatter(
            reduced_data[members, 0],
            reduced_data[members, 1],
            c=colour[class_index],
            linewidths=0.001,
            label=str(class_index),
        )

    # The colours are explicit, not mapped from scalar values, so a colourbar
    # carries no information; a legend keyed by class index replaces it.
    if labels.size:
        ax.legend(title="label")

    # Neutral axis names: the embedding may come from PCA, UMAP or t-SNE.
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')

    output_dir = os.path.dirname(fig_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    fig.savefig(fig_path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def visualisation3d(reduced_data, labels, colour, fig_path):
    """Scatter-plot the first three embedding components in 3-D, coloured by class.

    Args:
        reduced_data: Array-like with one row per sample and at least three
            columns; columns 0, 1 and 2 are plotted as X, Y and Z.
        labels: Sequence of integer class indices, one per row of
            ``reduced_data``.
        colour: Sequence of matplotlib colours, indexed by class label.
        fig_path: Destination passed to ``savefig``; its parent directory is
            created if it does not exist.
    """
    # Importing mplot3d registers the "3d" projection on Matplotlib < 3.2;
    # newer versions register it automatically.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    reduced_data = np.asarray(reduced_data)
    labels = np.asarray(labels)

    fig = plt.figure()
    # fig.gca() yields ordinary 2-D axes, on which scatter() reads its third
    # positional argument as marker size; request real 3-D axes instead.
    ax = fig.add_subplot(111, projection="3d")

    # One scatter call per class rather than one per point.
    for class_index in np.unique(labels):
        members = labels == class_index
        ax.scatter(
            reduced_data[members, 0],
            reduced_data[members, 1],
            reduced_data[members, 2],
            c=colour[class_index],
            linewidths=0.001,
        )

    ax.set_xlabel("X")  # X-axis label
    ax.set_ylabel("Y")  # Y-axis label
    ax.set_zlabel("Z")  # Z-axis label

    output_dir = os.path.dirname(fig_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    fig.savefig(fig_path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def normalisation_std(data):
    """Standardise every feature (column) to zero mean and unit variance.

    Features that are constant across all samples have zero standard
    deviation; they are mapped to 0 instead of producing NaN/inf from a
    division by zero.

    Args:
        data: Array-like of samples; statistics are computed along axis 0.

    Returns:
        NumPy array of the same shape as ``data`` holding the standardised
        values.
    """
    data = np.array(data)
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # Divide constant features by 1: (x - mean) is already 0 for them.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

def PCA_reduce(data, components=3):
    """Reduce ``data`` to ``components`` dimensions with PCA.

    Args:
        data: Samples to embed; passed unchanged to ``PCA.fit_transform``.
        components: Number of principal components to keep (default 3).

    Returns:
        The result of ``PCA.fit_transform`` on ``data``.
    """
    return PCA(n_components=components).fit_transform(data)

def UMAP_reduce(data, components=3):
    """Reduce ``data`` to ``components`` dimensions with UMAP (``min_dist=0.1``).

    Args:
        data: Samples to embed; passed unchanged to ``UMAP.fit_transform``.
        components: Dimensionality of the embedding (default 3).

    Returns:
        The result of ``UMAP.fit_transform`` on ``data``.
    """
    embedder = umap.UMAP(n_components=components, min_dist=0.1)
    return embedder.fit_transform(data)

def TSNE_reduce(data, components=3):
    """Reduce ``data`` to ``components`` dimensions with t-SNE.

    Args:
        data: Samples to embed; passed unchanged to ``TSNE.fit_transform``.
        components: Dimensionality of the embedding (default 3).

    Returns:
        The result of ``TSNE.fit_transform`` on ``data``.
    """
    return TSNE(n_components=components).fit_transform(data)

def select_reduce_methods(data, method = "PCA", component = 3):
    """Run the requested dimensionality-reduction method on ``data``.

    Args:
        data: Sequence of equally sized feature vectors; converted to a
            NumPy array before being reduced.
        method: One of ``"PCA"``, ``"UMAP"`` or ``"TSNE"`` (case-sensitive).
        component: Number of output dimensions.

    Returns:
        The reduced data returned by the selected ``*_reduce`` function.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    supported = ("PCA", "UMAP", "TSNE")
    # Fail early with a clear message instead of hitting an unbound local
    # at the return statement.
    if method not in supported:
        raise ValueError(
            "Unsupported reduction method {!r}; expected one of {}".format(method, supported)
        )

    data_array = np.array(data)  # Convert the list to a NumPy array
    if method == "PCA":
        return PCA_reduce(data=data_array, components=component)
    if method == "UMAP":
        return UMAP_reduce(data=data_array, components=component)
    return TSNE_reduce(data=data_array, components=component)
        

In [None]:
# the folder path of images (one sub-folder per category)
img_base_folder = r'./data/HAM10000/train/images/'
# the folder path of masks (one sub-folder per category)
mask_base_folder = r'./data/HAM10000/train/masks/'

# categories; the position in this list is the integer label of the class
# categories = ['df','vasc']
categories = ['df','vasc','akiec','bcc','bkl','mel','nv']

# per-category image and mask folders, in the same order as `categories`
image_folders = [os.path.join(img_base_folder, category) for category in categories]
mask_folders = [os.path.join(mask_base_folder, category) for category in categories]

# iterate every folder and read every image
data, labels = readFilePixels(image_folders=image_folders, mask_folders=mask_folders)

# components for compressing dimension
# NOTE(review): only the first two of these components are drawn by the 2-D
# plot below; for UMAP/t-SNE a dedicated 2-component run may be preferable.
component = 3

# Seed NumPy's global RNG: scikit-learn estimators created without an explicit
# random_state draw from it, so this makes the embedding repeatable.
# NOTE(review): UMAP is presumably affected the same way - confirm.
SEED = 42
np.random.seed(SEED)

# PCA, UMAP, TSNE
method = "TSNE"
reduced_data = select_reduce_methods(data=data, method=method, component=component)

# one colour per category, in the same order as `categories`
colour = ['#444693', '#f47920','#f05b72','#4e72b8','#fedcbd','#f8aba6','#DDF2C6']
if len(colour) < len(categories):
    raise ValueError("Need at least one colour per category")

# figure saving path: <dir>/<cat1>_<cat2>_..._<method>_<2d|3d>
fig_dir = './dataDistribution/'
# savefig does not create missing directories
os.makedirs(fig_dir, exist_ok=True)
fig_path = fig_dir + '_'.join(categories) + '_'

# visualisation2d
visualisation2d(reduced_data=reduced_data, labels=labels, colour=colour, fig_path=fig_path+method+"_2d")

# visualisation3d (uncomment to also save a 3-D view)
# visualisation3d(reduced_data=reduced_data, labels=labels, colour=colour, fig_path=fig_path+method+"_3d")