In [2]:
## 处理metadata表格数据

In [3]:
import os
import pandas as pd

# 指定数据目录路径
path_to_data = 'C:\\Users\\huoqingz\\Desktop\\PAD-UFES-20'

# 读取元数据
metadata = pd.read_csv(os.path.join(path_to_data, 'metadata.csv'))

# 打印前10行数据
print(metadata.head(10))

# 添加图像文件的完整路径到元数据中
metadata['full_link'] = os.path.join(path_to_data, 'Dataprep', 'all_images', '') + metadata['img_id']

# 检查元数据DataFrame中每一列的空值
for column in metadata.columns:
    print(f'{column}: {metadata[column].isnull().sum()}')

# 填充缺失值
metadata = metadata.fillna("EMPTY")

# 定义临床特征和图像相关列
clin_feats = ["smoke", "drink", "background_father", "background_mother", "age", "pesticide", "gender", "skin_cancer_history",
              "cancer_history", "has_piped_water", "has_sewage_system", "fitspatrick", "region", "diameter_1", "diameter_2",
              "itch", "grew", "hurt", "changed", "bleed", "elevation"]

# 与患者信息相关的临床列
clin_ = ["img_id", "diagnostic", "patient_id", "lesion_id", "biopsed"]

# 初始化一个空的字典，用于存储新数据
new_df = {c: [] for c in clin_feats}

# 遍历metadata的每一行
for idx, row in metadata.iterrows():
    for col in clin_feats:
        data_row = row[col]
        if data_row != 'EMPTY':
            new_df[col].append(data_row)
        else:
            new_df[col].append(0)  # 如果缺失值则填充0

# 将字典转换为DataFrame
new_df = pd.DataFrame.from_dict(new_df)

# 添加患者信息相关的临床列到DataFrame中
for col in clin_:
    new_df[col] = metadata[col]

# 保存预处理后的metadata
new_df.to_csv(os.path.join(path_to_data, 'metadata_preprocessed.csv'), index=False)


  patient_id  lesion_id  smoke  drink background_father background_mother  \
0   PAT_1516       1765    NaN    NaN               NaN               NaN   
1     PAT_46        881  False  False         POMERANIA         POMERANIA   
2   PAT_1545       1867    NaN    NaN               NaN               NaN   
3   PAT_1989       4061    NaN    NaN               NaN               NaN   
4    PAT_684       1302  False   True         POMERANIA         POMERANIA   
5   PAT_1549       1882    NaN    NaN               NaN               NaN   
6    PAT_778       1471  False   True           GERMANY             ITALY   
7    PAT_117        179  False  False         POMERANIA         POMERANIA   
8   PAT_1995       4080    NaN    NaN               NaN               NaN   
9    PAT_705       4015  False   True           GERMANY           GERMANY   

   age pesticide  gender skin_cancer_history  ... diameter_2 diagnostic  \
0    8       NaN     NaN                 NaN  ...        NaN        NEV   
1 

In [4]:
## 使用CLIP模型进行图像特征提取

In [6]:
!pip install open_clip_torch
import os
import shutil
from PIL import Image
import warnings
import numpy as np
import pandas as pd
import torch
import open_clip

warnings.filterwarnings('ignore')

# 设置设备
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载CLIP模型和预处理方法
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')

# 指定数据目录路径
path_to_data = 'C:\\Users\\huoqingz\\Desktop\\PAD-UFES-20'

# 定义目标文件夹
destination_folder = 'all_images'

# 创建目标文件夹
os.makedirs(destination_folder, exist_ok=True)

# 定义一个函数，将文件移动到目标文件夹
def move_files_to_dest_folder(path, dest):
    for root, _, files in os.walk(path):
        for file in files:
            src_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.copy(src_file, dest_file)

# 将每个子文件夹中的文件移动到目标文件夹
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_1'), destination_folder)
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_2'), destination_folder)
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_3'), destination_folder)

# 打印目标文件夹中的文件数量
print(len(os.listdir(destination_folder)))

# 获取图像文件列表
image_files = [os.path.join(destination_folder, f) for f in os.listdir(destination_folder) if f.endswith('.jpg') or f.endswith('.png')]

# 预处理图像并提取特征
def preprocess_images(image_paths):
    images = []
    for image_path in image_paths:
        try:
            image = Image.open(image_path)
            images.append(preprocess(image).unsqueeze(0))
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
    return torch.cat(images).to(device)

# 批处理特征提取函数
def extract_features(image_files, batch_size=32):
    all_features = []
    num_batches = len(image_files) // batch_size + (1 if len(image_files) % batch_size != 0 else 0)

    for i in range(num_batches):
        batch_files = image_files[i * batch_size:(i + 1) * batch_size]
        images = preprocess_images(batch_files)
        
        with torch.no_grad():
            image_features = model.encode_image(images)
        
        all_features.append(image_features.cpu().numpy())
        print(f"Processed batch {i+1}/{num_batches}")
    
    return np.concatenate(all_features, axis=0)

# 提取所有图像的特征
print("Starting feature extraction...")
image_features = extract_features(image_files)
print("Feature extraction completed.")

# 将特征向量保存为numpy数组
np.save('skin_features.npy', image_features)
print("Features saved to skin_features.npy")



Defaulting to user installation because normal site-packages is not writeable
2298
Starting feature extraction...
Processed batch 1/72
Processed batch 2/72
Processed batch 3/72
Processed batch 4/72
Processed batch 5/72
Processed batch 6/72
Processed batch 7/72
Processed batch 8/72
Processed batch 9/72
Processed batch 10/72
Processed batch 11/72
Processed batch 12/72
Processed batch 13/72
Processed batch 14/72
Processed batch 15/72
Processed batch 16/72
Processed batch 17/72
Processed batch 18/72
Processed batch 19/72
Processed batch 20/72
Processed batch 21/72
Processed batch 22/72
Processed batch 23/72
Processed batch 24/72
Processed batch 25/72
Processed batch 26/72
Processed batch 27/72
Processed batch 28/72
Processed batch 29/72
Processed batch 30/72
Processed batch 31/72
Processed batch 32/72
Processed batch 33/72
Processed batch 34/72
Processed batch 35/72
Processed batch 36/72
Processed batch 37/72
Processed batch 38/72
Processed batch 39/72
Processed batch 40/72
Processed batch

In [7]:
# 加载提取的图像特征
image_features = np.load('skin_features.npy')

# 将特征转换为DataFrame
feature_columns = [f'feature_{i+1}' for i in range(image_features.shape[1])]
features_df = pd.DataFrame(image_features, columns=feature_columns)

# 加载预处理后的metadata
metadata_preprocessed = pd.read_csv(os.path.join(path_to_data, 'metadata_preprocessed.csv'))

# 合并图像特征和metadata
metadata_combined = pd.concat([metadata_preprocessed, features_df], axis=1)

# 保存合并后的数据
metadata_combined.to_csv(os.path.join(path_to_data, 'metadata_with_features.csv'), index=False)
print("Combined metadata and features saved to metadata_with_features.csv")


Combined metadata and features saved to metadata_with_features.csv


In [8]:
# 设置设备
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载CLIP模型和预处理方法
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')

# 指定数据目录路径
path_to_data = 'C:\\Users\\huoqingz\\Desktop\\PAD-UFES-20'

# 定义目标文件夹
destination_folder = 'all_images'

# 创建目标文件夹
os.makedirs(destination_folder, exist_ok=True)

# 定义一个函数，将文件移动到目标文件夹
def move_files_to_dest_folder(path, dest):
    for root, _, files in os.walk(path):
        for file in files:
            src_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.copy(src_file, dest_file)

# 将每个子文件夹中的文件移动到目标文件夹
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_1'), destination_folder)
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_2'), destination_folder)
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_3'), destination_folder)

# 打印目标文件夹中的文件数量
print(len(os.listdir(destination_folder)))

# 获取图像文件列表
image_files = [os.path.join(destination_folder, f) for f in os.listdir(destination_folder) if f.endswith('.jpg') or f.endswith('.png')]

# 预处理图像并提取特征
def preprocess_images(image_paths):
    images = []
    original_sizes = []
    for image_path in image_paths:
        try:
            image = Image.open(image_path)
            original_sizes.append(image.size)  # 保存原始尺寸
            images.append(preprocess(image).unsqueeze(0))
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
    return torch.cat(images).to(device), original_sizes

# 批处理特征提取函数
def extract_features(image_files, batch_size=32):
    all_features = []
    all_sizes = []
    num_batches = len(image_files) // batch_size + (1 if len(image_files) % batch_size != 0 else 0)

    for i in range(num_batches):
        batch_files = image_files[i * batch_size:(i + 1) * batch_size]
        images, sizes = preprocess_images(batch_files)
        
        with torch.no_grad():
            image_features = model.encode_image(images)
        
        all_features.append(image_features.cpu().numpy())
        all_sizes.extend(sizes)
        print(f"Processed batch {i+1}/{num_batches}")
    
    return np.concatenate(all_features, axis=0), all_sizes

# 提取所有图像的特征
print("Starting feature extraction...")
image_features, image_sizes = extract_features(image_files)
print("Feature extraction completed.")

# 将特征向量保存为numpy数组
np.save('skin_features.npy', image_features)
print("Features saved to skin_features.npy")

# 打印原始图像尺寸和提取的特征维度
print(f"Example original size: {image_sizes[0]}")
print(f"Example feature shape: {image_features[0].shape}")
print(f"Example feature values: {image_features[0]}")


2298
Starting feature extraction...
Processed batch 1/72
Processed batch 2/72
Processed batch 3/72
Processed batch 4/72
Processed batch 5/72
Processed batch 6/72
Processed batch 7/72
Processed batch 8/72
Processed batch 9/72
Processed batch 10/72
Processed batch 11/72
Processed batch 12/72
Processed batch 13/72
Processed batch 14/72
Processed batch 15/72
Processed batch 16/72
Processed batch 17/72
Processed batch 18/72
Processed batch 19/72
Processed batch 20/72
Processed batch 21/72
Processed batch 22/72
Processed batch 23/72
Processed batch 24/72
Processed batch 25/72
Processed batch 26/72
Processed batch 27/72
Processed batch 28/72
Processed batch 29/72
Processed batch 30/72
Processed batch 31/72
Processed batch 32/72
Processed batch 33/72
Processed batch 34/72
Processed batch 35/72
Processed batch 36/72
Processed batch 37/72
Processed batch 38/72
Processed batch 39/72
Processed batch 40/72
Processed batch 41/72
Processed batch 42/72
Processed batch 43/72
Processed batch 44/72
Proce

In [11]:
!pip install tensorly
import tensorly as tl
from tensorly.decomposition import tucker
# 定义Tucker分解的秩 (例如，将512维度降到50维)
ranks = [50]

# 进行Tucker分解
core, tucker_factors = tucker(image_features, ranks=ranks)

# 打印Tucker分解后的核心张量和因子矩阵的形状
print("Core tensor shape:", core.shape)
for i, factor in enumerate(tucker_factors):
    print(f"Factor matrix {i} shape:", factor.shape)

# 将Tucker分解后的特征保存为numpy数组
np.save('skin_features_tucker.npy', core)
print("Tucker decomposed features saved to skin_features_tucker.npy")

# 打印原始图像尺寸和Tucker分解后的特征维度
print(f"Example original size: {image_sizes[0]}")
print(f"Example Tucker feature shape: {core.shape}")

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorly
  Downloading tensorly-0.8.1-py3-none-any.whl (229 kB)
Installing collected packages: tensorly
Successfully installed tensorly-0.8.1


TypeError: tucker() got an unexpected keyword argument 'ranks'

In [12]:
!pip install open_clip_torch
!pip install tensorly
import os
import shutil
from PIL import Image
import warnings
import numpy as np
import pandas as pd
import torch
import open_clip
import tensorly as tl
from tensorly.decomposition import parafac

warnings.filterwarnings('ignore')
# 设置设备
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载CLIP模型和预处理方法
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')

# 指定数据目录路径
path_to_data = 'C:\\Users\\huoqingz\\Desktop\\PAD-UFES-20'

# 定义目标文件夹
destination_folder = 'all_images'

# 创建目标文件夹
os.makedirs(destination_folder, exist_ok=True)

# 定义一个函数，将文件移动到目标文件夹
def move_files_to_dest_folder(path, dest):
    for root, _, files in os.walk(path):
        for file in files:
            src_file = os.path.join(root, file)
            dest_file = os.path.join(dest, file)
            shutil.copy(src_file, dest_file)

# 将每个子文件夹中的文件移动到目标文件夹
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_1'), destination_folder)
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_2'), destination_folder)
move_files_to_dest_folder(os.path.join(path_to_data, 'images', 'imgs_part_3'), destination_folder)

# 打印目标文件夹中的文件数量
print(len(os.listdir(destination_folder)))

# 获取图像文件列表
image_files = [os.path.join(destination_folder, f) for f in os.listdir(destination_folder) if f.endswith('.jpg') or f.endswith('.png')]

# 预处理图像并提取特征
def preprocess_images(image_paths):
    images = []
    original_sizes = []
    for image_path in image_paths:
        try:
            image = Image.open(image_path)
            original_sizes.append(image.size)  # 保存原始尺寸
            images.append(preprocess(image).unsqueeze(0))
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
    return torch.cat(images).to(device), original_sizes

# 批处理特征提取函数
def extract_features(image_files, batch_size=32):
    all_features = []
    all_sizes = []
    num_batches = len(image_files) // batch_size + (1 if len(image_files) % batch_size != 0 else 0)

    for i in range(num_batches):
        batch_files = image_files[i * batch_size:(i + 1) * batch_size]
        images, sizes = preprocess_images(batch_files)
        
        with torch.no_grad():
            image_features = model.encode_image(images)
        
        all_features.append(image_features.cpu().numpy())
        all_sizes.extend(sizes)
        print(f"Processed batch {i+1}/{num_batches}")
    
    return np.concatenate(all_features, axis=0), all_sizes

# 提取所有图像的特征
print("Starting feature extraction...")
image_features, image_sizes = extract_features(image_files)
print("Feature extraction completed.")

# 使用TOCA进行降维
# 将512维特征降维到16维
rank = 16

# TOCA分解
core, factors = parafac(image_features, rank=rank)

# 打印TOCA分解后的核心张量和因子矩阵的形状
print("Core tensor shape:", core.shape)
for i, factor in enumerate(factors):
    print(f"Factor matrix {i} shape:", factor.shape)

# 将TOCA分解后的特征保存为numpy数组
np.save('skin_features_toca.npy', core)
print("TOCA decomposed features saved to skin_features_toca.npy")

# 打印原始图像尺寸和TOCA分解后的特征维度
print(f"Example original size: {image_sizes[0]}")
print(f"Example TOCA feature shape: {core.shape}")


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
2298
Starting feature extraction...
Processed batch 1/72
Processed batch 2/72
Processed batch 3/72
Processed batch 4/72
Processed batch 5/72
Processed batch 6/72
Processed batch 7/72
Processed batch 8/72
Processed batch 9/72
Processed batch 10/72
Processed batch 11/72
Processed batch 12/72
Processed batch 13/72
Processed batch 14/72
Processed batch 15/72
Processed batch 16/72
Processed batch 17/72
Processed batch 18/72
Processed batch 19/72
Processed batch 20/72
Processed batch 21/72
Processed batch 22/72
Processed batch 23/72
Processed batch 24/72
Processed batch 25/72
Processed batch 26/72
Processed batch 27/72
Processed batch 28/72
Processed batch 29/72
Processed batch 30/72
Processed batch 31/72
Processed batch 32/72
Processed batch 33/72
Processed batch 34/72
Processed batch 35/72
Processed batch 36/72
Processed batch 37/72
Pro

In [15]:
# 使用TOCA进行降维
# 将512维特征降维到16维
rank = 16

# TOCA分解
core, factors = parafac(image_features, rank=rank)

# 打印TOCA分解后的核心张量和因子矩阵的形状
print("Core tensor shape:", core.shape)
for i, factor in enumerate(factors):
    print(f"Factor matrix {i} shape:", factor.shape)

# 提取降维后的特征
# 通常会使用因子矩阵中的特征进行进一步分析
toca_features = factors[0]

# 将TOCA分解后的特征保存为numpy数组
np.save('skin_features_toca.npy', toca_features)
print("TOCA decomposed features saved to skin_features_toca.npy")

# 打印原始图像尺寸和TOCA分解后的特征维度
print(f"Example original size: {image_sizes[0]}")
print(f"Example TOCA feature shape: {toca_features.shape}")


Core tensor shape: (16,)
Factor matrix 0 shape: (2298, 16)
Factor matrix 1 shape: (512, 16)
TOCA decomposed features saved to skin_features_toca.npy
Example original size: (609, 609)
Example TOCA feature shape: (2298, 16)


In [16]:
# 读取metadata
metadata = pd.read_csv(os.path.join(path_to_data, 'metadata.csv'))

# 打印metadata的前几行
print(metadata.head())

# 添加图像文件的完整路径到metadata中
metadata['full_link'] = os.path.join(path_to_data, 'Dataprep', 'all_images', '') + metadata['img_id']

# 加载TOCA分解后的特征
toca_features = np.load('skin_features_toca.npy')

# 将特征转换为DataFrame
feature_columns = [f'feature_{i+1}' for i in range(toca_features.shape[1])]
features_df = pd.DataFrame(toca_features, columns=feature_columns)

# 合并图像特征和metadata
metadata_combined = pd.concat([metadata, features_df], axis=1)

# 保存合并后的数据
metadata_combined.to_csv(os.path.join(path_to_data, 'metadata_with_toca_features.csv'), index=False)
print("Combined metadata and features saved to metadata_with_toca_features.csv")

# 打印合并后的表格
print(metadata_combined.head())


  patient_id  lesion_id  smoke  drink background_father background_mother  \
0   PAT_1516       1765    NaN    NaN               NaN               NaN   
1     PAT_46        881  False  False         POMERANIA         POMERANIA   
2   PAT_1545       1867    NaN    NaN               NaN               NaN   
3   PAT_1989       4061    NaN    NaN               NaN               NaN   
4    PAT_684       1302  False   True         POMERANIA         POMERANIA   

   age pesticide  gender skin_cancer_history  ... diameter_2 diagnostic  \
0    8       NaN     NaN                 NaN  ...        NaN        NEV   
1   55     False  FEMALE                True  ...        5.0        BCC   
2   77       NaN     NaN                 NaN  ...        NaN        ACK   
3   75       NaN     NaN                 NaN  ...        NaN        ACK   
4   79     False    MALE                True  ...        5.0        BCC   

    itch   grew   hurt  changed  bleed elevation                 img_id  \
0  False  F