In [16]:
# Surpress warnings:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import numpy as np
import pandas as pd
import sys
import os
import PIL
from pathlib import Path
from PIL import Image
import imagehash
from math import sqrt
import copy

import cv2
from skimage.metrics import structural_similarity as ssim

In [17]:
def compare_patient_skin_images(image1_path, image2_path):
    # Load the images
    img1 = Image.open(image1_path)
    img2 = Image.open(image2_path)

    # Standardize images to a common size for comparison
    standard_size = (640, 480)  # Adjust based on your typical image resolution
    img1_resized = img1.thumbnail(standard_size)
    img2_resized = img2.thumbnail(standard_size)

    # Convert to RGB if needed
    if img1_resized.mode != 'RGB':
        img1_resized = img1_resized.convert('RGB')
    if img2_resized.mode != 'RGB':
        img2_resized = img2_resized.convert('RGB')

    # Calculate multiple perceptual hashes (more robust for medical images)
    phash1 = imagehash.phash(img1_resized)
    phash2 = imagehash.phash(img2_resized)
    phash_diff = phash1 - phash2

    # Average hash is more sensitive to color changes (important for skin conditions)
    ahash1 = imagehash.average_hash(img1_resized, hash_size=12)  # Larger hash size for more detail
    ahash2 = imagehash.average_hash(img2_resized, hash_size=12)
    ahash_diff = ahash1 - ahash2

    # Wavelet hash is good for capturing texture differences
    whash1 = imagehash.whash(img1_resized)
    whash2 = imagehash.whash(img2_resized)
    whash_diff = whash1 - whash2

    # Color histogram comparison (important for skin tone/condition changes)
    hist1 = img1_resized.histogram()
    hist2 = img2_resized.histogram()
    hist_diff = sqrt(sum((a - b) ** 2 for a, b in zip(hist1, hist2)) / len(hist1))

    return {
        "perceptual_hash_diff": phash_diff,
        "average_hash_diff": ahash_diff,
        "wavelet_hash_diff": whash_diff,
        "histogram_diff": hist_diff,
        "combined_hash_score": (phash_diff + ahash_diff + whash_diff) / 3
    }

def evaluate_skin_condition_similarity(image1_path, image2_path):
    result = compare_patient_skin_images(image1_path, image2_path)

    # Medical image thresholds - more lenient than general photo comparison
    # because we expect some variation in conditions
    phash_threshold = 15  # More lenient for skin images
    ahash_threshold = 18  # Color-sensitive hash
    whash_threshold = 15  # Texture-sensitive hash
    hist_threshold = 400  # More lenient for lighting variations
    combined_threshold = 15

    # Print detailed results
    print(f"Perceptual hash difference: {result['perceptual_hash_diff']}")
    print(f"Average hash difference: {result['average_hash_diff']}")
    print(f"Wavelet hash difference: {result['wavelet_hash_diff']}")
    print(f"Histogram difference: {result['histogram_diff']}")
    print(f"Combined hash score: {result['combined_hash_score']}")

    # Assess condition similarity using multiple metrics
    if (result['combined_hash_score'] < combined_threshold or
            (result['perceptual_hash_diff'] < phash_threshold and
             result['wavelet_hash_diff'] < whash_threshold)):
        print("Assessment: Images show similar skin condition")
        return True
    else:
        print("Assessment: Images likely show different or changed skin condition")
        return False

In [18]:
duplicates_df = pd.read_csv('../data/ISIC_2020_Training_Duplicates.csv')
duplicates_df

Unnamed: 0,image_name_1,image_name_2
0,ISIC_0079038,ISIC_8521950
1,ISIC_0087297,ISIC_4755972
2,ISIC_0088137,ISIC_4201955
3,ISIC_0112097,ISIC_5934021
4,ISIC_0148783,ISIC_7460560
...,...,...
420,ISIC_8879370,ISIC_9596146
421,ISIC_8889856,ISIC_9677697
422,ISIC_8987085,ISIC_9751079
423,ISIC_9218360,ISIC_9913406


In [19]:
metadata_df = pd.read_csv('../data/ISIC_2020_Training_GroundTruth_v2.csv')
metadata_df

Unnamed: 0,image_name,patient_id,lesion_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0
...,...,...,...,...,...,...,...,...,...
33121,ISIC_9999134,IP_6526534,IL_2076932,male,50.0,torso,unknown,benign,0
33122,ISIC_9999320,IP_3650745,IL_6891604,male,65.0,torso,unknown,benign,0
33123,ISIC_9999515,IP_2026598,IL_6364820,male,20.0,lower extremity,unknown,benign,0
33124,ISIC_9999666,IP_7702038,IL_6048457,male,50.0,lower extremity,unknown,benign,0


In [20]:
grouped_metadata = metadata_df['patient_id'].value_counts().reset_index()
grouped_metadata

Unnamed: 0,patient_id,count
0,IP_7279968,115
1,IP_4382720,115
2,IP_4938382,115
3,IP_4479736,115
4,IP_0656529,114
...,...,...
2051,IP_7765635,3
2052,IP_3424938,3
2053,IP_8003098,3
2054,IP_3645134,3


In [21]:
duplicates_new_df = pd.DataFrame(columns=['image_name_1', 'image_name_2'])

In [24]:
unique_ids = grouped_metadata['patient_id'].tolist()
path = '../../Lumen_Image_Data/train/'
for id in unique_ids:
    
    patient_images = (metadata_df.where(metadata_df['patient_id'] == id)
                                .dropna(axis=0)
                                .reset_index()['image_name']
                                .tolist())
    
    for image1 in patient_images:
        
        if len(patient_images) <= 1:
            break
        
        image_1 = path + image1 + '.jpg'
        patient_images_subset = copy.deepcopy(patient_images)
        
        patient_images_subset.remove(image1)
        patient_images.remove(image1)
        
        for image2 in patient_images_subset:
            
            if len(patient_images_subset) <= 1:
                break
            
            image_2 = path + image2 + '.jpg'
            images_similar = evaluate_skin_condition_similarity(image_1, image_2)
            
            if images_similar:
                patient_images_subset.remove(image2)
                patient_images.remove(image2)

                duplicates_new_df.loc[len(duplicates_new_df)] = [image1, image2]
        

AttributeError: 'NoneType' object has no attribute 'mode'