In [16]:
import cv2
import pandas as pd
import numpy as np
import random
from skimage.transform import rotate
from skimage.feature import graycomatrix, graycoprops, hog
from skimage.measure import label, regionprops
from skimage.color import rgb2gray
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import os

In [17]:
def remove_hair(image, size=(600, 450)):
    image = cv2.resize(image, size)
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
    _, mask = cv2.threshold(blackhat, 10, 255, cv2.THRESH_BINARY)
    cleaned = cv2.inpaint(image, mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
    return cleaned

In [18]:
def augment_image(image):
    """Applies a random selection of augmentations to an image."""
    if random.choice([True, False]):
        image = cv2.flip(image, 1)  # Horizontal Flip
    if random.choice([True, False]):
        image = cv2.flip(image, 0)  # Vertical Flip

    # Random rotation
    angle = random.uniform(-15, 15)
    image = rotate(image, angle, resize=False, preserve_range=True, mode='reflect')
    image = image.astype(np.uint8)

    # Brightness adjustment
    hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
    h, s, v = cv2.split(hsv)
    value = random.uniform(0.8, 1.2)
    v = cv2.multiply(v, np.array([value]))
    v = np.clip(v, 0, 255).astype(np.uint8)
    final_hsv = cv2.merge((h, s, v))
    image = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2RGB)

    return image

def extract_features_from_array(image_rgb):
    """Extracts features from an in-memory RGB image array."""
    # Preprocessing
    image = cv2.resize(image_rgb, (224, 224))
    image = remove_hair(image)
    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    features = []

    hsv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, [8, 8, 8], [0, 180, 0, 256, 0, 256])
    features.extend(cv2.normalize(hist, hist).flatten())

    glcm = graycomatrix(gray, [1], [0, np.pi/4, np.pi/2], symmetric=True, normed=True)
    for prop in ['contrast', 'correlation', 'energy', 'homogeneity']:
        features.append(graycoprops(glcm, prop)[0].mean())

    edges = cv2.Canny(gray, 100, 200)
    features.append(np.sum(edges > 0) / (224 * 224))

    hog_feat, _ = hog(rgb2gray(image), pixels_per_cell=(16, 16),
                      cells_per_block=(2, 2), visualize=True, feature_vector=True)
    features.extend(hog_feat[:100])

    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    props = regionprops(label(binary))
    if props:
        largest = max(props, key=lambda x: x.area)
        area, perimeter = largest.area, largest.perimeter
        circularity = (4 * np.pi * area) / (perimeter**2 + 1e-5)
        features.extend([area / (224*224), perimeter / (224*4), circularity])
    else:
        features.extend([0, 0, 0])

    return np.array(features)

In [None]:
base_dir = '/Users/conorhuh/Desktop/Berkeley/281/fp/skin-cancer-mnist-ham10000/' 
df = pd.read_csv(os.path.join(base_dir, 'HAM10000_metadata.csv'))

image_paths = {os.path.splitext(os.path.basename(x))[0]: os.path.join(base_dir, 'ham10000_images_part_1', x)
               for x in os.listdir(os.path.join(base_dir, 'ham10000_images_part_1'))}
image_paths.update({os.path.splitext(os.path.basename(x))[0]: os.path.join(base_dir, 'ham10000_images_part_2', x)
                    for x in os.listdir(os.path.join(base_dir, 'ham10000_images_part_2'))})
df['path'] = df['image_id'].map(image_paths.get)
df.dropna(inplace=True) 

# 70% train, 30% temp (for val/test)
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df['dx'], random_state=42)

# Split the 30% temp set into 15% validation and 15% test
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df['dx'], random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


# --- 2. Oversample the TRAINING Set ---
print("\n--- Processing and Oversampling Training Set ---")
# Get the new majority class count from the training set
train_majority_count = train_df['dx'].value_counts().max()
# Find the name of the majority class
majority_class_name = train_df['dx'].value_counts().idxmax()

target_counts = {dx_class: train_majority_count for dx_class in df['dx'].unique()}

X_train, y_train = [], []

for dx_class, target_count in target_counts.items():
    print(f"Processing class: {dx_class}")
    class_df = train_df[train_df['dx'] == dx_class]
    original_images = class_df['path'].tolist()
    current_count = len(original_images)

    if dx_class == majority_class_name:
        # For the MAJORITY class, replace each original image with an augmented version
        # This teaches the model transformation invariance for this class.
        for image_path in tqdm(original_images, desc=f"Augmenting majority {dx_class}"):
            original_image = cv2.imread(image_path)
            original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
            augmented_image = augment_image(original_image) # Augment every image
            features = extract_features_from_array(augmented_image)
            X_train.append(features)
            y_train.append(dx_class)
    else:
        # For MINORITY classes, add the originals and then augment to reach the target count
        # 1. Add original images
        for image_path in tqdm(original_images, desc=f"Original {dx_class}"):
            image = cv2.imread(image_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            features = extract_features_from_array(image)
            X_train.append(features)
            y_train.append(dx_class)

        # 2. Augment to fill the gap
        num_to_generate = target_count - current_count
        if num_to_generate > 0:
            for i in tqdm(range(num_to_generate), desc=f"Augmenting minority {dx_class}"):
                random_path = random.choice(original_images)
                original_image = cv2.imread(random_path)
                original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
                augmented_image = augment_image(original_image)
                features = extract_features_from_array(augmented_image)
                X_train.append(features)
                y_train.append(dx_class)


# --- 3. Process the VALIDATION and TEST Sets (NO Augmentation) ---
def process_set(dataframe, set_name):
    """Helper function to extract features for a given dataframe."""
    print(f"\n--- Processing {set_name} Set ---")
    X, y = [], []
    for index, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc=f"Processing {set_name}"):
        image = cv2.imread(row['path'])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        features = extract_features_from_array(image)
        X.append(features)
        y.append(row['dx'])
    return np.array(X), np.array(y)

X_val, y_val = process_set(val_df, "Validation")
X_test, y_test = process_set(test_df, "Test")

X_train = np.array(X_train)
y_train = np.array(y_train)

np.savez('skin_cancer_datasets.npz',
         X_train=X_train, y_train=y_train,
         X_val=X_val, y_val=y_val,
         X_test=X_test, y_test=y_test)

print("\nAll datasets saved to 'skin_cancer_datasets.npz'")
print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
print(f"Validation set shape: X={X_val.shape}, y={y_val.shape}")
print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

Training set size: 6970
Validation set size: 1494
Test set size: 1494

--- Processing and Oversampling Training Set ---
Processing class: bkl


Original bkl: 100%|██████████| 762/762 [02:01<00:00,  6.29it/s]
Augmenting bkl:   0%|          | 0/3900 [00:00<?, ?it/s]


AttributeError: module 'cv2' has no attribute 'COLOR_RGB_HSV'