In [5]:
import pandas as pd
import os
import cv2
import numpy as np
import shutil
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample


In [6]:
# Read the HAM10000 skin-lesion metadata CSV into a DataFrame
metadata_csv = './Skin Dataset/HAM10000_metadata.csv'
metadata = pd.read_csv(metadata_csv)

In [8]:
# Split the metadata into one frame per diagnosis code in the `dx` column
df1, df2, df3, df4, df5, df6, df7 = (
    metadata[metadata['dx'] == dx_code]
    for dx_code in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df')
)

In [33]:
# Draw the same number of rows from every class, sampling with replacement
samples = 130

(df1_balanced, df2_balanced, df3_balanced, df4_balanced,
 df5_balanced, df6_balanced, df7_balanced) = (
    resample(class_frame, replace=True, n_samples=samples, random_state=2)
    for class_frame in (df1, df2, df3, df4, df5, df6, df7)
)

In [34]:
# Display the resampled 'mel' rows as a sanity check
df2_balanced

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
1738,HAM_0006824,ISIC_0032958,mel,histo,70.0,male,lower extremity
1704,HAM_0006566,ISIC_0033569,mel,histo,65.0,male,back
1510,HAM_0007343,ISIC_0033946,mel,histo,70.0,male,chest
1677,HAM_0007086,ISIC_0033863,mel,histo,65.0,female,back
2310,HAM_0003102,ISIC_0032389,mel,histo,65.0,male,face
...,...,...,...,...,...,...,...
2150,HAM_0004507,ISIC_0029958,mel,histo,65.0,female,face
1920,HAM_0001570,ISIC_0032872,mel,histo,55.0,male,back
2298,HAM_0004746,ISIC_0028764,mel,histo,65.0,female,back
1333,HAM_0001531,ISIC_0024951,mel,histo,70.0,male,back


In [35]:
# Stack the seven balanced per-class frames into a single table
balanced_frames = [
    df1_balanced, df2_balanced, df3_balanced, df4_balanced,
    df5_balanced, df6_balanced, df7_balanced,
]
merged = pd.concat(balanced_frames)

In [36]:
# Count rows per diagnosis code to confirm the classes are balanced
merged['dx'].value_counts()

dx
nv       130
mel      130
bkl      130
bcc      130
akiec    130
vasc     130
df       130
Name: count, dtype: int64

In [38]:
# Resize every sampled skin image and write it as a JPEG into a folder named
# after its diagnosis code (`dx`).
image_size = 250

# NOTE(review): X and y are never filled by this cell; they are kept only in
# case other cells still reference these names.
X = []
y = []

src_dir = './Skin Dataset/Skin Cancer/Skin Cancer'
dst_dir = './Skin Dataset/Dataset-250/'
# Report progress every N rows; `merged` holds 7 x `samples` rows.
progress_every = 100

# `count` numbers the output files; it advances only after a successful write.
count = 1
for position, (index, row) in enumerate(merged.iterrows()):
    # `index` is the original metadata label of a randomly resampled row, so
    # progress is reported from the loop position instead.
    if position % progress_every == 0:
        print(f"Processing image {position}")

    img_id = row['image_id'] + '.jpg'
    img_path = os.path.join(src_dir, img_id)
    if not os.path.exists(img_path):
        print(f"Image file does not exist: {img_id}")
        continue

    # cv2.imread signals an unreadable file by returning None.
    img = cv2.imread(img_path)
    if img is None:
        print(f"Error loading image: {img_path}")
        continue
    img = cv2.resize(img, (image_size, image_size))

    class_dir = os.path.join(dst_dir, row['dx'])
    os.makedirs(class_dir, exist_ok=True)
    out_path = os.path.join(class_dir, '{}.jpg'.format(count))
    # cv2.imwrite reports failure through its return value; do not count a
    # file that was not written.
    if not cv2.imwrite(out_path, img):
        print(f"Error writing image: {out_path}")
        continue
    count += 1


## Data Augmentation

In [9]:
def dataGenerator(type_, number, max_images=3400):
    '''
    Write randomly augmented JPEG copies of one class of breast images.

    Every `.png` file in `./Breast Dataset/Dataset/{type_}/` is loaded, resized
    to 250x250 and passed through a Keras `ImageDataGenerator`; the augmented
    images are saved to `./Breast Dataset/Augmented-Data/{type_}`.

     type_ :str
        class sub-folder name, ex 'benign'
     number :int
        augmented copies written per source image; values below 1 write nothing
     max_images :int or None
        stop visiting further source files once more than this many augmented
        images have been written (checked after each file); None disables the cap

    Returns None; the only results are the files written to disk.
    '''
    # Imported locally so the notebook's other cells do not need tensorflow.
    import tensorflow as tf
    from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img

    src_dir = f'./Breast Dataset/Dataset/{type_}/'
    out_dir = f'./Breast Dataset/Augmented-Data/{type_}'
    # The generator saves straight into `save_to_dir`, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)

    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        fill_mode='nearest',
        horizontal_flip=True,
    )

    counter = 0
    # Sorted so the files reached before the cap do not depend on listing order.
    for filename in sorted(os.listdir(src_dir)):
        if filename.endswith('.png'):
            img = load_img(os.path.join(src_dir, filename))
            resized_image = tf.image.resize(img, tf.constant([250, 250]))
            x = img_to_array(resized_image)
            # flow() expects a batch axis in front: (1, height, width, channels).
            x = x.reshape((1,) + x.shape)

            # Keras names saved files `<prefix>_<index>_<random int>`; with a
            # single-image array the index is always 0, so a shared prefix lets
            # augmentations of different sources overwrite each other. Adding
            # the source stem confines any clash to copies of the same image.
            stem = os.path.splitext(filename)[0]
            flow = datagen.flow(x, batch_size=1, save_to_dir=out_dir,
                                save_prefix=f'brst_{stem}', save_format='jpg')
            # flow() never ends by itself; pull exactly `number` batches
            # (none at all when number < 1, instead of looping forever).
            for _ in range(number):
                next(flow)
                counter += 1

        if max_images is not None and counter > max_images:
            break

In [10]:
# Write two augmented copies of every 'benign' image
dataGenerator(type_='benign', number=2)

## Data Splitting

In [12]:
import splitfolders

# Copy (not move) the class folders under Augmented-Data into three subsets
# in a 60/20/20 ratio, using a fixed seed.
# NOTE(review): the split happens after augmentation, so augmented variants of
# one source image may land in different subsets — confirm this is intended.
splitfolders.ratio(
    "./Breast Dataset/Augmented-Data/",
    output="./Breast Dataset/Splitted",
    seed=1337,
    ratio=(.60, .20, .20),
    group_prefix=None,
    move=False,
)

Copying files: 10608 files [00:06, 1598.68 files/s]


## Image Resizing for Datasets

In [11]:
# Resize every PNG of the breast dataset to image_size x image_size and save it
# as a JPEG under the same class folder name in the destination directory.
# (`os` and `cv2` come from the notebook's first import cell.)
img_path = './Breast Dataset/Dataset/'
dst_dir = './Breast Dataset/Augmented-Data/'
image_size = 250

for foldername in os.listdir(img_path):
    class_src = os.path.join(img_path, foldername)
    # Skip stray files in the dataset root; only class folders hold images.
    if not os.path.isdir(class_src):
        continue
    class_dst = os.path.join(dst_dir, foldername)

    for filename in os.listdir(class_src):
        if not filename.endswith('.png'):
            continue

        src_file = os.path.join(class_src, filename)
        # cv2.imread signals an unreadable file by returning None.
        img = cv2.imread(src_file)
        if img is None:
            # Report the file that failed, not the dataset root.
            print(f"Error loading image: {src_file}")
            continue

        img = cv2.resize(img, (image_size, image_size))

        os.makedirs(class_dst, exist_ok=True)
        # The output keeps the full source name plus '.jpg' (e.g. 'a.png.jpg'),
        # so re-running overwrites files from earlier runs instead of adding
        # duplicates.
        out_file = os.path.join(class_dst, '{}.jpg'.format(filename))
        if not cv2.imwrite(out_file, img):
            print(f"Error writing image: {out_file}")
    print(foldername + ' Done')

benign Done
malignant Done
