**Data cleaning**

In [1]:
import pandas as pd
import shutil
import os
from PIL import Image
import imagehash
import re

In [2]:
image_df = pd.read_csv('Data/data1-294.csv')
image_df.head()

Unnamed: 0,skin_disorder_name,images
0,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
1,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
2,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
3,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
4,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...


## **Cleaning  acne images**

In [3]:
acne_labels = image_df[image_df['skin_disorder_name'].str.contains('acne')]['skin_disorder_name'].unique()
acne_labels

array(['acne affecting the back images', 'acne affecting the face images',
       'acne and other follicular disorder images', 'facial acne images',
       'infantile acne images', 'steroid acne images'], dtype=object)

In [4]:
# There are six labels representing acne
len(acne_labels)

6

In [5]:
# Crating a dataframe with just acne labels for easier cleaning

acne_df = image_df[(image_df['skin_disorder_name'] == acne_labels[0]) | \
                   (image_df['skin_disorder_name'] == acne_labels[1]) | \
                   (image_df['skin_disorder_name'] == acne_labels[2]) | \
                   (image_df['skin_disorder_name'] == acne_labels[3]) | \
                   (image_df['skin_disorder_name'] == acne_labels[4]) | \
                   (image_df['skin_disorder_name'] == acne_labels[5]) 
                 ]
acne_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 718 entries, 0 to 12648
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  718 non-null    object
 1   images              718 non-null    object
dtypes: object(2)
memory usage: 16.8+ KB


**extra acne images:**

In [6]:
extra_acne = [image_name for image_name in os.listdir('extra_acne_images')]
extra_acne[:10]

['07Acne081101.jpg',
 '07Acne0811011 - Copy.jpg',
 '07Acne0811011.jpg',
 '07AcnePittedScars.jpg',
 '07AcnePittedScars1 - Copy.jpg',
 '07AcnePittedScars1.jpg',
 '07RosaceaFulFAce.jpg',
 '07RosaceaFulFAce1 - Copy.jpg',
 '07RosaceaFulFAce1.jpg',
 '07RosaceaOK0828063.jpg']

In [7]:
#Creating a dataframe for the extra acne images
label =['acne' for img in extra_acne]
extra_acne_df = pd.DataFrame(extra_acne, label).reset_index()
extra_acne_df.columns =['skin_disorder_name', 'images']
extra_acne_df.head()

Unnamed: 0,skin_disorder_name,images
0,acne,07Acne081101.jpg
1,acne,07Acne0811011 - Copy.jpg
2,acne,07Acne0811011.jpg
3,acne,07AcnePittedScars.jpg
4,acne,07AcnePittedScars1 - Copy.jpg


**Moving acne images to their own folder**

In [8]:
# Getting the acne images file names
acne_img = [image_name for image_name in os.listdir('Images/') if 'acne' in image_name] 

# Confirming the number of acne images before any cleaning
print('There are', len(acne_img),'acne images')
acne_img[:10]

There are 718 acne images


['acne affecting the back images0.jpg',
 'acne affecting the back images1.jpg',
 'acne affecting the back images10.jpg',
 'acne affecting the back images11.jpg',
 'acne affecting the back images12.jpg',
 'acne affecting the back images13.jpg',
 'acne affecting the back images14.jpg',
 'acne affecting the back images15.jpg',
 'acne affecting the back images16.jpg',
 'acne affecting the back images17.jpg']

In [9]:
# Creating a new folder with just acne images to make cleaning easier

os.mkdir('acne_images/')
for img in acne_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join('acne_images/', img)
    shutil.copy(origin, destination)

In [10]:
# Confirming that the number of acne images after moving them to a separate folder is still 718
original_acne_img = [image_name for image_name in os.listdir('acne_images/')] 
print('There are', len(original_acne_img),'acne images')

There are 718 acne images


**Dropping links from the 'images' column in the acne and replacing them with the image name**

In [11]:
# So that the two dataframes can match, we dropped the image links in  acne_df 
# and replaced them with the image names

acne_images = pd.DataFrame(acne_img, columns=['images'])
acne_df = acne_df.copy()
acne_df.drop('images', axis=1, inplace=True)
acne_df['images'] = acne_images['images'].values
acne_df.head()

Unnamed: 0,skin_disorder_name,images
0,acne affecting the back images,acne affecting the back images0.jpg
1,acne affecting the back images,acne affecting the back images1.jpg
2,acne affecting the back images,acne affecting the back images10.jpg
3,acne affecting the back images,acne affecting the back images11.jpg
4,acne affecting the back images,acne affecting the back images12.jpg


**Joining the two dataframes**

In [16]:
# Creating a dataframe with all of the acne images

acne_df_complete = pd.concat([acne_df, extra_acne_df], axis=0).reset_index()
acne_df_complete.drop('index', axis=1, inplace=True)
print(acne_df_complete.info())
acne_df_complete.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2540 entries, 0 to 2539
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  2540 non-null   object
 1   images              2540 non-null   object
dtypes: object(2)
memory usage: 39.8+ KB
None


Unnamed: 0,skin_disorder_name,images
0,acne affecting the back images,acne affecting the back images0.jpg
1,acne affecting the back images,acne affecting the back images1.jpg
2,acne affecting the back images,acne affecting the back images10.jpg
3,acne affecting the back images,acne affecting the back images11.jpg
4,acne affecting the back images,acne affecting the back images12.jpg
...,...,...
2535,acne,unnamed.jpg
2536,acne,uq3mypax59u51.png
2537,acne,What-Is-Adult-Acne.jpg
2538,acne,What-is-Cystic-Acne-and-Why-is-It-Caused (1).jpg


**Combining the images into one folder**

In [17]:
# This was done by moving the extra images into the acne folder
for img in extra_acne:
    origin = os.path.join('extra_acne_images/', img)
    destination = os.path.join('acne_images/', img)
    shutil.copy(origin, destination)

In [18]:
# Confirming that the total acne images is 2540 before any cleaning

acne_img = [image_name for image_name in os.listdir('acne_images/')] 
print('There are a total of', len(acne_img),'acne images')

There are a total of 2540 acne images


**Removing duplicated images from the folder**

In [19]:
# Define a threshold for image similarity
threshold = 8


# Define a dictionary to store the hash values and file paths of the images
image_hashes = {}
duplicated_images = []
# Loop through all the image files in a directory
for filename in os.listdir("acne_images"):
    # Load the image file
    image = Image.open(os.path.join("acne_images", filename))

    # Compute the hash value of the image using the average hash algorithm
    hash_value = imagehash.average_hash(image)

    # Check if the hash value is already in the dictionary
    if hash_value in image_hashes:
        # If a similar hash value already exists, delete the duplicate image
        duplicated_images.append(filename)
        os.remove(os.path.join("acne_images", filename))
    else:
        # Otherwise, add the hash value and file path to the dictionary
        image_hashes[hash_value] = os.path.join("acne_images", filename)

In [20]:
acne_img = [image_name for image_name in os.listdir('acne_images/')] 
print('There are', len(acne_img),'acne images after removing duplicated images')

There are 2169 acne images after removing duplicated images


In [21]:
# Getting the indexes of the duplicated images so that they can be dropped from the acne_df_complete too.

duplicated_indexes = [acne_df_complete[acne_df_complete['images'] == image_name].index[0] \
                      for image_name in acne_df_complete['images']\
                      if image_name in duplicated_images]
duplicated_indexes[:10]

[369, 377, 394, 430, 448, 449, 450, 451, 452, 453]

In [22]:
# Dropping duplicated images from the dataframe.
acne_df_complete = acne_df_complete.copy()
acne_df_complete.drop(index=duplicated_indexes, inplace=True)
acne_df_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2169 entries, 0 to 2538
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  2169 non-null   object
 1   images              2169 non-null   object
dtypes: object(2)
memory usage: 50.8+ KB


Acne affecting the back images, Acne affecting the face images all have correct images. The Only change that will be made is changing the name to acne. </br>
Acne and other follicular disorder images has a collection of different images. Only images that have acne as a specific label will be included, the others will be dropped from the dataset. </br>
***
***Dealing with the collection of different images in Acne and other follicular disorder images***</br>
After careful evaluation of the images, the images that represent acne are:</br>
>>> ***[299, 301, 302, 305, 306, 312, 324, 327, 331, 332, 334, 335, 336,</br>, 340, 352, 353, 356, 357, 358, 360, 365, 370, 379, 383, 386, 391, </br>, 394, 399, 400, 401, 404, 406, 407, 410, 410, 412, 414, 418, 435, </br> 439, 440, 442]***

In [23]:
# Note 📝: The indexes were confirmed to be the same even after merging the two dataframes
        #: This is because the acne_df is at the top in the complete dataframe

# indexes of the images in 'acne and other follicular disorder images'
indexes = acne_df[acne_df['skin_disorder_name'] == 'acne and other follicular disorder images'].index

# indexes of the acne images in 'acne and other follicular disorder images'
acne_indexes = [299, 301, 302, 305, 306, 312, 324, 327, 331, 332, 334, 335, 336,
                340, 352, 353, 356, 357, 358, 360, 365, 370, 379, 383, 386, 391,
                394, 399, 400, 401, 404, 406, 407, 410, 410, 412, 414, 418, 435,
                439, 440, 442]

# indexes of the other follicular disorder images in 'acne and other follicular disorder images'. This indexes will be dropped.
to_drop = []

for index in  indexes:
    if (index not in acne_indexes) and (index not in duplicated_indexes):
        to_drop.append(index)

# dropping indexes in to_drop
acne_df_complete.drop(to_drop, axis = 0, inplace=True)

In [24]:
# After dropping non-acne images, we still have 328 images left
acne_df_complete.shape

(2060, 2)

In [25]:
# dropping those images from the acne_images folder

# Finding the image file names to be dropped from the folder
img_to_drop = []

for index in to_drop:
    for img_name in original_acne_img:
        if str(index) in img_name:
            img_to_drop.append(img_name)

# Dropping those images form the acne_images folder
for filename in img_to_drop:
    os.remove(os.path.join("acne_images", filename))

# Confirming that the number of images left is 328
acne_img = [image_name for image_name in os.listdir('acne_images/')] 
print('There are', len(acne_img),'acne images left.')

There are 2060 acne images left.


**Changing the label to just acne**

In [26]:
acne_df_complete['skin_disorder_name'] = 'acne'
acne_df_complete

Unnamed: 0,skin_disorder_name,images
0,acne,acne affecting the back images0.jpg
1,acne,acne affecting the back images1.jpg
2,acne,acne affecting the back images10.jpg
3,acne,acne affecting the back images11.jpg
4,acne,acne affecting the back images12.jpg
...,...,...
2533,acne,uaxnc5tsay351.jpg
2534,acne,unnamed (1).jpg
2536,acne,uq3mypax59u51.png
2537,acne,What-Is-Adult-Acne.jpg


## **Cleaning  Eczema images**

In [31]:
eczema_labels = image_df[(image_df['skin_disorder_name'].str.contains('eczema')) | \
                         (image_df['skin_disorder_name'].str.contains('atopic dermatitis images')) |\
                         (image_df['skin_disorder_name'].str.contains('hand dermatitis images')) |\
                         (image_df['skin_disorder_name'] == 'dermatitis images') |\
                         (image_df['skin_disorder_name'].str.contains('nummular dermatitis images'))] \
                         ['skin_disorder_name'].unique()
len(eczema_labels)

8

In [32]:
# Crating a dataframe with just eczema labels for easier cleaning

eczema_df = image_df[(image_df['skin_disorder_name'] == eczema_labels[0]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[1]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[2]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[3]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[4]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[5]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[6]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[7]) 
                 ]
eczema_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 631 entries, 1058 to 8989
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  631 non-null    object
 1   images              631 non-null    object
dtypes: object(2)
memory usage: 14.8+ KB


**i. Moving eczema images to their own folder**

In [33]:
# Getting the eczema images file names
eczema_img = [image_name for image_name in os.listdir('Images/') if ('eczema' in image_name) |
                                                                    ('atopic dermatitis images' in image_name) |
                                                                    ('hand dermatitis images' in image_name) | 
                                                                    (image_name.startswith('dermatitis images'))|
                                                                    ('nummular dermatitis images' in image_name)
                                                                     ] 

# Confirming the number of eczema images before any cleaning
print('There are', len(eczema_img),'eczema images')
eczema_img[:10]

There are 631 eczema images


['atopic dermatitis images1058.jpg',
 'atopic dermatitis images1059.jpg',
 'atopic dermatitis images1060.jpg',
 'atopic dermatitis images1061.jpg',
 'atopic dermatitis images1062.jpg',
 'atopic dermatitis images1063.jpg',
 'atopic dermatitis images1064.jpg',
 'atopic dermatitis images1065.jpg',
 'atopic dermatitis images1066.jpg',
 'atopic dermatitis images1067.jpg']

In [35]:
# Creating a new folder with just eczema images to make cleaning easier

os.mkdir('eczema_images/')
for img in eczema_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join('eczema_images/', img)
    shutil.copy(origin, destination)

In [36]:
# Confirming that the number of eczema images after moving them to a separate folder is still 631
eczema_img = [image_name for image_name in os.listdir('eczema_images/')] 
print('There are', len(eczema_img),' eczema images')

There are 631  eczema images


**ii. Removing duplicated images from the folder**

In [37]:
# Define a threshold for image similarity
threshold = 8


# Define a dictionary to store the hash values and file paths of the images
image_hashes = {}
duplicated_images = []
# Loop through all the image files in a directory
for filename in os.listdir("eczema_images"):
    # Load the image file
    image = Image.open(os.path.join("eczema_images", filename))

    # Compute the hash value of the image using the average hash algorithm
    hash_value = imagehash.average_hash(image)

    # Check if the hash value is already in the dictionary
    if hash_value in image_hashes:
        # If a similar hash value already exists, delete the duplicate image
        duplicated_images.append(filename)
        os.remove(os.path.join("eczema_images", filename))
    else:
        # Otherwise, add the hash value and file path to the dictionary
        image_hashes[hash_value] = os.path.join("eczema_images", filename)

In [38]:
eczema_img = [image_name for image_name in os.listdir('eczema_images/')] 
print('There are', len(eczema_img),'eczema images after removing duplicated images')

There are 314 eczema images after removing duplicated images


In [39]:
# Getting the indexes of the duplicated images so that they can be dropped from the eczema_df too.
duplicated_indexes = []
for img_name in duplicated_images:
    match = re.search(r'\d+', img_name)
    if match:
        number = match.group()
        duplicated_indexes.append(int(number))
duplicated_indexes[:10]

[1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156]

In [40]:
# Dropping duplicated images from the dataframe.
eczema_df = eczema_df.copy()
eczema_df.drop(duplicated_indexes, axis=0, inplace=True)
eczema_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 314 entries, 1058 to 5964
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  314 non-null    object
 1   images              314 non-null    object
dtypes: object(2)
memory usage: 7.4+ KB
