## Data cleaning

In [1]:
# import required libraries
#pip install imagehash
import pandas as pd
import shutil
import os
from PIL import Image
import imagehash
import re

In [2]:
# load and preview dataset
image_df = pd.read_csv('Data/data1-294.csv')
print(image_df.shape)
image_df.head()

(13992, 2)


Unnamed: 0,skin_disorder_name,images
0,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
1,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
2,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
3,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
4,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...


## **Cleaning  acne images**

In [3]:
# Labels representing acne in DermNet's scrapped data
acne_labels = image_df[image_df['skin_disorder_name'].str.contains('acne')]['skin_disorder_name'].unique()
acne_labels

array(['acne affecting the back images', 'acne affecting the face images',
       'acne and other follicular disorder images', 'facial acne images',
       'infantile acne images', 'steroid acne images'], dtype=object)

In [4]:
# There are six labels representing acne
len(acne_labels)

6

In [5]:
# Creating a dataframe with just acne labels for easier cleaning

acne_df = image_df[(image_df['skin_disorder_name'] == acne_labels[0]) | \
                   (image_df['skin_disorder_name'] == acne_labels[1]) | \
                   (image_df['skin_disorder_name'] == acne_labels[2]) | \
                   (image_df['skin_disorder_name'] == acne_labels[3]) | \
                   (image_df['skin_disorder_name'] == acne_labels[4]) | \
                   (image_df['skin_disorder_name'] == acne_labels[5]) 
                 ]
acne_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 718 entries, 0 to 12648
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  718 non-null    object
 1   images              718 non-null    object
dtypes: object(2)
memory usage: 16.8+ KB


### **Extra acne images**

In [6]:
extra_acne = [image_name for image_name in os.listdir('extra_images/extra_acne_images')]
extra_acne[:10]

['07Acne081101.jpg',
 '07Acne0811011 - Copy.jpg',
 '07Acne0811011.jpg',
 '07AcnePittedScars.jpg',
 '07AcnePittedScars1 - Copy.jpg',
 '07AcnePittedScars1.jpg',
 '07RosaceaFulFAce.jpg',
 '07RosaceaFulFAce1 - Copy.jpg',
 '07RosaceaFulFAce1.jpg',
 '07RosaceaOK0828063.jpg']

In [7]:
#Creating a dataframe for the extra acne images

label =['acne' for img in extra_acne]
extra_acne_df = pd.DataFrame(extra_acne, label).reset_index()
extra_acne_df.columns =['skin_disorder_name', 'images']
extra_acne_df.head()

Unnamed: 0,skin_disorder_name,images
0,acne,07Acne081101.jpg
1,acne,07Acne0811011 - Copy.jpg
2,acne,07Acne0811011.jpg
3,acne,07AcnePittedScars.jpg
4,acne,07AcnePittedScars1 - Copy.jpg


**i. Moving acne images in the Images folder to their own folder**

In [8]:
# Getting the acne images file names
original_acne_img = [image_name for image_name in os.listdir('Images/') if 'acne' in image_name] 

# Confirming the number of acne images before any cleaning
print('There are', len(original_acne_img),'acne images')
original_acne_img[:10]

There are 718 acne images


['acne affecting the back images0.jpg',
 'acne affecting the back images1.jpg',
 'acne affecting the back images10.jpg',
 'acne affecting the back images11.jpg',
 'acne affecting the back images12.jpg',
 'acne affecting the back images13.jpg',
 'acne affecting the back images14.jpg',
 'acne affecting the back images15.jpg',
 'acne affecting the back images16.jpg',
 'acne affecting the back images17.jpg']

In [9]:
# Creating a new folder with just acne images to make cleaning easier
folder_name = 'cleaned_images/acne_images/'

# Checking if the folder exists and deleting it if it exists
if os.path.exists(folder_name):
    # deleting the folder and its contents
    shutil.rmtree(folder_name)

# create the new folder
os.mkdir(folder_name)

# Moving the images into that folder
for img in original_acne_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join(folder_name, img)
    shutil.copy(origin, destination)

In [10]:
# Confirming that the number of acne images after moving them to a separate folder is still 718
acne_img = [image_name for image_name in os.listdir('cleaned_images/acne_images/')] 
print('There are', len(acne_img),'acne images')

There are 718 acne images


**ii. Dropping links from the 'images' column in the acne_df and replacing them with the image name**

In [11]:
# So that the two dataframes can match, we dropped the image links in  acne_df 
# and replaced them with the image names

acne_images = pd.DataFrame(acne_img, columns=['images'])
acne_df = acne_df.copy()
acne_df.drop('images', axis=1, inplace=True)
acne_df['images'] = acne_images['images'].values
acne_df.head()

Unnamed: 0,skin_disorder_name,images
0,acne affecting the back images,acne affecting the back images0.jpg
1,acne affecting the back images,acne affecting the back images1.jpg
2,acne affecting the back images,acne affecting the back images10.jpg
3,acne affecting the back images,acne affecting the back images11.jpg
4,acne affecting the back images,acne affecting the back images12.jpg


**iii. Joining the two dataframes**

In [12]:
# Creating a dataframe with all of the acne images

acne_df_complete = pd.concat([acne_df, extra_acne_df], axis=0).reset_index()
acne_df_complete.drop('index', axis=1, inplace=True)
print(acne_df_complete.info())
acne_df_complete.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2540 entries, 0 to 2539
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  2540 non-null   object
 1   images              2540 non-null   object
dtypes: object(2)
memory usage: 39.8+ KB
None


Unnamed: 0,skin_disorder_name,images
0,acne affecting the back images,acne affecting the back images0.jpg
1,acne affecting the back images,acne affecting the back images1.jpg
2,acne affecting the back images,acne affecting the back images10.jpg
3,acne affecting the back images,acne affecting the back images11.jpg
4,acne affecting the back images,acne affecting the back images12.jpg


**iv. Combining the images into one folder**

In [13]:
# This was done by moving the extra images into the acne folder
for img in extra_acne:
    origin = os.path.join('extra_images/extra_acne_images/', img)
    destination = os.path.join('cleaned_images/acne_images/', img)
    shutil.copy(origin, destination)

In [14]:
# Confirming that the total acne images is 2540 before any cleaning

acne_img = [image_name for image_name in os.listdir('cleaned_images/acne_images/')] 
print('There are a total of', len(acne_img),'acne images')

There are a total of 2540 acne images


**v. Removing duplicated images from the folder**

In [15]:
# Define a threshold for image similarity
def drop_duplicated_images(folder):
    threshold = 8
    # Define a dictionary to store the hash values and file paths of the images
    image_hashes = {}
    duplicated_images = []
    # Loop through all the image files in a directory
    for filename in os.listdir(folder):
        # Load the image file
        image = Image.open(os.path.join(folder, filename))

         # Compute the hash value of the image using the average hash algorithm
        hash_value = imagehash.average_hash(image)

        # Check if the hash value is already in the dictionary
        if hash_value in image_hashes:
            # If a similar hash value already exists, delete the duplicate image
            duplicated_images.append(filename)
            os.remove(os.path.join(folder, filename))
        else:
             # Otherwise, add the hash value and file path to the dictionary
            image_hashes[hash_value] = os.path.join(folder, filename)
    return duplicated_images

In [16]:
# Dropping duplicates
duplicated_images = drop_duplicated_images('cleaned_images/acne_images/')
duplicated_images[:5]

['07Acne0811011.jpg',
 '07AcnePittedScars1.jpg',
 '07RosaceaFulFAce1.jpg',
 '07RosaceaOK08280631.jpg',
 '07SteroidPerioral11.jpg']

In [17]:
acne_img = [image_name for image_name in os.listdir('cleaned_images/acne_images/')] 
print('There are', len(acne_img),'acne images after removing duplicated images')

There are 2169 acne images after removing duplicated images


In [18]:
# Getting the indexes of the duplicated images so that they can be dropped from the acne_df_complete too.

duplicated_indexes = [acne_df_complete[acne_df_complete['images'] == image_name].index[0] \
                      for image_name in acne_df_complete['images']\
                      if image_name in duplicated_images]
duplicated_indexes[:10]

[369, 377, 394, 430, 448, 449, 450, 451, 452, 453]

In [19]:
# Dropping duplicated images from the dataframe.
acne_df_complete = acne_df_complete.copy()
acne_df_complete.drop(index=duplicated_indexes, inplace=True)
acne_df_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2169 entries, 0 to 2538
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  2169 non-null   object
 1   images              2169 non-null   object
dtypes: object(2)
memory usage: 50.8+ KB


Acne affecting the back images, Acne affecting the face images all have correct images. The Only change that will be made is changing the name to acne. </br>
Acne and other follicular disorder images has a collection of different images. Only images that have acne as a specific label will be included, the others will be dropped from the dataset. </br>
***
***Dealing with the collection of different images in Acne and other follicular disorder images***</br>
After careful evaluation of the images, the images that represent acne are:</br>
>>> **[299, 301, 302, 305, 306, 312, 324, 327, 331, 332, 334, 335, 336,</br>, 340, 352, 353, 356, 357, 358, 360, 365, 370, 379, 383, 386, 391, </br>, 394, 399, 400, 401, 404, 406, 407, 410, 410, 412, 414, 418, 435, </br> 439, 440, 442]**

In [20]:
# Note 📝: The indexes were confirmed to be the same even after merging the two dataframes
        #: This is because the acne_df is at the top in the complete dataframe

# indexes of the images in 'acne and other follicular disorder images'
indexes = acne_df[acne_df['skin_disorder_name'] == 'acne and other follicular disorder images'].index

# indexes of the acne images in 'acne and other follicular disorder images'
acne_indexes = [299, 301, 302, 305, 306, 312, 324, 327, 331, 332, 334, 335, 336,
                340, 352, 353, 356, 357, 358, 360, 365, 370, 379, 383, 386, 391,
                394, 399, 400, 401, 404, 406, 407, 410, 410, 412, 414, 418, 435,
                439, 440, 442]

# indexes of the other follicular disorder images in 'acne and other follicular disorder images'. This indexes will be dropped.
to_drop = []

for index in  indexes:
    if (index not in acne_indexes) and (index not in duplicated_indexes):
        to_drop.append(index)

# dropping indexes in to_drop
acne_df_complete.drop(to_drop, axis = 0, inplace=True)

In [21]:
# After dropping non-acne images, we still have 328 images left
acne_df_complete.shape

(2060, 2)

In [22]:
# dropping those images from the acne_images folder

# Finding the image file names to be dropped from the folder
img_to_drop = []

for index in to_drop:
    for img_name in original_acne_img:
        if str(index) in img_name:
            img_to_drop.append(img_name)

# Dropping those images form the acne_images folder
for filename in img_to_drop:
    os.remove(os.path.join("cleaned_images/acne_images/", filename))

# Confirming that the number of images left is 328
acne_img = [image_name for image_name in os.listdir('cleaned_images/acne_images/')] 
print('There are', len(acne_img),'acne images left.')

There are 2060 acne images left.


**vi. Changing the label to just acne**

In [23]:
acne_df_complete['skin_disorder_name'] = 'acne'
acne_df_complete.head(10)

Unnamed: 0,skin_disorder_name,images
0,acne,acne affecting the back images0.jpg
1,acne,acne affecting the back images1.jpg
2,acne,acne affecting the back images10.jpg
3,acne,acne affecting the back images11.jpg
4,acne,acne affecting the back images12.jpg
5,acne,acne affecting the back images13.jpg
6,acne,acne affecting the back images14.jpg
7,acne,acne affecting the back images15.jpg
8,acne,acne affecting the back images16.jpg
9,acne,acne affecting the back images17.jpg


## **Cleaning  Eczema images**

In [24]:
# Labels representing eczema in Dermnet's scraped data.

eczema_labels = image_df[(image_df['skin_disorder_name'].str.contains('eczema')) | \
                         (image_df['skin_disorder_name'].str.contains('atopic dermatitis images')) |\
                         (image_df['skin_disorder_name'].str.contains('hand dermatitis images')) |\
                         (image_df['skin_disorder_name'] == 'dermatitis images') |\
                         (image_df['skin_disorder_name'].str.contains('nummular dermatitis images'))] \
                         ['skin_disorder_name'].unique()
len(eczema_labels)

8

In [25]:
# Creating a dataframe with just eczema labels for easier cleaning

eczema_df = image_df[(image_df['skin_disorder_name'] == eczema_labels[0]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[1]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[2]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[3]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[4]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[5]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[6]) | \
                     (image_df['skin_disorder_name'] == eczema_labels[7]) 
                 ]
eczema_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 631 entries, 1058 to 8989
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  631 non-null    object
 1   images              631 non-null    object
dtypes: object(2)
memory usage: 14.8+ KB


##### **Extra eczema images**

In [26]:
extra_eczema1 = [image_name for image_name in os.listdir('extra_images/extra_eczema')]
extra_eczema1[:10]

['0_0.jpg',
 '0_1.jpg',
 '0_10.jpg',
 '0_11.jpg',
 '0_12.jpg',
 '0_13.jpg',
 '0_14.jpg',
 '0_15.jpg',
 '0_16.jpg',
 '0_17.jpg']

In [27]:
# The folder has a mixture of images. We will filter out the eczema images only

extra_eczema_images = [image_name for image_name in extra_eczema1\
                        if ('dermatitis' in image_name) |\
                        ('eczema' in image_name) |\
                        ('pompholyx' in image_name) |\
                        ('dyshidrotic' in image_name)]
extra_eczema_images[:10]

['t-03eczema091205.jpg',
 't-eczema-acute-1.jpg',
 't-eczema-acute-11.jpg',
 't-eczema-acute-12.jpg',
 't-eczema-acute-13.jpg',
 't-eczema-acute-14.jpg',
 't-eczema-acute-15.jpg',
 't-eczema-acute-2.jpg',
 't-eczema-acute-20.jpg',
 't-eczema-acute-21.jpg']

In [28]:
# Moving this images into their own folder called 'extra_eczema_images_clean'
folder_name = 'extra_images/extra_eczema_images_clean/'

# Checking if the folder exists and deleting it if it exists
if os.path.exists(folder_name):
    # deleting the folder and its contents
    shutil.rmtree(folder_name)

# create the new folder
os.mkdir(folder_name)

for img in extra_eczema_images:
    origin = os.path.join('extra_images/extra_eczema/', img)
    destination = os.path.join(folder_name, img)
    shutil.copy(origin, destination)

In [29]:
#Creating a dataframe for the extra eczema images

label =['eczema' for img in extra_eczema_images]
extra_eczema_df = pd.DataFrame(extra_eczema_images, label).reset_index()
extra_eczema_df.columns =['skin_disorder_name', 'images']
print(extra_eczema_df.shape)
extra_eczema_df.head()

(1059, 2)


Unnamed: 0,skin_disorder_name,images
0,eczema,t-03eczema091205.jpg
1,eczema,t-eczema-acute-1.jpg
2,eczema,t-eczema-acute-11.jpg
3,eczema,t-eczema-acute-12.jpg
4,eczema,t-eczema-acute-13.jpg


**i. Moving eczema images in the Image folder to their own folder**

In [30]:
# Getting the eczema images file names
eczema_img = [image_name for image_name in os.listdir('Images/') if ('eczema' in image_name) |
                                                                    ('atopic dermatitis images' in image_name) |
                                                                    ('hand dermatitis images' in image_name) | 
                                                                    (image_name.startswith('dermatitis images'))|
                                                                    ('nummular dermatitis images' in image_name)
                                                                     ] 

# Confirming the number of eczema images before any cleaning
print('There are', len(eczema_img),'eczema images')
eczema_img[:5]

There are 631 eczema images


['atopic dermatitis images1058.jpg',
 'atopic dermatitis images1059.jpg',
 'atopic dermatitis images1060.jpg',
 'atopic dermatitis images1061.jpg',
 'atopic dermatitis images1062.jpg']

In [31]:
# Creating a new folder with just eczema images to make cleaning easier
folder_name = 'cleaned_images/eczema_images/'

# Checking if the folder exists and deleting it if it exists
if os.path.exists(folder_name):
    # deleting the folder and its contents
    shutil.rmtree(folder_name)

# create the new folder
os.mkdir(folder_name)

# Moving the images into that folder
for img in eczema_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join(folder_name, img)
    shutil.copy(origin, destination)

In [32]:
# Confirming that the number of eczema images after moving them to a separate folder is still 631
eczema_img = [image_name for image_name in os.listdir('cleaned_images/eczema_images/')] 
print('There are', len(eczema_img),' eczema images')

There are 631  eczema images


**ii. Dropping links from the 'images' column in the eczema_df and replacing them with the image name**

In [33]:
# So that the two dataframes can match, we dropped the image links in  eczema_df 
# and replace them with the image names

eczema_images = pd.DataFrame(eczema_img, columns=['images'])
eczema_df = eczema_df.copy()
eczema_df.drop('images', axis=1, inplace=True)
eczema_df['images'] = eczema_images['images'].values
eczema_df.head()

Unnamed: 0,skin_disorder_name,images
1058,atopic dermatitis images,atopic dermatitis images1058.jpg
1059,atopic dermatitis images,atopic dermatitis images1059.jpg
1060,atopic dermatitis images,atopic dermatitis images1060.jpg
1061,atopic dermatitis images,atopic dermatitis images1061.jpg
1062,atopic dermatitis images,atopic dermatitis images1062.jpg


**iii. Joining the two dataframes**

In [34]:
# Creating a dataframe with all of the eczema images

eczema_df_complete = pd.concat([eczema_df, extra_eczema_df], axis=0).reset_index()
eczema_df_complete.drop('index', axis=1, inplace=True)
print(eczema_df_complete.info())
eczema_df_complete.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1690 entries, 0 to 1689
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  1690 non-null   object
 1   images              1690 non-null   object
dtypes: object(2)
memory usage: 26.5+ KB
None


Unnamed: 0,skin_disorder_name,images
0,atopic dermatitis images,atopic dermatitis images1058.jpg
1,atopic dermatitis images,atopic dermatitis images1059.jpg
2,atopic dermatitis images,atopic dermatitis images1060.jpg
3,atopic dermatitis images,atopic dermatitis images1061.jpg
4,atopic dermatitis images,atopic dermatitis images1062.jpg


**iv. Combining the images into one folder**

In [35]:
# This was done by moving the extra images into the eczema folder
for img in extra_eczema_images:
    origin = os.path.join('extra_images/extra_eczema_images_clean/', img)
    destination = os.path.join('cleaned_images/eczema_images/', img)
    shutil.copy(origin, destination)

In [36]:
# Confirming that the total acne images is 1690 before any cleaning

eczema_img = [image_name for image_name in os.listdir('cleaned_images/eczema_images/')] 
print('There are a total of', len(eczema_img),'eczema images')

There are a total of 1690 eczema images


**v. Removing duplicated images from the folder**

In [37]:
# Using a function created earlier to drop duplicates

duplicated_images = drop_duplicated_images('cleaned_images/eczema_images/')
duplicated_images[:5]

['atopic eczema images1147.jpg',
 'atopic eczema images1148.jpg',
 'atopic eczema images1149.jpg',
 'atopic eczema images1150.jpg',
 'atopic eczema images1151.jpg']

In [38]:
# Confirming the number of images after dropping duplicates

eczema_img = [image_name for image_name in os.listdir('cleaned_images/eczema_images/')] 
print('There are', len(eczema_img),'eczema images after removing duplicated images')

There are 1298 eczema images after removing duplicated images


In [39]:
# Getting the indexes of the duplicated images so that they can be dropped from the eczema_df_complete too.

duplicated_indexes = [eczema_df_complete[eczema_df_complete['images'] == image_name].index[0] \
                      for image_name in eczema_df_complete['images']\
                      if image_name in duplicated_images]
duplicated_indexes[:10]

[89, 90, 91, 92, 93, 94, 95, 96, 97, 98]

In [40]:
# Dropping duplicated images from the dataframe.
eczema_df_complete = eczema_df_complete .copy()
eczema_df_complete .drop(duplicated_indexes, axis=0, inplace=True)
eczema_df_complete .info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1298 entries, 0 to 1689
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   skin_disorder_name  1298 non-null   object
 1   images              1298 non-null   object
dtypes: object(2)
memory usage: 30.4+ KB


**vi. Changing the label to just eczema**

In [42]:
eczema_df_complete['skin_disorder_name'] = 'eczema'
eczema_df_complete.head(10)

Unnamed: 0,skin_disorder_name,images
0,eczema,atopic dermatitis images1058.jpg
1,eczema,atopic dermatitis images1059.jpg
2,eczema,atopic dermatitis images1060.jpg
3,eczema,atopic dermatitis images1061.jpg
4,eczema,atopic dermatitis images1062.jpg
5,eczema,atopic dermatitis images1063.jpg
6,eczema,atopic dermatitis images1064.jpg
7,eczema,atopic dermatitis images1065.jpg
8,eczema,atopic dermatitis images1066.jpg
9,eczema,atopic dermatitis images1067.jpg


### Basal cell carcinoma and basal cell epithelioma

**Meaning**<br>
Basal cell epithelioma is a benign tumor that arises from the basal cells in the skin. It is usually slow-growing and rarely spreads to other parts of the body. <br>
On the other hand, basal cell carcinoma (BCC) is a malignant tumor that also arises from the basal cells in the skin. It is the most common type of skin cancer. Malignant tumors are more dangerous than benign tumors because they can grow quickly and invade and destroy surrounding tissues, leading to significant damage to the body's normal functions. <br>

**Causes**<br>
Both basal cell epithelioma and basal cell carcinoma are primarily caused by exposure to ultraviolet (UV) radiation from the sun or tanning beds. Prolonged exposure to UV radiation damages the DNA in the skin cells, leading to mutations that can cause the cells to grow and divide uncontrollably, eventually forming a tumor. Other factors that may increase the risk of developing these skin cancers include having fair skin, a history of sunburns or intense sun exposure, a weakened immune system, a family history of skin cancer, and certain genetic conditions.

**Symptoms**<br>
Basal cell epithelioma and basal cell carcinoma can have similar symptoms, but there are some differences. Both conditions typically present as raised, pearly, or translucent bumps or lesions on the skin that may be pink, red, or white in color. These lesions can sometimes ulcerate or bleed, and may develop a crust or scab. However, basal cell carcinoma can sometimes appear as a flat, scaly, or pigmented patch on the skin, which is less common in basal cell epithelioma.

**Treatment**<br>
Basal cell epithelioma and basal cell carcinoma are treated similarly, but the treatment plan depends on the size, location, and extent of the tumor. Surgical removal of the tumor is the primary treatment, and there are different techniques available, such as excision, curettage and electrodesiccation, Mohs surgery, or radiation therapy. For small tumors, excision may be sufficient, while Mohs surgery is recommended for larger or more advanced tumors to ensure complete removal. Radiation therapy may be used as an alternative to surgery for some cases. Systemic chemotherapy or immunotherapy is rarely used for advanced basal cell carcinoma that has spread to other parts of the body.

#### i. Cleaning Basal Cell Carcinoma

In [None]:
# image labels that contain the string basal and bcc
print(image_df[image_df['skin_disorder_name'].str.contains('basal')]['skin_disorder_name'].unique())
print('****************************************')
print(image_df[image_df['skin_disorder_name'].str.contains('bcc')]['skin_disorder_name'].unique())

['basal cell carcinoma affecting the ear images'
 'basal cell carcinoma affecting the eyelid images'
 'basal cell carcinoma affecting the face images'
 'basal cell carcinoma affecting the nose images'
 'basal cell carcinoma affecting the trunk images'
 'basal cell epithelioma affecting the ear images'
 'basal cell epithelioma affecting the eyelid images'
 'basal cell epithelioma affecting the face images'
 'basal cell epithelioma affecting the nose images'
 'basal cell epithelioma affecting the trunk images'
 'basalioma affecting the ear images'
 'basalioma affecting the eyelid images'
 'basalioma affecting the face images'
 'basalioma affecting the nose images'
 'basalioma affecting the trunk images'
 'nodular basal cell carcinoma images']
****************************************
['bcc affecting the ear images' 'bcc affecting the eyelid images'
 'bcc affecting the face images' 'bcc affecting the nose images'
 'bcc affecting the trunk images' 'nodular bcc images']


In [None]:
# dataframe with bcc(basal cell carcinoma) labels
bcc_df = image_df[(image_df['skin_disorder_name'].str.contains('basal cell carcinoma')) | \
                  (image_df['skin_disorder_name'].str.contains('bcc'))]
print(bcc_df.shape)
bcc_df.head()

(442, 2)


Unnamed: 0,skin_disorder_name,images
1450,bcc affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
1451,bcc affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
1452,bcc affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
1453,bcc affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
1454,bcc affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...


In [None]:
# Getting the bcc images file names
bcc_img = [image_name for image_name in os.listdir('Images/') if ('basal cell carcinoma' in image_name) | ('bcc' in image_name)] 

# Creating a new folder with just bcc images to make cleaning easier
os.mkdir('bcc_images/')
for img in bcc_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join('bcc_images/', img)
    shutil.copy(origin, destination)
    
# Confirming that the number of bcc images after moving them to a separate folder is still 442
bcc_img = [image_name for image_name in os.listdir('bcc_images/')] 
print('There are', len(bcc_img),'bcc images')

There are 442 bcc images


Remove duplicate images

In [None]:
# function to remove duplicate images
def drop_duplicates(disease_images, disease_df):
    # Define a threshold for image similarity
    threshold = 8

    # Define a dictionary to store the hash values and file paths of the images
    image_hashes = {}
    duplicated_images = []
    # Loop through all the image files in a directory
    for filename in os.listdir(disease_images):
        # Load the image file
        image = Image.open(os.path.join(disease_images, filename))

        # Compute the hash value of the image using the average hash algorithm
        hash_value = imagehash.average_hash(image)

        # Check if the hash value is already in the dictionary
        if hash_value in image_hashes:
            # If a similar hash value already exists, delete the duplicate image
            duplicated_images.append(filename)
            os.remove(os.path.join(disease_images, filename))
            print(f"Duplicate image deleted: {filename}")
        else:
            # Otherwise, add the hash value and file path to the dictionary
            image_hashes[hash_value] = os.path.join(disease_images, filename)

    # Getting the indexes of the duplicated images so that they can be dropped from the disease_df too.
    duplicated_indexes = []
    for img_name in duplicated_images:
        match = re.search(r'\d+', img_name)
        if match:
            number = match.group()
            duplicated_indexes.append(int(number))

    # Dropping duplicated images from the dataframe.
    disease_df = disease_df.drop(duplicated_indexes, axis=0)
    
    return disease_df

In [None]:
# drop duplicates from image folder and dataframe
bcc_df = drop_duplicates('bcc_images', bcc_df)

Duplicate image deleted: basal cell carcinoma affecting the face images2003.jpg
Duplicate image deleted: bcc affecting the ear images1450.jpg
Duplicate image deleted: bcc affecting the ear images1451.jpg
Duplicate image deleted: bcc affecting the ear images1452.jpg
Duplicate image deleted: bcc affecting the ear images1453.jpg
Duplicate image deleted: bcc affecting the ear images1454.jpg
Duplicate image deleted: bcc affecting the ear images1455.jpg
Duplicate image deleted: bcc affecting the eyelid images1456.jpg
Duplicate image deleted: bcc affecting the eyelid images1457.jpg
Duplicate image deleted: bcc affecting the eyelid images1458.jpg
Duplicate image deleted: bcc affecting the eyelid images1459.jpg
Duplicate image deleted: bcc affecting the eyelid images1460.jpg
Duplicate image deleted: bcc affecting the eyelid images1461.jpg
Duplicate image deleted: bcc affecting the eyelid images1462.jpg
Duplicate image deleted: bcc affecting the eyelid images1463.jpg
Duplicate image deleted: bcc

Duplicate image deleted: bcc affecting the trunk images1582.jpg
Duplicate image deleted: bcc affecting the trunk images1583.jpg
Duplicate image deleted: bcc affecting the trunk images1584.jpg
Duplicate image deleted: bcc affecting the trunk images1585.jpg
Duplicate image deleted: bcc affecting the trunk images1586.jpg
Duplicate image deleted: bcc affecting the trunk images1587.jpg
Duplicate image deleted: bcc affecting the trunk images1588.jpg
Duplicate image deleted: bcc affecting the trunk images1589.jpg
Duplicate image deleted: bcc affecting the trunk images1590.jpg
Duplicate image deleted: bcc affecting the trunk images1591.jpg
Duplicate image deleted: bcc affecting the trunk images1592.jpg
Duplicate image deleted: bcc affecting the trunk images1593.jpg
Duplicate image deleted: bcc affecting the trunk images1594.jpg
Duplicate image deleted: bcc affecting the trunk images1595.jpg
Duplicate image deleted: bcc affecting the trunk images1596.jpg
Duplicate image deleted: bcc affecting t

In [None]:
# number of images and shape of dataframe after removing duplicates
bcc_img = [image_name for image_name in os.listdir('bcc_images/')] 
print('Number of bcc images after removing duplicated images:', len(bcc_img))
print(f'Shape of bcc dataframe: {bcc_df.shape}')

Number of bcc images after removing duplicated images: 217
Shape of bcc dataframe: (217, 2)


In [None]:
# rename the values of skin_disorder_name column to basal cell carcinoma
bcc_df['skin_disorder_name'] = bcc_df['skin_disorder_name'].replace(bcc_df['skin_disorder_name'].unique(), 'basal cell carcinoma')
bcc_df.head(4)

Unnamed: 0,skin_disorder_name,images
1927,basal cell carcinoma,https://dermnetnz.org/assets/Uploads/lesions/b...
1928,basal cell carcinoma,https://dermnetnz.org/assets/Uploads/lesions/b...
1929,basal cell carcinoma,https://dermnetnz.org/assets/Uploads/lesions/b...
1930,basal cell carcinoma,https://dermnetnz.org/assets/Uploads/lesions/b...


#### ii.Cleaning Basal Cell Epithelioma

In [None]:
# dataframe with bce(basal cell epithelioma) labels
bce_df = image_df[image_df['skin_disorder_name'].str.contains('basal cell epithelioma')] 
print(bce_df.shape)
bce_df.head()

(161, 2)


Unnamed: 0,skin_disorder_name,images
2088,basal cell epithelioma affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
2089,basal cell epithelioma affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
2090,basal cell epithelioma affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
2091,basal cell epithelioma affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...
2092,basal cell epithelioma affecting the ear images,https://dermnetnz.org/assets/Uploads/lesions/b...


In [None]:
# Getting the bce images file names
bce_img = [image_name for image_name in os.listdir('Images/') if 'basal cell epithelioma' in image_name] 

# Creating a new folder with just bce images to make cleaning easier
os.mkdir('bce_images/')
for img in bce_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join('bce_images/', img)
    shutil.copy(origin, destination)
    
# Confirming that the number of bce images after moving them to a separate folder is still 161
bce_img = [image_name for image_name in os.listdir('bce_images/')] 
print('There are', len(bce_img),'bce images')

There are 161 bce images


In [None]:
# drop duplicates from image folder and dataframe
bce_df= drop_duplicates('bce_images', bce_df)

Duplicate image deleted: basal cell epithelioma affecting the face images2164.jpg


In [None]:
# number of images and shape of dataframe after removing duplicates
bce_img = [image_name for image_name in os.listdir('bce_images/')] 
print('Number of bce images after removing duplicated images:', len(bce_img))
print(f'Shape of bce dataframe: {bce_df.shape}')

Number of bce images after removing duplicated images: 160
Shape of bce dataframe: (160, 2)


In [None]:
# rename the values of skin_disorder_name column to basal cell epithelioma
bce_df['skin_disorder_name'] = bce_df['skin_disorder_name'].replace(bce_df['skin_disorder_name'].unique(), 'basal cell epithelioma')
bce_df.head(4)

Unnamed: 0,skin_disorder_name,images
2088,basal cell epithelioma,https://dermnetnz.org/assets/Uploads/lesions/b...
2089,basal cell epithelioma,https://dermnetnz.org/assets/Uploads/lesions/b...
2090,basal cell epithelioma,https://dermnetnz.org/assets/Uploads/lesions/b...
2091,basal cell epithelioma,https://dermnetnz.org/assets/Uploads/lesions/b...


### Actinic keratosis

**Meaning** <br>
Actinic keratosis(AK) is a skin condition that is caused by long-term exposure to UV rays, resulting in the formation of rough, scaly patches on the skin. It is considered a precancerous condition because it has the potential to develop into squamous cell carcinoma, a type of skin cancer

**Causes** <br>
The primary cause of actinic keratosis is long-term exposure to UV rays from the sun or other sources such as tanning beds. People with fair skin, light-colored hair, and light-colored eyes are at a higher risk of developing AK. Other risk factors include a history of frequent sunburns, a weakened immune system, and exposure to chemicals such as coal tar or arsenic.

**Symptoms** <br>
The most common symptom of actinic keratosis is the formation of rough, scaly patches or lesions on the skin. These patches can be pink, red, or brown in color and may feel like sandpaper. They are usually found on areas of the skin that are frequently exposed to the sun, such as the face, scalp, ears, neck, hands, and arms. In some cases, the patches may itch or burn, and they may become inflamed or bleed if they are scratched or rubbed.

**Treatment** <br>
The treatment of actinic keratosis depends on the severity of the condition. Mild cases may be treated with topical creams or gels that contain medications such as imiquimod, fluorouracil, or diclofenac. These medications work by stimulating the immune system or causing the abnormal cells to die off. In more severe cases, cryotherapy (freezing the lesions with liquid nitrogen) or curettage (scraping off the lesions with a special tool) may be necessary. In rare cases where the lesions have developed into skin cancer, surgical removal may be required. It is also important to take steps to prevent further damage to the skin, such as wearing protective clothing and sunscreen, avoiding tanning beds, and staying out of the sun during peak hours.


In [None]:
# image labels with the name keratosis
print(image_df[image_df['skin_disorder_name'].str.contains('keratosis')]['skin_disorder_name'].unique())

['actinic keratosis affecting the face images'
 'actinic keratosis affecting the hand images'
 'actinic keratosis affecting the legs and feet images'
 'actinic keratosis affecting the scalp images'
 'actinic keratosis dermoscopy images'
 'actinic keratosis on the nose images'
 'actinic keratosis treated with imiquimod images'
 'granular parakeratosis images' 'keratosis pilaris images'
 'seborrhoeic keratosis dermoscopy images' 'seborrhoeic keratosis images'
 'solar keratosis affecting the face images'
 'solar keratosis affecting the hand images'
 'solar keratosis affecting the legs and feet images'
 'solar keratosis affecting the scalp images'
 'solar keratosis on the nose images'
 'solar keratosis treated with imiquimod images']


Actinic keratosis is also known as solar keratosis or senile keratosis

In [None]:
# dataframe with actinic keratosis and solar keratosis labels
keratosis_df = image_df[(image_df['skin_disorder_name'].str.contains('actinic keratosis')) | \
                  (image_df['skin_disorder_name'].str.contains('solar keratosis'))]
print(keratosis_df.shape)
keratosis_df.head()

(427, 2)


Unnamed: 0,skin_disorder_name,images
504,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
505,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
506,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
507,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
508,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...


In [None]:
# Getting the keratosis images file names
keratosis_img = [image_name for image_name in os.listdir('Images/') if ('actinic keratosis' in image_name) | ('solar keratosis' in image_name)] 

# Creating a new folder with just keratosis images to make cleaning easier
os.mkdir('keratosis_images/')
for img in keratosis_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join('keratosis_images/', img)
    shutil.copy(origin, destination)
    
# Confirming that the number of keratosis images after moving them to a separate folder is still 161
keratosis_img = [image_name for image_name in os.listdir('keratosis_images/')] 
print('There are', len(keratosis_img),'keratosis images')

There are 427 keratosis images


In [None]:
# drop duplicates from image folder and dataframe
keratosis_df= drop_duplicates('keratosis_images', keratosis_df)

Duplicate image deleted: solar keratosis affecting the face images12220.jpg
Duplicate image deleted: solar keratosis affecting the face images12221.jpg
Duplicate image deleted: solar keratosis affecting the face images12222.jpg
Duplicate image deleted: solar keratosis affecting the face images12223.jpg
Duplicate image deleted: solar keratosis affecting the face images12224.jpg
Duplicate image deleted: solar keratosis affecting the face images12225.jpg
Duplicate image deleted: solar keratosis affecting the face images12226.jpg
Duplicate image deleted: solar keratosis affecting the face images12227.jpg
Duplicate image deleted: solar keratosis affecting the face images12228.jpg
Duplicate image deleted: solar keratosis affecting the face images12229.jpg
Duplicate image deleted: solar keratosis affecting the face images12230.jpg
Duplicate image deleted: solar keratosis affecting the face images12231.jpg
Duplicate image deleted: solar keratosis affecting the face images12232.jpg
Duplicate im

Duplicate image deleted: solar keratosis affecting the legs and feet images12330.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12331.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12332.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12333.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12334.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12335.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12336.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12337.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12338.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12339.jpg
Duplicate image deleted: solar keratosis affecting the legs and feet images12340.jpg
Duplicate image deleted: solar keratosis affecting the legs and f

In [None]:
# number of images and shape of dataframe after removing duplicates
keratosis_img = [image_name for image_name in os.listdir('keratosis_images/')] 
print('Number of keratosis images after removing duplicated images:', len(keratosis_img))
print(f'Shape of keratosis dataframe: {keratosis_df.shape}')

Number of keratosis images after removing duplicated images: 224
Shape of keratosis dataframe: (224, 2)


In [None]:
# rename the values of skin_disorder_name column to actinic keratosis
keratosis_df['skin_disorder_name'] = keratosis_df['skin_disorder_name'].replace(keratosis_df['skin_disorder_name'].unique(), 'actinic keratosis')
keratosis_df.head(4)

Unnamed: 0,skin_disorder_name,images
504,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...
505,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...
506,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...
507,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...
