# Data cleaning

**import required libraries**

In [1]:
# import required libraries
import pandas as pd
import shutil
import os
from PIL import Image
import imagehash 
import re

**Loading the scraped data from DermNet.**

In [2]:
# Load the DermNet records (one row per image: disorder label + image link)
# and preview the table size and first rows
image_df = pd.read_csv('Data/data1-294.csv')
print(image_df.shape)
image_df.head()

(13992, 2)


Unnamed: 0,skin_disorder_name,images
0,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
1,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
2,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
3,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...
4,acne affecting the back images,https://dermnetnz.org/assets/Uploads/acne/acne...


## **<u>Acne</u>**

**Meaning**<br>
Acne is a common skin condition that occurs when hair follicles become clogged with oil and dead skin cells. This leads to the formation of pimples, blackheads, whiteheads, and sometimes deeper cysts. Acne usually appears on the face, neck, chest, back, and shoulders, and can affect people of all ages, although it is most common during puberty.<br>

**Causes**<br>
The causes of acne are multifactorial and can include hormonal imbalances, genetics, stress, certain medications, and an overproduction of sebum, the oily substance that lubricates the skin. Certain factors such as diet and hygiene practices have also been implicated in the development of acne, although the evidence for these is less clear.<br>

**Symptoms**<br>
The symptoms of acne can vary depending on the severity of the condition. Mild acne may only present with a few blackheads or whiteheads, while moderate acne can involve a combination of pimples, blackheads, and whiteheads. Severe acne may include deep, painful cysts that can lead to scarring. Acne can also have a significant impact on a person's self-esteem and mental health, particularly if it is severe or persistent.

**Treatment**<br>
Treatment options for acne depend on the severity of the condition. Mild acne can often be managed with over-the-counter topical treatments that contain benzoyl peroxide or salicylic acid. These products work by reducing the amount of oil on the skin and unclogging pores. More severe acne may require prescription medications, such as topical retinoids or oral antibiotics, which can help to reduce inflammation and kill the bacteria that cause acne. In cases of severe, persistent acne, isotretinoin, a powerful oral medication, may be prescribed. Additionally, lifestyle modifications such as maintaining good hygiene practices, avoiding certain foods, and managing stress can also be helpful in managing acne.


### **Cleaning Acne images**


**Creating a dataframe with acne images from the data scraped from DermNet**

In [None]:
# Every distinct label in DermNet's scraped data that mentions acne
label_column = image_df['skin_disorder_name']
acne_labels = label_column[label_column.str.contains('acne')].unique().tolist()

# Discard the labels whose images are not clear enough to be used
for unclear_label in ('infantile acne images', 'steroid acne images'):
    acne_labels.remove(unclear_label)

acne_labels

In [None]:
# Number of acne labels left after excluding the unclear ones
# (six in the original run)
len(acne_labels)

In [None]:
# Creating a dataframe with just acne labels for easier cleaning.
# Only the first four labels are kept; isin() selects them in one pass
# instead of OR-ing four separate equality masks.
acne_df = image_df[image_df['skin_disorder_name'].isin(acne_labels[:4])]
acne_df.info()

### **Extra acne images**

In [None]:
# File names of the additional acne images collected outside DermNet
extra_acne = list(os.listdir('extra_images/extra_acne_images'))
extra_acne[:5]

In [None]:
# Creating a dataframe for the extra acne images: one row per file, all
# labelled 'acne'. Building it from a dict names both columns up front and
# still yields a valid (empty) two-column frame when the folder has no files.
label = ['acne' for img in extra_acne]
extra_acne_df = pd.DataFrame({'skin_disorder_name': label, 'images': extra_acne})
extra_acne_df.head()

**i. Moving acne images in the Images folder to their own folder**

In [None]:
# File names in 'Images/' that belong to one of the four acne labels
acne_name_parts = (
    'acne affecting the back images',
    'acne affecting the face images',
    'acne and other follicular disorder images',
    'facial acne images',
)
original_acne_img = [
    image_name
    for image_name in os.listdir('Images/')
    if any(part in image_name for part in acne_name_parts)
]

# How many acne images there are before any cleaning
print('There are', len(original_acne_img), 'acne images')
original_acne_img[:5]

In [None]:
# Creating a new folder with just acne images to make cleaning easier
folder_name = 'cleaned_images/acne_images/'

# Note📝: For reproducibility, start from an empty folder. Without this step a
#          rerun would fail on the existing directory or keep stale files.
if os.path.exists(folder_name):
    # deleting the folder and its contents
    shutil.rmtree(folder_name)

# makedirs also creates the parent 'cleaned_images/' when it is missing,
# where os.mkdir would raise FileNotFoundError
os.makedirs(folder_name)

# Copying the images into that folder (the originals stay in 'Images/')
for img in original_acne_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join(folder_name, img)
    shutil.copy(origin, destination)

In [None]:
# Re-list the new folder to check that no image was lost in the copy
# (702 images expected)
acne_img = list(os.listdir('cleaned_images/acne_images/'))
print('There are', len(acne_img), 'acne images.')

**ii. Dropping links from the 'images' column in the acne_df and replacing them with the image name**

In [None]:
# So that acne_df matches the extra-images dataframe, the image links in
# acne_df are dropped and replaced with image file names.
# NOTE(review): acne_img comes from os.listdir, and the names are assigned to
# rows purely by position, so a row is not guaranteed to receive the file that
# was downloaded for it — confirm, or rebuild each name from the row's label
# and index. The assignment also requires exactly as many files as rows.

acne_images = pd.DataFrame(acne_img, columns=['images'])
acne_df = acne_df.copy()
acne_df.drop('images', axis=1, inplace=True)
acne_df['images'] = acne_images['images'].values
acne_df.head()

**iii. Joining the two dataframes**

In [None]:
# Creating a dataframe with all of the acne images.
# ignore_index=True renumbers the rows 0..n-1 directly, so no helper 'index'
# column has to be created and dropped again.
acne_df_complete = pd.concat([acne_df, extra_acne_df], axis=0, ignore_index=True)

# info() prints its own report and returns None, so it is not wrapped in print()
acne_df_complete.info()
acne_df_complete.head()

**iv. Combining the images into one folder**

In [None]:
# Copy every extra acne image next to the DermNet ones in the acne folder
extra_source = 'extra_images/extra_acne_images/'
acne_target = 'cleaned_images/acne_images/'
for img in extra_acne:
    shutil.copy(os.path.join(extra_source, img), os.path.join(acne_target, img))

In [None]:
# Total number of acne images before any cleaning (1427 expected)
acne_img = list(os.listdir('cleaned_images/acne_images/'))
print('There are a total of', len(acne_img), 'acne images.')

**v. Removing duplicated images from the folder**

In [3]:
# Function for removing duplicated images.
def drop_duplicated_images(folder):
    """Delete images in ``folder`` whose average hash repeats an earlier image.

    Every entry of ``folder`` is opened with PIL and hashed with
    ``imagehash.average_hash``. Two images count as duplicates only when
    their hashes are identical; no similarity threshold is applied. Files are
    visited in sorted name order, so the copy that survives is the same on
    every run.

    Parameters
    ----------
    folder : str
        Path of the directory holding the images.

    Returns
    -------
    list of str
        File names (not full paths) that were deleted, in the order visited.
    """
    seen_hashes = set()
    duplicated_images = []

    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)

        # Close the image before a possible delete: a handle that is still
        # open prevents os.remove from succeeding on Windows.
        with Image.open(path) as image:
            hash_value = imagehash.average_hash(image)

        if hash_value in seen_hashes:
            # Same hash as an image already kept: delete this copy
            duplicated_images.append(filename)
            os.remove(path)
        else:
            seen_hashes.add(hash_value)

    return duplicated_images

In [None]:
# Delete duplicate files from the acne folder and keep the deleted file names,
# which are needed to drop the matching dataframe rows afterwards
duplicated_images = drop_duplicated_images('cleaned_images/acne_images/')
duplicated_images[:5]

In [None]:
# Count what is left in the acne folder once the duplicates are gone
acne_img = list(os.listdir('cleaned_images/acne_images/'))
print('There are', len(acne_img), 'acne images after removing duplicated images')

In [None]:
# Getting the indexes of the duplicated images so that they can be dropped
# from acne_df_complete too. One vectorised isin() pass returns the label of
# every matching row, including rows that share a file name, instead of
# re-scanning the whole dataframe for each row.
duplicated_indexes = acne_df_complete.index[
    acne_df_complete['images'].isin(duplicated_images)
].tolist()
duplicated_indexes[:10]

In [None]:
# Remove the rows of the deleted duplicate images; drop() hands back a new
# dataframe, so the original is not modified in place
acne_df_complete = acne_df_complete.drop(index=duplicated_indexes)
acne_df_complete.info()

"Acne affecting the back images" and "Acne affecting the face images" both contain correct images. The only change that will be made is renaming their label to acne. <br>
"Acne and other follicular disorder images" holds a collection of different conditions. Only the images that are specifically labelled as acne will be kept; the others will be dropped from the dataset. <br>
***
**Dealing with the collection of different images in Acne and other follicular disorder images**<br>

After careful evaluation of the images, the images that represent acne are:<br>
>>> **[299, 301, 302, 305, 306, 312, 324, 327, 331, 332, 334, 335, 336,<br> 340, 352, 353, 356, 357, 358, 360, 365, 370, 379, 383, 386, 391,<br> 394, 399, 400, 401, 404, 406, 407, 410, 410, 412, 414, 418, 435,<br> 439, 440, 442]**

In [None]:
# Note 📝: acne_df sits at the top of acne_df_complete, and its row labels were
#          confirmed to be unchanged by the merge, so they can be reused here.

# row labels of everything filed under 'acne and other follicular disorder images'
indexes = acne_df.index[acne_df['skin_disorder_name'] == 'acne and other follicular disorder images']

# the rows of that group that were manually confirmed to show acne
acne_indexes = [299, 301, 302, 305, 306, 312, 324, 327, 331, 332, 334, 335, 336,
                340, 352, 353, 356, 357, 358, 360, 365, 370, 379, 383, 386, 391,
                394, 399, 400, 401, 404, 406, 407, 410, 410, 412, 414, 418, 435,
                439, 440, 442]

# Everything else in the group is another follicular disorder and gets dropped.
# Rows already removed as duplicates are left out so that drop() is not asked
# for labels that no longer exist.
to_drop = [index for index in indexes
           if not (index in acne_indexes or index in duplicated_indexes)]

# dropping the rows listed in to_drop
acne_df_complete.drop(index=to_drop, inplace=True)

In [None]:
# Rows remaining after dropping the non-acne images (1000 in the original run)
acne_df_complete.shape

In [None]:
# dropping those images from the acne_images folder

# Finding the image file names to be dropped from the folder.
# Whole digit runs in the file name are compared with the row indexes: a plain
# substring test would let index 300 also match '...3002.jpg' or '...1300.jpg'
# and delete unrelated images. Each file is listed at most once.
ids_to_drop = {str(index) for index in to_drop}
img_to_drop = [img_name for img_name in original_acne_img
               if ids_to_drop.intersection(re.findall(r'\d+', img_name))]

# Dropping those images from the acne_images folder
for filename in img_to_drop:
    os.remove(os.path.join("cleaned_images/acne_images/", filename))

# Confirming the number of images left (1000 in the original run)
acne_img = os.listdir('cleaned_images/acne_images/')
print('There are', len(acne_img),'acne images left.')

**vi. Changing the label to just acne**

In [None]:
# Collapse every remaining source label into the single class name 'acne'
acne_df_complete['skin_disorder_name'] = 'acne'
print(acne_df_complete.shape)
acne_df_complete.head()

**vii. Saving the acne_df_complete dataframe as a csv file**

In [None]:
# to_csv does not create missing folders, so make sure the output folder
# exists before saving the table (without the row index)
os.makedirs('cleaned_data', exist_ok=True)
acne_df_complete.to_csv('cleaned_data/acne.csv', index=False)

## **<u>Atopic dermatitis(Eczema) </u>**

**Meaning**<br>
Atopic dermatitis, also known as eczema, is a chronic inflammatory skin condition that is characterized by dry, itchy, and inflamed patches of skin. It is a common condition that can affect people of all ages, but it is most common in infants and children. <br>

**Causes**<br>
The exact causes of atopic dermatitis are not fully understood, but it is believed to be a combination of genetic and environmental factors. People with atopic dermatitis often have a genetic predisposition to the condition, and environmental triggers such as allergens, irritants, and stress can exacerbate the symptoms.

**Symptoms**<br>
The symptoms of atopic dermatitis can vary depending on the severity of the condition. Mild cases may only present with dry, itchy skin, while more severe cases can lead to red, inflamed, and weeping skin lesions. In some cases, the skin may become thickened and scaly. Atopic dermatitis can also cause significant discomfort and interfere with a person's quality of life.

**Treatment**<br>
Treatment options for eczema include using gentle soaps and moisturizers, avoiding harsh chemicals and irritants, and taking short, lukewarm baths or showers. Prescription creams or ointments containing corticosteroids or immunosuppressants may be used for more severe cases of eczema. Antihistamines can also be helpful in reducing itching. <br>
Preventing flare-ups of eczema can be done by avoiding triggers such as certain foods, allergens, and irritants. Regular use of moisturizers can also help to keep the skin hydrated and reduce the risk of flare-ups.<br>

### **Cleaning Eczema images**

**Creating a dataframe with eczema images from the data scraped from DermNet**

In [None]:
# Distinct labels in DermNet's scraped data that stand for eczema: anything
# mentioning eczema or one of three dermatitis variants, plus the exact label
# 'dermatitis images'

disorder_names = image_df['skin_disorder_name']
eczema_pattern = ('eczema|atopic dermatitis images|hand dermatitis images'
                  '|nummular dermatitis images')
eczema_mask = disorder_names.str.contains(eczema_pattern) | (disorder_names == 'dermatitis images')
eczema_labels = disorder_names[eczema_mask].unique()
len(eczema_labels)

In [None]:
# Creating a dataframe with just eczema labels for easier cleaning.
# The first eight labels are kept; isin() selects them in one pass instead of
# OR-ing eight separate equality masks.

eczema_df = image_df[image_df['skin_disorder_name'].isin(eczema_labels[:8])]
eczema_df.info()

### **Extra eczema images**

In [None]:
# File names found in the folder of additional eczema images
extra_eczema = list(os.listdir('extra_images/extra_eczema'))
extra_eczema[:5]

In [None]:
# The folder mixes several conditions; keep only the files whose name
# mentions dermatitis or eczema

extra_eczema_images = []
for image_name in extra_eczema:
    if 'dermatitis' in image_name or 'eczema' in image_name:
        extra_eczema_images.append(image_name)
extra_eczema_images[:5]

In [None]:
# Copying these images into their own folder called 'extra_eczema_images_clean'
folder_name = 'extra_images/extra_eczema_images_clean/'


# Note📝: For reproducibility, start from an empty folder. Without this step a
#          rerun would fail on the existing directory or keep stale files.
if os.path.exists(folder_name):
    # deleting the folder and its contents
    shutil.rmtree(folder_name)

# makedirs also creates missing parent folders, where os.mkdir would raise
os.makedirs(folder_name)

for img in extra_eczema_images:
    origin = os.path.join('extra_images/extra_eczema/', img)
    destination = os.path.join(folder_name, img)
    shutil.copy(origin, destination)

In [None]:
# Creating a dataframe for the extra eczema images: one row per file, all
# labelled 'eczema'. Building it from a dict names both columns up front and
# still yields a valid (empty) two-column frame when no file was selected.

label = ['eczema' for img in extra_eczema_images]
extra_eczema_df = pd.DataFrame({'skin_disorder_name': label, 'images': extra_eczema_images})
print(extra_eczema_df.shape)
extra_eczema_df.head()

**i. Moving eczema images in the Image folder to their own folder**

In [None]:
# File names in 'Images/' that belong to one of the eczema labels
eczema_img = []
for image_name in os.listdir('Images/'):
    is_eczema = (
        'eczema' in image_name
        or 'atopic dermatitis images' in image_name
        or 'hand dermatitis images' in image_name
        or image_name.startswith('dermatitis images')
        or 'nummular dermatitis images' in image_name
    )
    if is_eczema:
        eczema_img.append(image_name)

# How many eczema images there are before any cleaning
print('There are', len(eczema_img), 'eczema images.')
eczema_img[:5]

In [None]:
# Creating a new folder with just eczema images to make cleaning easier
folder_name = 'cleaned_images/eczema_images/'

# Note📝: For reproducibility, start from an empty folder. Without this step a
#          rerun would fail on the existing directory or keep stale files.
if os.path.exists(folder_name):
    # deleting the folder and its contents
    shutil.rmtree(folder_name)

# makedirs also creates the parent 'cleaned_images/' when it is missing,
# where os.mkdir would raise FileNotFoundError
os.makedirs(folder_name)

# Copying the images into that folder (the originals stay in 'Images/')
for img in eczema_img:
    origin = os.path.join('Images/', img)
    destination = os.path.join(folder_name, img)
    shutil.copy(origin, destination)

In [None]:
# Re-list the new folder to check that no image was lost in the copy
# (631 images expected)
eczema_img = list(os.listdir('cleaned_images/eczema_images/'))
print('There are', len(eczema_img), 'eczema images.')

**ii. Dropping links from the 'images' column in the eczema_df and replacing them with the image name**

In [None]:
# So that eczema_df matches the extra-images dataframe, the image links in
# eczema_df are dropped and replaced with image file names.
# NOTE(review): eczema_img comes from os.listdir, and the names are assigned to
# rows purely by position, so a row is not guaranteed to receive the file that
# was downloaded for it — confirm, or rebuild each name from the row's label
# and index. The assignment also requires exactly as many files as rows.

eczema_images = pd.DataFrame(eczema_img, columns=['images'])
eczema_df = eczema_df.copy()
eczema_df.drop('images', axis=1, inplace=True)
eczema_df['images'] = eczema_images['images'].values
eczema_df.head()

**iii. Joining the two dataframes**

In [None]:
# Creating a dataframe with all of the eczema images.
# ignore_index=True renumbers the rows 0..n-1 directly, so no helper 'index'
# column has to be created and dropped again.

eczema_df_complete = pd.concat([eczema_df, extra_eczema_df], axis=0, ignore_index=True)

# info() prints its own report and returns None, so it is not wrapped in print()
eczema_df_complete.info()
eczema_df_complete.head()

**iv. Combining the images into one folder**

In [None]:
# Copy every filtered extra eczema image next to the DermNet ones
extra_source = 'extra_images/extra_eczema_images_clean/'
eczema_target = 'cleaned_images/eczema_images/'
for img in extra_eczema_images:
    shutil.copy(os.path.join(extra_source, img), os.path.join(eczema_target, img))

In [None]:
# Total number of eczema images before any cleaning (1367 expected)

eczema_img = list(os.listdir('cleaned_images/eczema_images/'))
print('There are a total of', len(eczema_img), 'eczema images.')

**v. Removing duplicated images from the folder**

In [None]:
# Reuse drop_duplicated_images to delete duplicate files from the eczema
# folder; the returned names are needed to drop the matching dataframe rows

duplicated_images = drop_duplicated_images('cleaned_images/eczema_images/')
duplicated_images[:5]

In [None]:
# Count what is left in the eczema folder once the duplicates are gone

eczema_img = list(os.listdir('cleaned_images/eczema_images/'))
print('There are', len(eczema_img), 'eczema images after removing duplicated images.')

In [None]:
# Getting the indexes of the duplicated images so that they can be dropped
# from eczema_df_complete too. One vectorised isin() pass returns the label of
# every matching row, including rows that share a file name, instead of
# re-scanning the whole dataframe for each row.

duplicated_indexes = eczema_df_complete.index[
    eczema_df_complete['images'].isin(duplicated_images)
].tolist()
duplicated_indexes[:10]

In [None]:
# Remove the rows of the deleted duplicate images; drop() hands back a new
# dataframe, so the original is not modified in place
eczema_df_complete = eczema_df_complete.drop(index=duplicated_indexes)
eczema_df_complete.info()

**vi. Changing the label to just eczema**

In [None]:
# Collapse every remaining source label into the single class name 'eczema'
eczema_df_complete['skin_disorder_name'] = 'eczema'
print(eczema_df_complete.shape)
eczema_df_complete.head()

**vii. Saving the eczema_df_complete dataframe as a csv file**

In [None]:
# to_csv does not create missing folders, so make sure the output folder
# exists before saving the table (without the row index)
os.makedirs('cleaned_data', exist_ok=True)
eczema_df_complete.to_csv('cleaned_data/eczema.csv', index=False)

### Actinic keratosis

**Meaning** <br>
Actinic keratosis (AK) is a skin condition that is caused by long-term exposure to UV rays, resulting in the formation of rough, scaly patches on the skin. It is considered a precancerous condition because it has the potential to develop into squamous cell carcinoma, a type of skin cancer.

**Causes** <br>
The primary cause of actinic keratosis is long-term exposure to UV rays from the sun or other sources such as tanning beds. People with fair skin, light-colored hair, and light-colored eyes are at a higher risk of developing AK. Other risk factors include a history of frequent sunburns, a weakened immune system, and exposure to chemicals such as coal tar or arsenic.

**Symptoms** <br>
The most common symptom of actinic keratosis is the formation of rough, scaly patches or lesions on the skin. These patches can be pink, red, or brown in color and may feel like sandpaper. They are usually found on areas of the skin that are frequently exposed to the sun, such as the face, scalp, ears, neck, hands, and arms. In some cases, the patches may itch or burn, and they may become inflamed or bleed if they are scratched or rubbed.

**Treatment** <br>
The treatment of actinic keratosis depends on the severity of the condition. Mild cases may be treated with topical creams or gels that contain medications such as imiquimod, fluorouracil, or diclofenac. These medications work by stimulating the immune system or causing the abnormal cells to die off. In more severe cases, cryotherapy (freezing the lesions with liquid nitrogen) or curettage (scraping off the lesions with a special tool) may be necessary. In rare cases where the lesions have developed into skin cancer, surgical removal may be required. It is also important to take steps to prevent further damage to the skin, such as wearing protective clothing and sunscreen, avoiding tanning beds, and staying out of the sun during peak hours.


In [4]:
# List every distinct label containing 'keratosis' to see which ones refer to
# actinic (solar) keratosis and which are unrelated conditions
print(image_df[image_df['skin_disorder_name'].str.contains('keratosis')]['skin_disorder_name'].unique())

['actinic keratosis affecting the face images'
 'actinic keratosis affecting the hand images'
 'actinic keratosis affecting the legs and feet images'
 'actinic keratosis affecting the scalp images'
 'actinic keratosis dermoscopy images'
 'actinic keratosis on the nose images'
 'actinic keratosis treated with imiquimod images'
 'granular parakeratosis images' 'keratosis pilaris images'
 'seborrhoeic keratosis dermoscopy images' 'seborrhoeic keratosis images'
 'solar keratosis affecting the face images'
 'solar keratosis affecting the hand images'
 'solar keratosis affecting the legs and feet images'
 'solar keratosis affecting the scalp images'
 'solar keratosis on the nose images'
 'solar keratosis treated with imiquimod images']


Actinic keratosis is also known as solar keratosis or senile keratosis

In [5]:
# Rows whose label mentions actinic keratosis or its synonym solar keratosis;
# the regex alternation matches either phrase in one pass
keratosis_df = image_df[
    image_df['skin_disorder_name'].str.contains('actinic keratosis|solar keratosis')
]
print(keratosis_df.shape)
keratosis_df.head(2)

(427, 2)


Unnamed: 0,skin_disorder_name,images
504,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
505,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...


In [6]:
# extra keratosis dataframes
# first dataframe: ISIC 2019 ground truth (one row per image, with one
# indicator column per diagnosis)
df = pd.read_csv('Data/ISIC_2019_Training_GroundTruth.csv')

# Keep the rows flagged AK == 1.0. The filtered result is copied so that the
# column assignment below acts on an independent frame and does not trigger
# pandas' SettingWithCopyWarning.
df1 = df[df['AK'] == 1.0].copy()
df1['skin_disorder_name'] = df1['images']

# drop the diagnosis indicator columns, leaving 'images' and 'skin_disorder_name'
df1 = df1.drop(['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK'], axis=1)

# second dataframe: one row per .jpg in the extra actinic keratosis folder
img_names = []
image_paths = []

for file in os.listdir('extra_images/extra_actinic_keratosis_images'):
    if file.endswith(".jpg"):
        # splitext strips only the extension, so a name that itself contains
        # dots is not cut short the way split(".")[0] would cut it
        skin_disorder_name = os.path.splitext(file)[0]
        img_names.append(skin_disorder_name)
        image_paths.append(file)

df2 = pd.DataFrame({"skin_disorder_name": img_names, "images": image_paths})

# Stack the DermNet rows and both extra sources; concat aligns columns by name
AK_df = pd.concat([keratosis_df, df1, df2], axis=0)

# display the merged dataframe
print(AK_df.shape)
AK_df.head(4)

(1391, 2)


Unnamed: 0,skin_disorder_name,images
504,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
505,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
506,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...
507,actinic keratosis affecting the face images,https://dermnetnz.org/assets/Uploads/lesions/a...


In [7]:
# Getting the keratosis image file names from the three sources
keratosis_img = [image_name for image_name in os.listdir('Images/')
                 if 'actinic keratosis' in image_name or 'solar keratosis' in image_name]

# A file in the shared ISIC folder is an AK image when its name without the
# extension is one of the AK image ids. A set lookup on the stem replaces a
# substring scan over every id for every file.
ak_ids = set(df1['images'])
AK_img = [image_name for image_name in os.listdir('extra_images/extra_AK_and_BKL_images')
          if os.path.splitext(image_name)[0] in ak_ids]
AK_img2 = os.listdir('extra_images/extra_actinic_keratosis_images')

# Checking if the folder exists and deleting it if it exists, so a rerun
# starts from an empty folder
if os.path.exists('cleaned_images/keratosis_images/'):
    # deleting the folder and its contents
    shutil.rmtree('cleaned_images/keratosis_images/')

# Creating a new folder with just keratosis images to make cleaning easier;
# makedirs also creates the parent 'cleaned_images/' when it is missing
os.makedirs('cleaned_images/keratosis_images/')

# Copy each source's files into the new folder
for source_dir, names in (('Images/', keratosis_img),
                          ('extra_images/extra_AK_and_BKL_images/', AK_img),
                          ('extra_images/extra_actinic_keratosis_images/', AK_img2)):
    for img in names:
        origin = os.path.join(source_dir, img)
        destination = os.path.join('cleaned_images/keratosis_images/', img)
        shutil.copy(origin, destination)

# Counting the keratosis images gathered in the new folder
keratosis_img = os.listdir('cleaned_images/keratosis_images/')
print('There are', len(keratosis_img),'actinic keratosis images')

There are 1391 actinic keratosis images


In [8]:
# Delete duplicate files from the keratosis folder; the returned file names
# are used afterwards to drop the matching dataframe rows
duplicated_images = drop_duplicated_images('cleaned_images/keratosis_images/')
duplicated_images[184:188]

['ISIC_0072940.jpg',
 'ISIC_0073068.jpg',
 'ISIC_0073157.jpg',
 'ISIC_0073198.jpg']

In [9]:
# number of images after removing duplicates
keratosis_img = os.listdir('cleaned_images/keratosis_images/')
print('Number of ctinic keratosis images after removing duplicated images:', len(keratosis_img))

# Remove duplicate images from the dataframe.
# 'skin_disorder_name' is first turned into each row's image file name so rows
# can be matched against the deleted files. For the DermNet rows the file in
# the folder is named '<label><row index>.jpg'; the other rows only need '.jpg'.
mask = AK_df['skin_disorder_name'].str.contains('actinic keratosis') | AK_df['skin_disorder_name'].str.contains('solar keratosis')
AK_df.loc[mask, 'skin_disorder_name'] = AK_df.loc[mask, 'skin_disorder_name'] + AK_df.loc[mask].index.astype(str)
AK_df['skin_disorder_name'] = AK_df['skin_disorder_name'] + '.jpg'

# Rows whose file was deleted as a duplicate (kept for inspection)
duplicated_df = AK_df[AK_df['skin_disorder_name'].isin(duplicated_images)]

# Keep one row per surviving file. A direct filter selects the same rows as an
# outer merge followed by keeping 'left_only', but preserves the original row
# order and keeps the 'images' column under its own name.
AK_df = (AK_df[~AK_df['skin_disorder_name'].isin(duplicated_images)]
         .drop_duplicates(subset=["skin_disorder_name"])
         .reset_index(drop=True))
print(f'Shape of Actinic keratosis dataframe{AK_df.shape}')

Number of ctinic keratosis images after removing duplicated images: 1000
Shape of Actinic keratosis dataframe(1000, 2)


The images and the dataframe have the same number of rows, 1000, on removing duplicates

In [10]:
# Every remaining row is an actinic keratosis image, so the whole column gets
# that single label; a plain assignment does this without listing and
# replacing each unique value
AK_df['skin_disorder_name'] = 'actinic keratosis'
AK_df = AK_df.rename(columns={'images_x': 'images'})
AK_df.head(4)

Unnamed: 0,skin_disorder_name,images
0,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...
1,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...
2,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...
3,actinic keratosis,https://dermnetnz.org/assets/Uploads/lesions/a...


In [11]:
# save the actinic keratosis dataframe to a csv file; to_csv does not create
# missing folders, so make sure the output folder exists first
os.makedirs('cleaned_data', exist_ok=True)
AK_df.to_csv('cleaned_data/AK.csv', index=False)

## Benign Keratosis-like Lesions

**Meaning** <br>
Benign Keratosis-like Lesions (BKL) are a group of benign skin lesions that resemble actinic keratosis (AK) but are not classified as AK because they do not have the same degree of dysplasia. BKL lesions can appear as small, scaly, or waxy bumps on the skin, ranging in color from light tan to dark brown. They typically occur on areas of the skin that have been exposed to the sun, such as the face, neck, scalp, and hands. Examples of BKL lesions include seborrheic keratosis, solar lentigo, and lichen planus-like keratosis.
 
**Causes** <br>
The exact cause of BKL is not known, but it is believed to be related to long-term sun exposure. Other factors that may contribute to the development of BKL include a weakened immune system, age, and a history of other skin conditions.

**Symptoms** <br>
BKL lesions typically appear as small, scaly, or waxy bumps on the skin. They may be light tan to dark brown in color and may have a rough, textured surface. They can be single or multiple and can occur on any part of the body, but are most commonly found on the face, neck, scalp, and hands.

**Treatment** <br>
BKL lesions are usually benign and do not require treatment unless they are causing symptoms or affecting the patient's appearance. Treatment options may include cryotherapy (freezing the lesion with liquid nitrogen), curettage (scraping the lesion off the skin), or topical medications such as 5-fluorouracil or imiquimod. In some cases, BKL lesions may be biopsied to confirm the diagnosis or rule out other skin conditions. It is important to protect the skin from sun exposure and to seek medical attention for any suspicious skin lesions.

In [12]:
# Keep the rows flagged BKL == 1.0 whose image id does not contain
# 'downsampled'. Filtering df directly and copying the result avoids
# duplicating the whole table first and keeps the column assignment below
# from triggering pandas' SettingWithCopyWarning.
BKL_df = df[(df['BKL'] == 1.0) & ~df["images"].str.contains("downsampled")].copy()
BKL_df['skin_disorder_name'] = BKL_df['images']

# drop the diagnosis indicator columns, leaving 'images' and 'skin_disorder_name'
BKL_df = BKL_df.drop(['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK'], axis=1)

# keep the first 1003 rows
# NOTE(review): presumably chosen so that 1000 remain once the 3 duplicate
# images are removed — confirm
BKL_df = BKL_df[:1003]

# display the dataframe
print(BKL_df.shape)
BKL_df.head(3)

(1003, 2)


Unnamed: 0,images,skin_disorder_name
1008,ISIC_0010491,ISIC_0010491
1544,ISIC_0012811,ISIC_0012811
1619,ISIC_0012998,ISIC_0012998


In [13]:
# Getting the BKL image file names: a file in the shared ISIC folder is a BKL
# image when its name without the extension is one of the selected ids. A set
# lookup on the stem replaces a substring scan over every id for every file.
bkl_ids = set(BKL_df['images'])
BKL_img = [image_name for image_name in os.listdir('extra_images/extra_AK_and_BKL_images')
           if os.path.splitext(image_name)[0] in bkl_ids]

# Checking if the folder exists and deleting it if it exists, so a rerun
# starts from an empty folder
if os.path.exists('cleaned_images/BKL_images/'):
    # deleting the folder and its contents
    shutil.rmtree('cleaned_images/BKL_images/')

# Creating a new folder with just BKL images to make cleaning easier;
# makedirs also creates the parent 'cleaned_images/' when it is missing
os.makedirs('cleaned_images/BKL_images/')
for img in BKL_img:
    origin = os.path.join('extra_images/extra_AK_and_BKL_images/', img)
    destination = os.path.join('cleaned_images/BKL_images/', img)
    shutil.copy(origin, destination)

# Counting the BKL images copied into the new folder
BKL_img = os.listdir('cleaned_images/BKL_images/')
print('There are', len(BKL_img),'BKL images')

There are 1003 BKL images


In [14]:
# Delete duplicate files from the BKL folder; the returned file names are used
# afterwards to drop the matching dataframe rows
duplicated_images2 = drop_duplicated_images('cleaned_images/BKL_images/')
duplicated_images2

['ISIC_0027218.jpg', 'ISIC_0031511.jpg', 'ISIC_0032315.jpg']

In [15]:
# number of images after removing duplicates
BKL_img = os.listdir('cleaned_images/BKL_images/')
print('Number of BKL images after removing duplicated images:', len(BKL_img))

# Remove duplicate images from the dataframe.
# 'skin_disorder_name' holds the image id; adding '.jpg' turns it into the
# file name so rows can be matched against the deleted files.
BKL_df['skin_disorder_name'] = BKL_df['skin_disorder_name'] + '.jpg'

# Rows whose file was deleted as a duplicate (kept for inspection)
duplicated_df = BKL_df[BKL_df['skin_disorder_name'].isin(duplicated_images2)]

# Keep one row per surviving file. A direct filter selects the same rows as an
# outer merge followed by keeping 'left_only', but preserves the original row
# order and keeps the 'images' column under its own name.
BKL_df = (BKL_df[~BKL_df['skin_disorder_name'].isin(duplicated_images2)]
          .drop_duplicates(subset=["skin_disorder_name"])
          .reset_index(drop=True))
print(f'Shape of BKL dataframe: {BKL_df.shape}')

Number of BKL images after removing duplicated images: 1000
Shape of BKL dataframe: (1000, 2)


In [16]:
# Every remaining row is a BKL image, so the whole column gets that single
# label; a plain assignment does this without listing and replacing each
# unique value
BKL_df['skin_disorder_name'] = 'Benign Keratosis-like Lesions'
BKL_df = BKL_df.rename(columns={'images_x': 'images'})
BKL_df.head(4)

Unnamed: 0,images,skin_disorder_name
0,ISIC_0010491,Benign Keratosis-like Lesions
1,ISIC_0012811,Benign Keratosis-like Lesions
2,ISIC_0012998,Benign Keratosis-like Lesions
3,ISIC_0024312,Benign Keratosis-like Lesions


In [17]:
# save the BKL dataframe to a csv file; to_csv does not create missing
# folders, so make sure the output folder exists first
os.makedirs('cleaned_data', exist_ok=True)
BKL_df.to_csv('cleaned_data/BKL.csv', index=False)