In [None]:
import os
import pandas as pd
import shutil
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit


In [None]:
# Mount Google Drive at /content/drive; the metadata CSV and the image folders
# used below are read from (and written to) paths under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


# Read the metadata CSV; the 'image_id' and 'dx' columns are the ones used below.
df = pd.read_csv('/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/HAM10000_metadata.csv')

# Hold out 20% of the rows for testing. StratifiedShuffleSplit stratifies on the
# 'dx' labels so both partitions keep (approximately) the same class proportions,
# and random_state makes the shuffle repeatable.
# NOTE(review): the split is made per image row. If the metadata can contain
# several images of the same lesion, those could end up on both sides of the
# split — confirm against the metadata columns.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional index arrays.
train_index, test_index = next(splitter.split(df['image_id'], df['dx']))

# Image ids stay as Series (keeping the original row index); labels become arrays.
X_train, X_test = df['image_id'].iloc[train_index], df['image_id'].iloc[test_index]
y_train = df['dx'].iloc[train_index].to_numpy()
y_test = df['dx'].iloc[test_index].to_numpy()

# Training table: one row per selected image id with its label.
training_df = pd.DataFrame({'image_id': X_train, 'dx': y_train})

training_df.shape

(8012, 2)

In [None]:
# Preview the first five rows of the training split.
training_df.head(5)

In [None]:
# Testing table: the held-out image ids paired with their 'dx' labels.
testing_df = pd.DataFrame(dict(image_id=X_test, dx=y_test))

testing_df.shape

(2003, 2)

In [None]:
# Preview the first five rows of the testing split.
testing_df.head(5)

In [None]:
def organize_images(df, source_directory, output_directory, verbose=True):
    """Copy images into one sub-folder per lesion type.

    For every distinct value in ``df['dx']`` a folder named after that value is
    created under ``output_directory``, and each image listed for that value is
    copied from ``source_directory`` into it as ``<image_id>.jpg``.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata with an ``image_id`` column (file name without extension) and
        a ``dx`` column (the class label used as the folder name).
    source_directory : str
        Folder that holds the ``<image_id>.jpg`` files.
    output_directory : str
        Root folder for the per-class sub-folders; created if missing.
    verbose : bool, default True
        When True, print the source/destination path of every copied image
        (the original behaviour). Pass False to avoid flooding the notebook
        output when copying thousands of files.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        Propagated from ``shutil.copy`` if a listed image is missing from
        ``source_directory``.
    """
    # exist_ok=True makes re-running the cell safe without a separate existence check.
    os.makedirs(output_directory, exist_ok=True)

    for dx_value in df['dx'].unique():
        # Convert the label to text *before* building the path so that
        # non-string labels (e.g. integer class ids) do not raise TypeError.
        dx_directory = os.path.join(output_directory, str(dx_value))
        os.makedirs(dx_directory, exist_ok=True)

        # Only the image ids are needed, so select that column directly
        # instead of materialising every row with iterrows().
        image_ids = df.loc[df['dx'] == dx_value, 'image_id']

        for image_name in image_ids:
            file_name = f"{image_name}.jpg"
            source_image_path = os.path.join(source_directory, file_name)
            destination_image_path = os.path.join(dx_directory, file_name)

            if verbose:
                print('------------------------------------\n')
                print(f'Source path : {source_image_path}')
                print(f'Destination path : {destination_image_path}')

            # Copy the image into its class folder.
            shutil.copy(source_image_path, destination_image_path)

            # Report success only after the copy has actually completed.
            if verbose:
                print(f'{image_name} Loaded.')


In [None]:
# Create the training directory tree: one sub-folder per 'dx' class.

# Folder holding the original .jpg images (replace with your own image directory).
source_dir = '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images/'
# Root folder that receives the per-class training sub-folders (replace as needed).
output_dir = '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/'

organize_images(
    df=training_df,
    source_directory=source_dir,
    output_directory=output_dir,
)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
------------------------------------

Source path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images/ISIC_0033977.jpg
Destination path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/mel/ISIC_0033977.jpg
ISIC_0033977 Loaded.
------------------------------------

Source path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images/ISIC_0032592.jpg
Destination path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/mel/ISIC_0032592.jpg
ISIC_0032592 Loaded.
------------------------------------

Source path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images/ISIC_0033520.jpg
Destination path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/mel/ISIC_0033520.jpg
ISIC_0033520 Loaded.
------------------------------------

Source

In [None]:
# Create the testing directory tree: one sub-folder per 'dx' class.

# Folder holding the original .jpg images (replace with your own image directory).
source_dir = '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images'
# Root folder that receives the per-class testing sub-folders (replace as needed).
# NOTE(review): 'testining' looks like a misspelling of 'testing'. It is kept
# unchanged here because images may already have been written to this folder
# and other code may read from it — confirm before renaming.
output_dir = '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/testining'

organize_images(
    df=testing_df,
    source_directory=source_dir,
    output_directory=output_dir,
)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
------------------------------------

Source path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images/ISIC_0030839.jpg
Destination path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/testining/nv/ISIC_0030839.jpg
ISIC_0030839 Loaded.
------------------------------------

Source path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images/ISIC_0033330.jpg
Destination path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/testining/nv/ISIC_0033330.jpg
ISIC_0033330 Loaded.
------------------------------------

Source path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/images/ISIC_0033329.jpg
Destination path : /content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/testining/nv/ISIC_0033329.jpg
ISIC_0033329 Loaded.
------------------------------------

Source

In [None]:
# Sanity check: report how many sub-folders and files each folder of the
# organised image tree contains.
dir_images_root = "/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images"

for dirpath, dirnames, filenames in os.walk(dir_images_root):
    n_dirs, n_files = len(dirnames), len(filenames)
    print(f"There are {n_dirs} directories and {n_files} images in '{dirpath}'.")

There are 3 directories and 0 images in '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images'.
There are 7 directories and 0 images in '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training'.
There are 0 directories and 5364 images in '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/nv'.
There are 0 directories and 262 images in '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/akiec'.
There are 0 directories and 879 images in '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/bkl'.
There are 0 directories and 411 images in '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/bcc'.
There are 0 directories and 890 images in '/content/drive/MyDrive/Portfolio/Skin Cancer Classification, Ham10000/dir_images/training/mel'.
There are 0 directories and 114 images in '/co