<a href="https://colab.research.google.com/github/DermaScan-Bangkit-2024-CapstoneProject/DermaScan-Machine-Learning/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Kaggle CLI. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel.
%pip install -q kaggle

In [None]:
from google.colab import files

# Upload kaggle.json through the browser file picker. The result is bound to
# a name so that the returned mapping of file names to file contents (which
# would include the Kaggle API key) is not echoed as the cell's output.
uploaded = files.upload()

In [3]:
# Copy kaggle.json from the current working directory into ~/.kaggle
# (created if missing) -- presumably where the kaggle CLI used below
# looks for its credentials.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Confirm the credentials file is in place and check its permissions without
# printing its contents (the API key would otherwise end up saved in the
# notebook output).
!ls -l ~/.kaggle/kaggle.json

In [5]:
# !kaggle datasets download kmader/skin-cancer-mnist-ham10000 ## same as the one below, just use the one below
!kaggle datasets download pacificrm/skindiseasedataset # this one is already split into train-test
!kaggle datasets download yashhvyass/resizeimage224224-skin-cancer-detection # see the table below
!kaggle datasets download surajghuwalewala/ham1000-segmentation-and-classification # see the table below
!kaggle datasets download kylegraupe/skin-cancer-binary-classification-dataset # already classified; already split into train-test

Dataset URL: https://www.kaggle.com/datasets/pacificrm/skindiseasedataset
License(s): CC0-1.0
Downloading skindiseasedataset.zip to /content
 99% 1.35G/1.36G [00:22<00:00, 72.2MB/s]
100% 1.36G/1.36G [00:22<00:00, 65.9MB/s]
Dataset URL: https://www.kaggle.com/datasets/yashhvyass/resizeimage224224-skin-cancer-detection
License(s): apache-2.0
Downloading resizeimage224224-skin-cancer-detection.zip to /content
 98% 335M/342M [00:05<00:00, 87.2MB/s]
100% 342M/342M [00:05<00:00, 64.6MB/s]
Dataset URL: https://www.kaggle.com/datasets/surajghuwalewala/ham1000-segmentation-and-classification
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading ham1000-segmentation-and-classification.zip to /content
100% 2.59G/2.59G [00:33<00:00, 106MB/s] 
100% 2.59G/2.59G [00:33<00:00, 82.6MB/s]
Dataset URL: https://www.kaggle.com/datasets/kylegraupe/skin-cancer-binary-classification-dataset
License(s): CC0-1.0
Downloading skin-cancer-binary-classification-dataset.zip to /content
 

### **Consideration Table: Cancer Association of Skin Conditions**
| Condition                  | Cancer Association       | Type                     |
|----------------------------|--------------------------|--------------------------|
| Actinic Keratosis          | Precancerous             | Potentially cancerous    |
| Basal Cell Carcinoma       | Cancerous                | Cancerous                |
| Bowen’s Disease            | Early-stage cancer       | Cancerous                |
| Melanoma                   | Cancerous                | Cancerous                |
| Skin Cancer (General)      | Cancerous                | Cancerous                |
| Moles                      | Generally benign         | Generally benign         |
| Sun/Sunlight Damage        | Indirectly linked        | Risk factor for cancer   |
| Benign Keratosis-like Lesions | Benign               | Non-cancerous            |
| Benign Tumors              | Benign                   | Non-cancerous            |
| Seborrheic Keratoses       | Benign                   | Non-cancerous            |
| Vascular Tumors            | Mostly benign            | Mostly non-cancerous     |
| Others (Acne, Eczema, etc.)| Benign                   | Non-cancerous            |


In [6]:
import zipfile
import os

path = "/content"

# Unpack every ZIP archive sitting directly in `path` into a sibling folder
# named after the archive (file name minus the ".zip" suffix).
for file_name in os.listdir(path):
    if not file_name.endswith('.zip'):
        continue

    file_path = os.path.join(path, file_name)
    folder_name = os.path.join(path, file_name[:-4])  # strip ".zip"
    os.makedirs(folder_name, exist_ok=True)

    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(folder_name)

    print(f"Extracted {file_name} into folder: {folder_name}")


Extracted skin-cancer-binary-classification-dataset.zip into folder: /content/skin-cancer-binary-classification-dataset
Extracted resizeimage224224-skin-cancer-detection.zip into folder: /content/resizeimage224224-skin-cancer-detection
Extracted ham1000-segmentation-and-classification.zip into folder: /content/ham1000-segmentation-and-classification
Extracted skindiseasedataset.zip into folder: /content/skindiseasedataset


In [12]:
from collections import defaultdict
import hashlib
import os
import sys


def chunk_reader(fobj, chunk_size=1024):
    """Yield successive pieces of at most ``chunk_size`` read from ``fobj``.

    Iteration stops as soon as ``fobj.read`` returns an empty (falsy) result.
    """
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    """Return the raw digest of a file's contents.

    Args:
        filename: Path of the file to hash; it is opened in binary mode.
        first_chunk_only: If true, only the first 1024 bytes are hashed;
            otherwise the whole file is hashed, read chunk by chunk.
        hash: Zero-argument callable returning a hashlib-style object
            (providing ``update()`` and ``digest()``). Defaults to
            ``hashlib.sha1``.

    Returns:
        The value of ``digest()`` on the hash object (raw bytes for hashlib
        hashes).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hashobj = hash()

    # `with` guarantees the handle is closed even when reading or hashing
    # raises, so a failure no longer leaks an open file.
    with open(filename, 'rb') as file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            for chunk in chunk_reader(file_object):
                hashobj.update(chunk)

    return hashobj.digest()


def check_for_duplicates(paths, hash=hashlib.sha1):
    """Print every byte-identical file found under the given directories.

    Candidates are narrowed in three passes so that whole files are only
    read when necessary:

    1. group files by size;
    2. within each size group, group by the hash of the first 1024 bytes;
    3. within each of those groups, compare hashes of the full contents.

    Each file whose full hash equals that of an earlier file is reported on
    stdout as ``Duplicate found: <file> and <first file with that hash>``.
    Files that cannot be accessed (OSError) are skipped silently.

    Args:
        paths: Iterable of directory paths; each is walked recursively.
        hash: Hash constructor forwarded to ``get_hash`` for both the
            partial and the full-file hashes. Defaults to ``hashlib.sha1``.

    Returns:
        None. Results are only printed.
    """
    files_by_size = defaultdict(list)  # size_in_bytes -> [full_path, ...]
    files_by_1k = defaultdict(list)    # (hash1k, size_in_bytes) -> [full_path, ...]
    hashes_full = {}                   # full_file_hash -> first full_path seen
    seen_paths = set()                 # resolved paths already queued

    # Pass 1: files sharing a size are the collision candidates.
    for path in paths:
        for dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                try:
                    # realpath dereferences symlinks so the actual target
                    # file is the one that gets measured and hashed.
                    full_path = os.path.realpath(os.path.join(dirpath, filename))
                    if full_path in seen_paths:
                        # Same file reached again (symlink or overlapping
                        # roots): queueing it twice would report it as a
                        # duplicate of itself.
                        continue
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc) - pass on
                    continue
                seen_paths.add(full_path)
                files_by_size[file_size].append(full_path)

    # Pass 2: for files with the same size, hash the first 1024 bytes only.
    for size_in_bytes, files in files_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True, hash=hash)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            # The key is the partial hash plus the size, to avoid collisions
            # between files of different sizes that share their first bytes
            # (credits to @Futal for the optimization).
            files_by_1k[(small_hash, size_in_bytes)].append(filename)

    # Pass 3: for files that also agree on the partial hash, hash the full
    # contents - collisions here are reported as duplicates.
    for files_list in files_by_1k.values():
        if len(files_list) < 2:
            continue  # this partial hash is unique, no need to read the whole file

        for filename in files_list:
            try:
                full_hash = get_hash(filename, first_chunk_only=False, hash=hash)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            duplicate = hashes_full.get(full_hash)
            if duplicate is not None:
                print("Duplicate found: {} and {}".format(filename, duplicate))
            else:
                hashes_full[full_hash] = filename

# Scan everything under /content (recursively) and print byte-identical files.
check_for_duplicates(["/content/"])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Duplicate found: /content/resizeimage224224-skin-cancer-detection/test224/ISIC_2251480.jpg and /content/resizeimage224224-skin-cancer-detection/train224/ISIC_2251480.jpg
Duplicate found: /content/resizeimage224224-skin-cancer-detection/test224/ISIC_6595668.jpg and /content/resizeimage224224-skin-cancer-detection/train224/ISIC_6595668.jpg
Duplicate found: /content/resizeimage224224-skin-cancer-detection/test224/ISIC_3854989.jpg and /content/resizeimage224224-skin-cancer-detection/train224/ISIC_3854989.jpg
Duplicate found: /content/resizeimage224224-skin-cancer-detection/test224/ISIC_1162576.jpg and /content/resizeimage224224-skin-cancer-detection/train224/ISIC_1162576.jpg
Duplicate found: /content/resizeimage224224-skin-cancer-detection/test224/ISIC_7403913.jpg and /content/resizeimage224224-skin-cancer-detection/train224/ISIC_7403913.jpg
Duplicate found: /content/resizeimage224224-skin-cancer-detection/test224/ISIC_927180

In [18]:
# Count the files in each extracted dataset folder. All paths are absolute so
# the counts do not depend on the notebook's current working directory.
!find "/content/ham1000-segmentation-and-classification/images" -type f | wc -l
!find "/content/skin-cancer-binary-classification-dataset" -type f | wc -l
!find "/content/resizeimage224224-skin-cancer-detection" -type f | wc -l
!find "/content/skindiseasedataset" -type f | wc -l

10017
288
66252
15444
