# Run this Jupyter notebook step by step (this notebook is for the ISIC skin cancer dataset)

First, create a conda environment containing Python using the following command:
```
# in a terminal:
conda create -n CDEP python
```
Enter 'y' when prompted to install the packages, then activate the newly created environment using this command:
```
conda activate CDEP
```
Then run the following commands to install ipykernel, so that you can select the CDEP environment as the kernel in the Jupyter notebook:
```
pip install ipykernel
python -m ipykernel install --user --name CDEP --display-name "CDEP"
```
Then, in this Jupyter notebook, open the Kernel menu, click "Change kernel", and select CDEP.

# install the following packages in the terminal
first, enter the CDEP environment using the following command:
```
conda activate CDEP
```
and run the following commands to install the required libraries (run one by one):
```
pip install requests
pip install tqdm
conda install -c conda-forge ipywidgets
pip install jupyter_contrib_nbextensions
conda install pandas
```

# then run the following code to import libraries:

In [74]:
%load_ext autoreload

%autoreload 2
from isic_api import ISICApi
import os
import json
import csv
from tqdm.autonotebook import tqdm
# Read the notebook settings from config.json into `data`; later cells look up
# entries such as data["data_folder"] in this dict.
with open('config.json') as config_stream:
    data = json.loads(config_stream.read())

import pandas as pd
import numpy as np
import requests
from pathlib import Path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Run this code to fetch the metadata of the skin images (pages are requested until at least 1000 malignant images have been collected):
At the end of this code, imageList should contain the metadata of 1007 malignant images and 2367 benign images.
After running it, you should see the meta_random.csv file in the following directory: data/ISIC/

In [66]:
def count_malignant(imageList):
    """Count the records in ``imageList`` that are labelled malignant.

    A record is counted only when it has a ``'metadata'`` entry containing a
    ``'clinical'`` entry whose ``'benign_malignant'`` value equals
    ``'malignant'``. Records missing any of those keys are ignored.

    Args:
        imageList: iterable of image records (dict-like).

    Returns:
        int: number of records labelled ``'malignant'``.
    """
    def _label_is_malignant(record):
        # Guard clauses: every level of the nested lookup is optional.
        if 'metadata' not in record:
            return False
        meta = record['metadata']
        if 'clinical' not in meta:
            return False
        clinical = meta['clinical']
        if 'benign_malignant' not in clinical:
            return False
        return clinical['benign_malignant'] == 'malignant'

    return sum(1 for record in imageList if _label_is_malignant(record))

# Client for the ISIC API; no credentials are passed.
api = ISICApi(username=None, password=None)
data_path = data["data_folder"]
# Page size for each API request. The original assignment was commented out,
# which left `num_imgs` undefined on a fresh kernel. Read it from the config
# when present; the fallback of 100 is an arbitrary page size.
# NOTE(review): confirm whether config.json is expected to carry "num_imgs".
num_imgs = data.get("num_imgs", 100)

os.makedirs(data_path, exist_ok=True)

imageList = []
count = 0        # number of pages fetched so far
malignants = 0   # running total of malignant records in imageList
# Query string of the page to request next; the first page starts at offset 0.
next_page = '/?limit=' + str(num_imgs) + '&offset=0&sort=name'
while malignants < 1000:
    temp = api.getJson('images' + next_page)
    imageList.extend(temp['results'])
    malignants += count_malignant(temp['results'])
    print(f"number of malignant images that fetched from API: {malignants}")
    count += 1

    # 'next' holds the URL of the following page. When it is missing or empty
    # there is nothing left to fetch, so stop instead of crashing on `.split`.
    next_url = temp.get('next')
    if not next_url:
        print(f"API has no more pages; stopping with {malignants} malignant images")
        break
    next_page = next_url.split("images")[1]
    
# Determine the union of all clinical metadata fields across the fetched images.
# 'metadata' / 'clinical' are treated as optional (as count_malignant does), so
# a record without them contributes no fields instead of raising KeyError.
metadataFields = set()
for imageL in imageList:
    metadataFields.update(((imageL.get('metadata') or {}).get('clinical') or {}).keys())

# 'isic_id' is the first column, followed by the clinical fields in sorted order.
metadataFields = ['isic_id'] + sorted(metadataFields)
outputFileName = "meta_random"
outputFilePath = os.path.join(data_path, outputFileName)

# Write the metadata to a CSV
print('Writing metadata to CSV: %s' % outputFileName+'.csv')
# newline='' is required by the csv module; without it extra blank rows appear
# on Windows. UTF-8 matches the default encoding pd.read_csv uses below.
with open(outputFilePath+'.csv', 'w', newline='', encoding='utf-8') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadataFields)
    csvWriter.writeheader()
    for imageL in imageList:
        # Copy so that adding 'isic_id' does not mutate the record in imageList.
        rowDict = dict((imageL.get('metadata') or {}).get('clinical') or {})
        rowDict['isic_id'] = imageL['isic_id']
        csvWriter.writerow(rowDict)

# Sanity check: read back the file that was just written (rather than a
# hard-coded path that may not match data_path) and show the label counts.
df = pd.read_csv(outputFilePath+'.csv')
print(df.benign_malignant.value_counts())
del df


number of malignant images that fetched from API: 7
number of malignant images that fetched from API: 12
number of malignant images that fetched from API: 12
number of malignant images that fetched from API: 12
number of malignant images that fetched from API: 17
number of malignant images that fetched from API: 27
number of malignant images that fetched from API: 36
number of malignant images that fetched from API: 74
number of malignant images that fetched from API: 113
number of malignant images that fetched from API: 180
number of malignant images that fetched from API: 247
number of malignant images that fetched from API: 307
number of malignant images that fetched from API: 336
number of malignant images that fetched from API: 401
number of malignant images that fetched from API: 459
number of malignant images that fetched from API: 521
number of malignant images that fetched from API: 555
number of malignant images that fetched from API: 576
number of malignant images that fetch

# The following code downloads the metadata of images that contain patches:
(Only the first 2000 images with patches are downloaded.)
It takes a little time to fetch all the metadata.

In [54]:
# Client for the ISIC API; no credentials are passed.
api = ISICApi(username=None, password=None)
print('Fetching metadata for noisy images')
imageDetails_noisy = []

# Upper bound on how many patch images to fetch metadata for.
max_patch_images = 2000

# Each non-empty line of the file is a file name whose part before the first
# '.' is used as the image id. Blank lines are dropped up front so they neither
# count towards the limit nor produce a malformed 'images//' request.
with open('./image_ids_patches.txt', 'r') as file:
    lines = [line for line in file if line.strip()]

for line in tqdm(lines[:max_patch_images]):
    image_id = line.strip().split(".")[0]
    # NOTE(review): the first response already carries 'isic_id'; the second
    # request may be redundant - confirm against the ISIC API before removing.
    temp = api.getJson('images/' + image_id + "/")
    imageDetail = api.getJson('images/%s' % temp['isic_id'])
    imageDetails_noisy.append(imageDetail)

Fetching metadata for noisy images


  0%|          | 0/2000 [00:00<?, ?it/s]

Now save the results to /data/ISIC/meta_patches.csv:

In [57]:
# Determine the union of all clinical metadata fields across the patch images.
# 'metadata' / 'clinical' are treated as optional so a record without them
# contributes no fields instead of raising KeyError.
metadataFields = set()
for imageDetail in imageDetails_noisy:
    metadataFields.update(((imageDetail.get('metadata') or {}).get('clinical') or {}).keys())

# 'isic_id' is the first column, followed by the clinical fields in sorted order.
metadataFields = ['isic_id'] + sorted(metadataFields)
outputFileName = "meta_patches"
outputFilePath = os.path.join(data_path, outputFileName)

# Write the metadata to a CSV
print('Writing metadata to CSV: %s' % outputFileName+'.csv')
# newline='' is required by the csv module; without it extra blank rows appear
# on Windows. UTF-8 matches the default encoding pandas uses when reading.
with open(outputFilePath+'.csv', 'w', newline='', encoding='utf-8') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadataFields)
    csvWriter.writeheader()
    for imageDetail in imageDetails_noisy:
        # Copy so that adding 'isic_id' does not mutate the fetched record.
        rowDict = dict((imageDetail.get('metadata') or {}).get('clinical') or {})
        rowDict['isic_id'] = imageDetail['isic_id']
        csvWriter.writerow(rowDict)

Writing metadata to CSV: meta_patches.csv


# merge meta_patches.csv and meta_random.csv to single file --> meta.csv

In [59]:
import pandas as pd

# Inputs: the two metadata CSVs written by the earlier cells; output: their union.
file1 = '../../data/ISIC/meta_random.csv'
file2 = '../../data/ISIC/meta_patches.csv'
output_file = '../../data/ISIC/meta.csv'
index_column = 'isic_id'
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

if index_column not in df1.columns or index_column not in df2.columns:
    raise ValueError(f"Column '{index_column}' must exist in both CSV files")

# Keep only the rows of the second file whose id is not already present in the
# first file, so images that appear in both lists are not duplicated.
df2_filtered = df2[~df2[index_column].isin(df1[index_column])]
merged_df = pd.concat([df1, df2_filtered], ignore_index=True)
# Written once (the original wrote the same file twice in a row).
merged_df.to_csv(output_file, index=False)

print(f"Merged file saved as {output_file}")

Merged file saved as ../../data/ISIC/meta.csv


The final set of images that the rest of the code downloads looks like this (4367 benign and 1007 malignant):

In [61]:
# Sanity check: load the merged metadata and print how many rows carry each
# benign_malignant label, then drop the frame again.
df = pd.read_csv('../../data/ISIC/meta.csv')
print(df['benign_malignant'].value_counts())
del df

benign_malignant
benign                     4367
malignant                  1007
indeterminate/benign         35
indeterminate/malignant      27
indeterminate                20
Name: count, dtype: int64


# Download the images listed in meta.csv and save them in /data/ISIC/raw:
This also takes a lot of time (but wait until it finishes).

In [75]:
# All image records to download: the randomly fetched ones plus the patch images.
imageDetails = imageList + imageDetails_noisy

api = ISICApi(username=None, password=None)
data_path = data["data_folder"]
savePath = os.path.join(data_path, 'raw')

os.makedirs(savePath, exist_ok=True)

# ids whose download failed; reported after the loop so the progress bar stays intact
failed_downloads = []
for detail in tqdm(imageDetails):
    imageFileOutputPath = os.path.join(savePath, '%s.jpg' % detail['isic_id'])
    # Check for an existing file BEFORE downloading, so re-running the cell
    # (or hitting an id that occurs in both lists) costs no network traffic.
    if Path(imageFileOutputPath).exists():
        continue

    url = detail['files']['full']['url']
    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # raise_for_status prevents an HTTP error page being saved as a .jpg.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: remember the failure and continue with the next image.
        failed_downloads.append(detail['isic_id'])
        continue

    with open(imageFileOutputPath, 'wb') as imageFileOutputStream:
        imageFileOutputStream.write(response.content)

if failed_downloads:
    print(f"Failed to download {len(failed_downloads)} images: {failed_downloads}")


  0%|          | 0/5500 [00:00<?, ?it/s]

# Run this to download the masks
These mask images show where the patches are in the images. After running the following cell, you can look at them in /Data/ISIC/segmentation