In [1]:
import cv2
import random
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
from pathlib import Path
from utils import init_dataframe, Pipeline
import segmentation
import feature_engineering
import logging
# Root logging setup: records at DEBUG level and above are written to log.txt.
# Each line starts with a "YYYY-MM-DD HH:MM:SS,mmm" timestamp, followed by the
# logger name, the level name and the message.
logging.basicConfig(
    level=logging.DEBUG,
    filename='log.txt',
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s,%(msecs)03d-%(name)s-%(levelname)s: %(message)s',
)

# Logger for this notebook, named after the executing module.
logger = logging.getLogger(__name__)


In [2]:
# Seed both the standard-library and NumPy global RNGs for reproducibility.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Single root for every dataset path below; change it here to relocate the data.
DATA_DIR = Path("data")

# Image folder and ground-truth CSV for each split.
TRAIN_DATA = DATA_DIR / "ISIC2018_Task3_Training_Input"
TRAIN_LABELS = DATA_DIR / "ISIC2018_Task3_Training_GroundTruth" / "ISIC2018_Task3_Training_GroundTruth.csv"

VALID_DATA = DATA_DIR / "ISIC2018_Task3_Validation_Input"
VALID_LABELS = DATA_DIR / "ISIC2018_Task3_Validation_GroundTruth" / "ISIC2018_Task3_Validation_GroundTruth.csv"

TEST_DATA = DATA_DIR / "ISIC2018_Task3_Test_Input"
TEST_LABELS = DATA_DIR / "ISIC2018_Task3_Test_GroundTruth" / "ISIC2018_Task3_Test_GroundTruth.csv"

# Handed to Pipeline as its output_path.
PROCESSED_IMAGES_PATH = DATA_DIR / "processed"


In [3]:
# Build one dataframe per split by pairing each image directory with its
# ground-truth CSV (evaluated in train, valid, test order).
train_df, valid_df, test_df = (
    init_dataframe(image_dir, labels_csv)
    for image_dir, labels_csv in (
        (TRAIN_DATA, TRAIN_LABELS),
        (VALID_DATA, VALID_LABELS),
        (TEST_DATA, TEST_LABELS),
    )
)

# A single Pipeline object holds all three splits plus the output location.
pipeline = Pipeline(
    train_df=train_df,
    valid_df=valid_df,
    test_df=test_df,
    output_path=PROCESSED_IMAGES_PATH,
)

# Show the initial state of the data before any processing step runs.
pipeline.print_df()


Train DataFrame shape: (10015, 10)
Columns: ['sample_id', 'image', 'image_path', 'MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']
shape: (5, 10)
┌───────────┬──────────────┬─────────────────────────────────┬─────┬───┬───────┬─────┬─────┬──────┐
│ sample_id ┆ image        ┆ image_path                      ┆ MEL ┆ … ┆ AKIEC ┆ BKL ┆ DF  ┆ VASC │
│ ---       ┆ ---          ┆ ---                             ┆ --- ┆   ┆ ---   ┆ --- ┆ --- ┆ ---  │
│ i64       ┆ str          ┆ str                             ┆ f64 ┆   ┆ f64   ┆ f64 ┆ f64 ┆ f64  │
╞═══════════╪══════════════╪═════════════════════════════════╪═════╪═══╪═══════╪═════╪═════╪══════╡
│ 0         ┆ ISIC_0024306 ┆ data\ISIC2018_Task3_Training_I… ┆ 0.0 ┆ … ┆ 0.0   ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 1         ┆ ISIC_0024307 ┆ data\ISIC2018_Task3_Training_I… ┆ 0.0 ┆ … ┆ 0.0   ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 2         ┆ ISIC_0024308 ┆ data\ISIC2018_Task3_Training_I… ┆ 0.0 ┆ … ┆ 0.0   ┆ 0.0 ┆ 0.0 ┆ 0.0  │
│ 3         ┆ ISIC_0024309 ┆ data\ISIC2018_Task3_Train

In [4]:

# Run segmentation.apply_clahe on the images referenced by 'image_path' and
# register the result as the 'clahe' column on every split.
# NOTE(review): the recorded output reports "0 processed, 11720 skipped" --
# presumably existing results are reused; confirm in utils.Pipeline.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_image_and_save(
    'image_path',
    segmentation.apply_clahe,
    'clahe',
    on_split=['test', 'valid', 'train'],
);

Column 'clahe' added: 0 processed, 11720 skipped (11720 total)


<utils.Pipeline at 0x29f14667020>

In [5]:
# Run segmentation.apply_median_blur on the 'clahe' images and register the
# result as the 'median_blur' column on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_image_and_save(
    'clahe',
    segmentation.apply_median_blur,
    'median_blur',
    on_split=['test', 'valid', 'train'],
);

Column 'median_blur' added: 0 processed, 11720 skipped (11720 total)


<utils.Pipeline at 0x29f14667020>

In [6]:
# Run segmentation.apply_otsu_threshold on the 'median_blur' images and
# register the result as the 'otsu' column on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_image_and_save(
    'median_blur',
    segmentation.apply_otsu_threshold,
    'otsu',
    on_split=['test', 'valid', 'train'],
);

Column 'otsu' added: 0 processed, 11720 skipped (11720 total)


<utils.Pipeline at 0x29f14667020>

In [7]:
# Run segmentation.apply_morph on the 'otsu' images and register the result
# as the 'morph' column on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_image_and_save(
    'otsu',
    segmentation.apply_morph,
    'morph',
    on_split=['test', 'valid', 'train'],
);

Column 'morph' added: 0 processed, 11720 skipped (11720 total)


<utils.Pipeline at 0x29f14667020>

In [8]:
# Run segmentation.find_max_contour on the 'morph' images and register the
# result as the 'contour' column on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_image_and_save(
    'morph',
    segmentation.find_max_contour,
    'contour',
    on_split=['test', 'valid', 'train'],
);

Column 'contour' added: 0 processed, 11720 skipped (11720 total)


<utils.Pipeline at 0x29f14667020>

In [9]:
# Run segmentation.find_convex_hull on the 'contour' results and register the
# output as the 'convex_hull' column on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_image_and_save(
    'contour',
    segmentation.find_convex_hull,
    'convex_hull',
    on_split=['test', 'valid', 'train'],
);

Column 'convex_hull' added: 0 processed, 11720 skipped (11720 total)


<utils.Pipeline at 0x29f14667020>

In [10]:
# Feed the 'image_path' and 'convex_hull' columns to
# feature_engineering.grayscale_variance and store the result as the
# 'grayscale_variance' column on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_columns(
    ['image_path', 'convex_hull'],
    feature_engineering.grayscale_variance,
    'grayscale_variance',
    on_split=['test', 'valid', 'train'],
);

Column 'grayscale_variance' added from 2 input column(s)


<utils.Pipeline at 0x29f14667020>

In [11]:
# Feed the 'image_path' and 'convex_hull' columns to
# feature_engineering.color_variance and store the result as the
# 'color_variance' column on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_columns(
    ['image_path', 'convex_hull'],
    feature_engineering.color_variance,
    'color_variance',
    on_split=['test', 'valid', 'train'],
);

Column 'color_variance' added from 2 input column(s)


<utils.Pipeline at 0x29f14667020>

In [12]:
# Feed the 'image_path' and 'convex_hull' columns to
# feature_engineering.color_histogram, which fills three output columns
# (b, g and r histograms) on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_columns_multi_output(
    ['image_path', 'convex_hull'],
    feature_engineering.color_histogram,
    ['color_histogram_b', 'color_histogram_g', 'color_histogram_r'],
    on_split=['test', 'valid', 'train'],
);

3 column(s) added: color_histogram_b, color_histogram_g, color_histogram_r


<utils.Pipeline at 0x29f14667020>

In [13]:
# Derive the 'contour_approximate' column from 'contour' with
# feature_engineering.contour_approximate, on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_column(
    'contour',
    feature_engineering.contour_approximate,
    'contour_approximate',
    on_split=['test', 'valid', 'train'],
);

<utils.Pipeline at 0x29f14667020>

In [14]:
# Derive the 'contour_solidity' column from 'contour' with
# feature_engineering.contour_solidity, on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_column(
    'contour',
    feature_engineering.contour_solidity,
    'contour_solidity',
    on_split=['test', 'valid', 'train'],
);

<utils.Pipeline at 0x29f14667020>

In [15]:
# Derive the 'contour_circularity' column from 'contour' with
# feature_engineering.contour_circularity, on every split.
# The trailing ';' stops the returned Pipeline from being echoed as cell output.
pipeline.apply_to_column(
    'contour',
    feature_engineering.contour_circularity,
    'contour_circularity',
    on_split=['test', 'valid', 'train'],
);

<utils.Pipeline at 0x29f14667020>

In [16]:
# Print the dataframe summary again to check that the segmentation and feature
# columns added by the preceding cells are present before exporting.
pipeline.print_df()

Train DataFrame shape: (10015, 24)
Columns: ['sample_id', 'image', 'image_path', 'MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC', 'clahe', 'median_blur', 'otsu', 'morph', 'contour', 'convex_hull', 'grayscale_variance', 'color_variance', 'color_histogram_b', 'color_histogram_g', 'color_histogram_r', 'contour_approximate', 'contour_solidity', 'contour_circularity']
shape: (5, 24)
┌───────────┬────────────┬────────────┬─────┬───┬────────────┬────────────┬────────────┬───────────┐
│ sample_id ┆ image      ┆ image_path ┆ MEL ┆ … ┆ color_hist ┆ contour_ap ┆ contour_so ┆ contour_c │
│ ---       ┆ ---        ┆ ---        ┆ --- ┆   ┆ ogram_r    ┆ proximate  ┆ lidity     ┆ ircularit │
│ i64       ┆ str        ┆ str        ┆ f64 ┆   ┆ ---        ┆ ---        ┆ ---        ┆ y         │
│           ┆            ┆            ┆     ┆   ┆ array[f32, ┆ i64        ┆ f64        ┆ ---       │
│           ┆            ┆            ┆     ┆   ┆ 32]        ┆            ┆            ┆ f64       │
╞═══════════

In [17]:
# Persist each split's dataframe as a Parquet file under data/.
feature_outputs = {
    'data/train_segmentation_features.parquet': pipeline.train_df,
    'data/valid_segmentation_features.parquet': pipeline.valid_df,
    'data/test_segmentation_features.parquet': pipeline.test_df,
}
for destination, split_frame in feature_outputs.items():
    split_frame.write_parquet(Path(destination))