# Imports

In [1]:
# Execution-environment switches. The cells below test use_jupyter first, then
# use_kaggle; when both are False the notebook falls back to Colab with Drive.
use_kaggle, use_jupyter = False, True

In [2]:
# Announce the selected environment and list its data folder.
# use_jupyter is checked before use_kaggle, so it wins when both flags are True;
# the Colab branch runs only when both flags are False.
# Lines starting with "!" are IPython shell escapes, not Python statements.
if use_jupyter:
    print("Using Jupyter")
    # Windows shell listing of the local data folder.
    !dir "C:/develop/python/Final Project/Data/"
elif use_kaggle:
    print("Using Kaggle")
    !ls "/kaggle/"
else:
    print("Using Colab with drive")
    # Imported here because google.colab is only needed in this branch.
    from google.colab import drive
    # Mount Google Drive under /content/drive before listing the course folder.
    drive.mount('/content/drive')
    !ls "/content/drive/My Drive/Colab Notebooks/ML course/"

Using Jupyter
 Volume in drive C has no label.
 Volume Serial Number is B0A9-3BF3

 Directory of C:\develop\python\Final Project\Data

09/04/2020  18:31    <DIR>          .
09/04/2020  18:31    <DIR>          ..
09/04/2020  18:21           282,513 15001-20000_filtered_cells.csv
09/04/2020  11:29         9,955,043 Data.zip
07/04/2020  16:24    <DIR>          extracted-masks-images
07/04/2020  16:24         1,856,537 extracted_cells.csv
07/04/2020  15:43    <DIR>          masks-images
07/04/2020  15:43    <DIR>          resized-images
               3 File(s)     12,094,093 bytes
               5 Dir(s)  446,597,439,488 bytes free


In [3]:
import numpy as np
import pandas as pd
import uuid
import os
import csv
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time
import cv2
from skimage import io
from skimage import data, color, img_as_ubyte
from skimage.feature import canny
from skimage.transform import hough_ellipse
from skimage.draw import ellipse_perimeter, ellipse

# Constants and Setup

In [4]:
# Choose the root folders for the active environment
# (Jupyter is checked first, then Kaggle, otherwise Colab).
if use_jupyter:
    # Local machine: inputs and outputs share a single folder.
    root_input_dir = root_output_dir = 'C:/develop/python/Final Project/Data/'
elif use_kaggle:
    # Kaggle: inputs and outputs use separate folders.
    root_input_dir, root_output_dir = '/kaggle/input/', '/kaggle/working/output/'
else:
    # Colab: inputs and outputs share the project folder on the mounted Drive.
    root_input_dir = root_output_dir = (
        '/content/drive/My Drive/Colab Notebooks/ML course/Final Project - BioCell/'
    )

In [5]:
# --- File names and folders (relative to the root directories) ---
input_csv_filename = 'extracted_cells.csv'
output_csv_filename = 'filtered_cells.csv'
cell_masks_subdirectory = 'extracted-masks-images'
images_extension = ".png"

# --- CSV layout ---
csv_field_names = [
    'source_file', 'mask_uuid', 'left_x', 'right_x', 'top_y', 'bottom_y', 'id',
]
uuid_row_index = 1  # position of the mask uuid inside each CSV row

# --- Filter thresholds ---
minimum_cover_percent = 89  # a mask is kept only when its coverage is strictly above this
max_factor_size = 4  # largest resize factor tried for a mask (inclusive)

# --- Slice of input rows handled by this run (list indices, both inclusive) ---
start_line, end_line = 20001, 21122

In [6]:
# Use the gray colormap by default and silence numpy's divide / invalid warnings.
plt.gray()
np.seterr(divide='ignore', invalid='ignore')

# Inputs
input_csv_rows_list = []
input_csv_file_path = f'{root_input_dir}{input_csv_filename}'
masks_images_path = f'{root_input_dir}{cell_masks_subdirectory}/'

# Outputs (the file name is prefixed with the processed row range)
output_csv_rows_list = []
output_csv_file_path = f'{root_output_dir}{start_line}-{end_line}_{output_csv_filename}'

<Figure size 432x288 with 0 Axes>

# Function Definitions

In [7]:
def getMaskBestEllipse(mask_data):
    """Find the strongest ellipse candidate in a mask image.

    The mask is reduced to Canny edges, the edges are passed to the Hough
    ellipse transform, and the candidate with the largest ``accumulator``
    value is selected.

    Args:
        mask_data: image array of the mask to analyse.

    Returns:
        A ``(yc, xc, a, b, orientation)`` tuple. The first four values are
        rounded to ints; ``orientation`` is returned exactly as reported by
        the Hough transform.

    Raises:
        IndexError: if the Hough transform reports no candidate at all.
    """
    edge_map = canny(mask_data, sigma=2.0, low_threshold=0.55, high_threshold=0.8)
    candidates = hough_ellipse(edge_map, accuracy=10, threshold=40, min_size=0, max_size=500)

    # Ascending sort on the vote count leaves the best candidate in the last slot.
    candidates.sort(order='accumulator')
    top_candidate = list(candidates[-1])

    # Entry 0 (the accumulator) is dropped; entries 1..4 are rounded, entry 5 is the angle.
    center_y = int(round(top_candidate[1]))
    center_x = int(round(top_candidate[2]))
    semi_axis_a = int(round(top_candidate[3]))
    semi_axis_b = int(round(top_candidate[4]))
    return center_y, center_x, semi_axis_a, semi_axis_b, top_candidate[5]

In [8]:
def resizeImage(image_data, scale_size):
    """Return ``image_data`` scaled by ``scale_size`` along both axes.

    The image passed as ``image_data`` is the one that is measured and resized;
    the function no longer depends on a module-level ``current_mask_data``.

    Args:
        image_data: image array; ``shape[0]`` is the height and ``shape[1]``
            the width.
        scale_size: multiplicative factor applied to both dimensions.

    Returns:
        The resized image produced by ``cv2.resize`` with ``INTER_AREA``
        interpolation.
    """
    scaled_height = int(image_data.shape[0] * scale_size)
    scaled_width = int(image_data.shape[1] * scale_size)
    # cv2.resize expects the target size as (width, height), i.e. the reverse
    # of the (rows, cols) order used by the array shape.
    target_size = (scaled_width, scaled_height)
    return cv2.resize(image_data, target_size, interpolation=cv2.INTER_AREA)

In [9]:
def getEllipseCoverPercent(mask_data, factor_size):
    """Return the percentage of mask pixels covered by the best-fit ellipse.

    The mask is resized by ``factor_size``, the best ellipse is detected on the
    resized image, and the ellipse (perimeter and fill) is painted black on a
    copy of it. The result is the share of originally non-zero pixels that the
    ellipse blacked out.

    Args:
        mask_data: image array of the mask.
        factor_size: resize factor applied before the ellipse detection.

    Returns:
        The coverage in percent, or ``0`` when no usable ellipse could be
        obtained (a message is printed in that case).
    """
    # Resize the image first
    resized_image = resizeImage(mask_data, factor_size)
    mask_copy = resized_image.copy()

    # Detection and drawing share one guarded section so that every "no usable
    # ellipse" situation is reported the same way instead of aborting the batch.
    try:
        # Get the best ellipse that covers the mask
        yc, xc, a, b, orientation = getMaskBestEllipse(resized_image)

        # Get the indices of the ellipse fill and the ellipse perimeter
        perimeter_y, perimeter_x = ellipse_perimeter(yc, xc, a, b, orientation)
        fill_y, fill_x = ellipse(yc, xc, a, b, rotation=-orientation)

        # Set 0 (black) in the mask copy where the indices are located.
        # NOTE(review): coordinates past the bottom/right border raise IndexError
        # (handled below), but negative coordinates wrap around silently --
        # confirm whether the ellipse should be clipped to the image instead.
        mask_copy[perimeter_y, perimeter_x] = 0
        mask_copy[fill_y, fill_x] = 0

        # Non-zero pixels left in the copy are mask pixels the ellipse does not cover
        full_mask_nonzero = np.count_nonzero(resized_image)
        ellipse_non_zero = np.count_nonzero(mask_copy)

        # An all-zero mask makes this division fail, which also ends in the handler.
        return ((full_mask_nonzero - ellipse_non_zero) / full_mask_nonzero) * 100
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop a long run.
        print(f'Could not get ellipse for factor size {factor_size}')
        return 0

# Filter Logic

In [10]:
# Filter the extracted cells by how well an ellipse covers each mask:
#   1. Read the csv file produced by the extraction module.
#   2. For every row in [start_line, end_line], load the mask image named by the row's uuid.
#   3. Try resize factors 2..max_factor_size; for each one compute the ellipse
#      coverage percentage with getEllipseCoverPercent.
#   4. As soon as one factor gives more than minimum_cover_percent the row is
#      kept (copied to the output rows); otherwise the row is dropped.
# TODO: decide what the best value of minimum_cover_percent is.

# newline='' is what the csv module asks for; the file is only needed while
# the rows are read, so it is closed before the long-running loop starts.
with open(input_csv_file_path, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    input_csv_rows_list = list(csv_reader)

# Start from an empty result so re-running this cell does not append duplicates.
output_csv_rows_list = []

counter = start_line
for line in input_csv_rows_list[start_line:end_line + 1]:
    start = time.time()
    counter = counter + 1

    # Get the mask data.
    # Keep the module-level name current_mask_data: helper code in this
    # notebook may read it directly.
    current_mask_uuid = line[uuid_row_index]
    current_mask_filename = current_mask_uuid + images_extension
    current_mask_data = io.imread(masks_images_path + current_mask_filename, as_gray=True)

    for factor_size in range(2, max_factor_size + 1):
        # Get the coverage percent for the current scale factor
        current_cover_percent = getEllipseCoverPercent(current_mask_data, factor_size)

        # If the coverage percentage is greater than the wanted minimum - this is a good mask
        if current_cover_percent > minimum_cover_percent:
            # Column i of the input row is stored under csv_field_names[i].
            output_csv_rows_list.append(
                {field_name: line[column] for column, field_name in enumerate(csv_field_names)}
            )
            # No need to try larger factors once the mask is accepted.
            break

    end = time.time()
    print(f'line number {counter} - time {end - start} seconds')
                    
                    
# Create the destination folder when it is missing (e.g. a fresh Kaggle
# working directory) so the results of the long run are not lost on write.
output_directory = os.path.dirname(output_csv_file_path)
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# Write the accepted rows. No header row is written.
# NOTE(review): presumably the per-range output files are concatenated later,
# which is why no header is emitted -- confirm before adding one.
with open(output_csv_file_path, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=csv_field_names)
    # Report the file that is actually written (it carries the row-range prefix).
    print(f'Writing {len(output_csv_rows_list)} filtered lines to {output_csv_file_path}')
    writer.writerows(output_csv_rows_list)

line number 20002 - time 5.775520086288452 seconds
line number 20003 - time 1.7602410316467285 seconds
line number 20004 - time 1.3529653549194336 seconds
line number 20005 - time 0.6273436546325684 seconds
line number 20006 - time 3.3115451335906982 seconds
line number 20007 - time 3.1943318843841553 seconds
line number 20008 - time 2.7674527168273926 seconds
line number 20009 - time 1.4570567607879639 seconds
line number 20010 - time 4.270030498504639 seconds
line number 20011 - time 0.48488926887512207 seconds
Could not get ellipse for factor size 2
line number 20012 - time 5.867037534713745 seconds
line number 20013 - time 1.7998123168945312 seconds
line number 20014 - time 0.4679844379425049 seconds
Could not get ellipse for factor size 3
line number 20015 - time 3.6142852306365967 seconds
line number 20016 - time 3.7694718837738037 seconds
line number 20017 - time 3.539569616317749 seconds
line number 20018 - time 9.712830781936646 seconds
line number 20019 - time 1.9572305679321

line number 20160 - time 2.5427558422088623 seconds
line number 20161 - time 2.1957085132598877 seconds
line number 20162 - time 2.453596830368042 seconds
line number 20163 - time 2.305368423461914 seconds
line number 20164 - time 1.8049132823944092 seconds
line number 20165 - time 0.3630375862121582 seconds
line number 20166 - time 2.6843931674957275 seconds
line number 20167 - time 2.8817718029022217 seconds
line number 20168 - time 4.498850345611572 seconds
line number 20169 - time 1.5280826091766357 seconds
line number 20170 - time 11.973897695541382 seconds
line number 20171 - time 2.1162922382354736 seconds
line number 20172 - time 0.442486047744751 seconds
line number 20173 - time 1.6625010967254639 seconds
line number 20174 - time 2.7270498275756836 seconds
line number 20175 - time 1.5207717418670654 seconds
line number 20176 - time 3.302562713623047 seconds
line number 20177 - time 0.3450748920440674 seconds
line number 20178 - time 0.42653870582580566 seconds
line number 2017

line number 20316 - time 4.030864953994751 seconds
line number 20317 - time 1.9608099460601807 seconds
line number 20318 - time 0.5690076351165771 seconds
line number 20319 - time 0.8929648399353027 seconds
line number 20320 - time 3.4175209999084473 seconds
line number 20321 - time 1.1141233444213867 seconds
line number 20322 - time 1.7948408126831055 seconds
line number 20323 - time 1.8041083812713623 seconds
Could not get ellipse for factor size 3
line number 20324 - time 5.633329153060913 seconds
line number 20325 - time 0.2972686290740967 seconds
line number 20326 - time 0.21535944938659668 seconds
line number 20327 - time 5.491539478302002 seconds
line number 20328 - time 2.186371088027954 seconds
line number 20329 - time 0.21689891815185547 seconds
line number 20330 - time 0.7332570552825928 seconds
line number 20331 - time 0.22734284400939941 seconds
line number 20332 - time 1.2776806354522705 seconds
line number 20333 - time 1.2247099876403809 seconds
line number 20334 - time 

line number 20471 - time 0.31102657318115234 seconds
line number 20472 - time 0.31066465377807617 seconds
line number 20473 - time 2.0009751319885254 seconds
line number 20474 - time 1.5866703987121582 seconds
line number 20475 - time 0.5064949989318848 seconds
line number 20476 - time 1.0753729343414307 seconds
line number 20477 - time 0.28899574279785156 seconds
line number 20478 - time 3.0915191173553467 seconds
line number 20479 - time 1.5993082523345947 seconds
line number 20480 - time 2.0861828327178955 seconds
line number 20481 - time 0.28706789016723633 seconds
line number 20482 - time 1.313469409942627 seconds
line number 20483 - time 1.126593828201294 seconds
line number 20484 - time 2.7747325897216797 seconds
line number 20485 - time 1.5691230297088623 seconds
line number 20486 - time 2.1481080055236816 seconds
line number 20487 - time 0.22630906105041504 seconds
line number 20488 - time 0.2881045341491699 seconds
line number 20489 - time 3.014885187149048 seconds
line numbe

line number 20626 - time 0.336193323135376 seconds
line number 20627 - time 0.3186216354370117 seconds
line number 20628 - time 0.36469578742980957 seconds
line number 20629 - time 0.29441046714782715 seconds
line number 20630 - time 0.311690092086792 seconds
line number 20631 - time 1.310755968093872 seconds
line number 20632 - time 2.81535005569458 seconds
line number 20633 - time 1.3074524402618408 seconds
line number 20634 - time 0.3766472339630127 seconds
line number 20635 - time 2.0831151008605957 seconds
line number 20636 - time 1.8453612327575684 seconds
line number 20637 - time 1.8225069046020508 seconds
line number 20638 - time 3.4674952030181885 seconds
line number 20639 - time 0.3605520725250244 seconds
line number 20640 - time 14.429987668991089 seconds
line number 20641 - time 3.165949821472168 seconds
line number 20642 - time 1.9510164260864258 seconds
line number 20643 - time 0.21898889541625977 seconds
line number 20644 - time 2.7591421604156494 seconds
line number 206

line number 20781 - time 1.592081069946289 seconds
line number 20782 - time 0.9009177684783936 seconds
line number 20783 - time 1.6149156093597412 seconds
Could not get ellipse for factor size 4
line number 20784 - time 6.489750623703003 seconds
line number 20785 - time 1.8160021305084229 seconds
line number 20786 - time 1.4674599170684814 seconds
line number 20787 - time 11.337825775146484 seconds
line number 20788 - time 1.039104700088501 seconds
line number 20789 - time 1.5021145343780518 seconds
line number 20790 - time 0.9142882823944092 seconds
Could not get ellipse for factor size 2
line number 20791 - time 2.6764309406280518 seconds
line number 20792 - time 1.781761646270752 seconds
line number 20793 - time 2.3324694633483887 seconds
line number 20794 - time 1.1971545219421387 seconds
line number 20795 - time 2.4860599040985107 seconds
line number 20796 - time 1.3703935146331787 seconds
line number 20797 - time 0.9873135089874268 seconds
line number 20798 - time 0.3226134777069

line number 20936 - time 1.9298491477966309 seconds
line number 20937 - time 0.9383172988891602 seconds
Could not get ellipse for factor size 2
line number 20938 - time 2.2536041736602783 seconds
line number 20939 - time 2.2084801197052 seconds
line number 20940 - time 0.9751965999603271 seconds
line number 20941 - time 0.3055534362792969 seconds
line number 20942 - time 3.7980470657348633 seconds
line number 20943 - time 1.7413394451141357 seconds
line number 20944 - time 1.074941873550415 seconds
line number 20945 - time 1.7517564296722412 seconds
line number 20946 - time 2.0401344299316406 seconds
line number 20947 - time 0.33485913276672363 seconds
line number 20948 - time 0.23161792755126953 seconds
line number 20949 - time 1.8992853164672852 seconds
Could not get ellipse for factor size 4
line number 20950 - time 3.320817708969116 seconds
Could not get ellipse for factor size 4
line number 20951 - time 21.028750896453857 seconds
line number 20952 - time 0.28026294708251953 second

line number 21092 - time 1.1972498893737793 seconds
line number 21093 - time 2.185279607772827 seconds
line number 21094 - time 10.09047818183899 seconds
line number 21095 - time 8.459338426589966 seconds
Could not get ellipse for factor size 3
line number 21096 - time 9.68746566772461 seconds
line number 21097 - time 1.4855270385742188 seconds
line number 21098 - time 3.0855836868286133 seconds
line number 21099 - time 1.9440772533416748 seconds
line number 21100 - time 5.762082099914551 seconds
line number 21101 - time 1.2659547328948975 seconds
line number 21102 - time 4.21864652633667 seconds
line number 21103 - time 1.3950109481811523 seconds
line number 21104 - time 0.3800179958343506 seconds
line number 21105 - time 0.35462403297424316 seconds
line number 21106 - time 2.2997920513153076 seconds
line number 21107 - time 0.44277071952819824 seconds
line number 21108 - time 0.46878528594970703 seconds
line number 21109 - time 1.5093224048614502 seconds
line number 21110 - time 4.61