In [1]:
# -*- coding: utf-8 -*-
import sys; print('Python %s on %s' % (sys.version, sys.platform))
import os
import time
import json
from glob import glob, iglob
from tqdm import tqdm
import matplotlib.pyplot as plt

import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas', pd.__version__)
import cv2; print('opencv2', cv2.__version__)
import sklearn; print('sklearn', sklearn.__version__)
import tensorflow as tf; print('tensorflow', tf.__version__)
import tensorflow.keras as keras; print('keras', keras.__version__)

import settings
import helper
import visual

Python 3.6.8 (default, Jan 14 2019, 11:02:34) 
[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]] on linux
numpy 1.16.4
pandas 0.24.2
opencv2 4.1.0
sklearn 0.21.2
tensorflow 1.14.0
keras 2.2.4-tf


# 1. Load Meta File and Annotation

In [2]:
# Per-series metadata for the train and test splits, indexed by seriesuid.
df_meta_train, df_meta_test = (
    pd.read_csv(meta_path, index_col=['seriesuid'])
    for meta_path in (settings.PREPROCESS_TRAIN_META_FILE, settings.PREPROCESS_TEST_META_FILE)
)
# Cast the index to str so later label-based lookups use string keys.
for _meta_frame in (df_meta_train, df_meta_test):
    _meta_frame.index = _meta_frame.index.astype('str')

print('meta_train:', df_meta_train.shape, '\n', 'meta_test:', df_meta_test.shape)

meta_train: (1470, 11) 
 meta_test: (222, 11)


In [3]:
# Annotation table indexed by seriesuid (string keys); `label` is forced to int.
df_annotation = pd.read_csv(settings.PREPROCESS_ANNOTATION_FILE, index_col=['seriesuid'])
df_annotation.index = df_annotation.index.astype('str')
df_annotation = df_annotation.astype({'label': 'int'})
print('annotation:', df_annotation.shape, 'distinct lung:', df_annotation.index.nunique())

annotation: (12218, 16) distinct lung: 1436


# 2. Predicted Results Analysis (lung)

In [4]:
# Result-file suffix for this section: selects 'train/tasks_<wtype>.csv' and
# 'train/submission_<wtype>.csv' in the cells below.
wtype = "lung"

In [5]:
# Detection-stage candidates for the current `wtype`, indexed by seriesuid (as str).
df_results_d = pd.read_csv(
    ''.join([settings.SUBMISSION_DIR, 'train/tasks_', wtype, '.csv']),
    index_col=['seriesuid'],
)
df_results_d.index = df_results_d.index.astype('str')

print('results:', df_results_d.shape, 'distinct lung:', df_results_d.index.nunique())

results: (5872, 7) distinct lung: 100


In [6]:
# Annotations of the series that have detection results, restricted to the
# labels analysed in this section (1 and 5).
#
# A boolean mask replaces `df_annotation.loc[set(...)]`: list-like `.loc`
# lookups containing labels that are missing from the index are deprecated
# (FutureWarning) and raise KeyError in later pandas versions. The mask keeps
# exactly the rows whose seriesuid appears in the results and simply skips
# series that have no annotation; row order within a series is unchanged.
in_results = df_annotation.index.isin(df_results_d.index)
has_section_label = df_annotation['label'].isin([1, 5])
df_annotation_lung = df_annotation[in_results & has_section_label]
print(df_annotation_lung.shape, len(set(df_annotation_lung.index)))

(457, 16) 92


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [7]:
# Match every detection candidate against the annotations of the same series.
#   list_pos  -- (candidate row, matched annotation label) for every hit
#   list_anno -- the annotation row matched by each hit (rows may repeat)
#   list_neg  -- (candidate row, 0) for candidates that hit no annotation
list_pos = []
list_anno = []
list_neg = []
# Index.unique() keeps order of appearance, so the result lists are built in
# the same order on every run (iterating a set of str depends on hash order).
for uid in tqdm(df_results_d.index.unique()):
    if uid not in df_annotation_lung.index:
        continue

    # Both lookups are non-empty here: uid comes from the results index and was
    # just checked against the annotation index.
    predicts = df_results_d.loc[[uid]]
    labels = df_annotation_lung.loc[[uid]]

    # Annotation geometry does not depend on the candidate, so build it once
    # per series: (annotation row, centre, norm of the half-diameter vector).
    targets = []
    for _, l in labels.iterrows():
        centre = np.array([l.vcoordX, l.vcoordY, l.vcoordZ])
        radius = np.linalg.norm(np.array([l.diameterX/2, l.diameterY/2, l.diameterZ/2]))
        targets.append((l, centre, radius))

    for _, c in predicts.iterrows():
        point = np.array([c.vcoordX, c.vcoordY, c.vcoordZ])

        flag_pass = False
        for l, centre, radius in targets:
            # Hit when the candidate is no farther than `radius` from the centre;
            # only the first matching annotation is recorded.
            if (np.linalg.norm(point - centre) - radius) <= 0:
                flag_pass = True
                list_anno.append(l)
                list_pos.append((c, l['label']))
                break

        if not flag_pass:
            list_neg.append((c, 0))
    

100%|██████████| 100/100 [00:07<00:00,  6.38it/s]


In [8]:
# Annotation rows hit by several candidates appear repeatedly in list_anno;
# drop_duplicates() keeps one copy of each identical row.
df_anno_found = pd.DataFrame(list_anno).drop_duplicates()

print('pos/total:', len(list_pos), '/', len(df_results_d))
print('distinct anno:', len(df_anno_found), '/', len(list_anno), '->', len(df_annotation_lung))

pos/total: 382 / 5872
distinct anno: 201 / 382 -> 457


In [9]:
# Detection metrics for this section:
#   recall    = distinct matched annotations / all section annotations
#   precision = candidates that hit an annotation / all candidates
# An empty denominator (no annotations, no candidates, or no hit at all)
# yields 0.0 instead of raising ZeroDivisionError.
# The variable name `precison` and the printed labels are kept as they were.
n_annotations = len(df_annotation_lung)
n_candidates = len(df_results_d)
recall = len(df_anno_found) / n_annotations if n_annotations else 0.0
precison = len(list_pos) / n_candidates if n_candidates else 0.0
f1 = 2*recall*precison/(recall+precison) if (recall + precison) > 0 else 0.0
print('recall:', recall)
print('precison:', precison)
print('f1:', f1)

recall: 0.43982494529540483
precison: 0.06505449591280654
f1: 0.11334424724286007


recall: 0.43982494529540483
precison: 0.06505449591280654
f1: 0.11334424724286007

## Classification

In [10]:
# Classification-stage results for the current `wtype`, indexed by seriesuid (as str).
df_results_c = pd.read_csv(
    ''.join([settings.SUBMISSION_DIR, 'train/submission_', wtype, '.csv']),
    index_col=['seriesuid'],
)
df_results_c.index = df_results_c.index.astype('str')

print('results:', df_results_c.shape, 'distinct lung:', df_results_c.index.nunique())

results: (1412, 6) distinct lung: 100


In [11]:
# Keep only classifier outputs whose probability reaches the cut-off.
threshold_probability = 0.95
df_results_c = df_results_c.loc[df_results_c['probability'].ge(threshold_probability)]
print(df_results_c.shape)

(627, 6)


In [12]:
# Compare every retained classification result with the annotations of the
# same series.
#   list_tp     -- prediction rows whose class equals the hit annotation's label
#   list_anno_c -- the annotation row matched by each true positive (may repeat)
#   list_err    -- prediction rows that hit an annotation with a different label
# Predictions that hit no annotation are not recorded in any list.
list_tp = []
list_anno_c = []
list_err = []
# Index.unique() keeps order of appearance, so the lists (and the mismatch
# printout) come out in the same order on every run.
for uid in tqdm(df_results_c.index.unique()):
    if uid not in df_annotation_lung.index:
        continue

    # Both lookups are non-empty here: uid comes from the results index and was
    # just checked against the annotation index.
    predicts = df_results_c.loc[[uid]]
    labels = df_annotation_lung.loc[[uid]]
    meta = df_meta_train.loc[uid] # slice

    # Annotation geometry does not depend on the prediction, so build it once
    # per series: (annotation row, centre, norm of the half-diameter vector).
    targets = []
    for _, l in labels.iterrows():
        centre = np.array([l.vcoordX, l.vcoordY, l.vcoordZ])
        radius = np.linalg.norm(np.array([l.diameterX/2, l.diameterY/2, l.diameterZ/2]))
        targets.append((l, centre, radius))

    for _, c in predicts.iterrows():
        # NOTE(review): assumes coord - origin is in the same frame/units as the
        # annotations' vcoord* columns -- confirm against the preprocessing code.
        point = np.array([c.coordX - meta.originX, c.coordY - meta.originY, c.coordZ - meta.originZ])

        for l, centre, radius in targets:
            if (np.linalg.norm(point - centre) - radius) <= 0:
                # Only the first annotation hit decides the outcome.
                if int(c['class']) == int(l.label):
                    list_anno_c.append(l)
                    list_tp.append(c)
                else:
                    print(c['class'], l['label'])
                    list_err.append(c)
                break
    

 80%|████████  | 78/97 [00:00<00:00, 99.10it/s] 

1 5.0
1 5.0
1 5.0
1 5.0


100%|██████████| 97/97 [00:01<00:00, 88.66it/s]


In [13]:
# Annotation rows matched by several true positives appear repeatedly in
# list_anno_c; drop_duplicates() keeps one copy of each identical row.
df_anno_found = pd.DataFrame(list_anno_c).drop_duplicates()

print('pos / total:', len(list_tp), '/', len(df_results_c))
print('fpos(label) / total:', len(list_err), '/', len(df_results_c))
print('distinct anno:', len(df_anno_found), '/', len(list_anno_c), '->', len(df_annotation_lung))

pos / total: 147 / 627
fpos(label) / total: 4 / 627
distinct anno: 101 / 147 -> 457


In [14]:
# Classification metrics for this section:
#   recall    = distinct correctly classified annotations / all section annotations
#   precision = true positives / predictions kept after the probability cut-off
# The cut-off can leave no predictions at all, so an empty denominator (or no
# hit at all) yields 0.0 instead of raising ZeroDivisionError.
# The variable name `precison` and the printed labels are kept as they were.
n_annotations = len(df_annotation_lung)
n_predictions = len(df_results_c)
recall = len(df_anno_found) / n_annotations if n_annotations else 0.0
precison = len(list_tp) / n_predictions if n_predictions else 0.0
f1 = 2*recall*precison/(recall+precison) if (recall + precison) > 0 else 0.0
print('recall:', recall)
print('precison:', precison)
print('f1:', f1)

recall: 0.2210065645514223
precison: 0.23444976076555024
f1: 0.22752976874626454


# 3. Predicted Results Analysis (medi)

In [15]:
# Result-file suffix for this section: selects 'train/tasks_<wtype>.csv' and
# 'train/submission_<wtype>.csv' in the cells below.
wtype = "medi"

In [16]:
# Detection-stage candidates for the current `wtype`, indexed by seriesuid (as str).
df_results_d = pd.read_csv(
    ''.join([settings.SUBMISSION_DIR, 'train/tasks_', wtype, '.csv']),
    index_col=['seriesuid'],
)
df_results_d.index = df_results_d.index.astype('str')

print('results:', df_results_d.shape, 'distinct lung:', df_results_d.index.nunique())

results: (25558, 7) distinct lung: 100


In [17]:
# Annotations of the series that have detection results, restricted to the
# labels analysed in this section (31 and 32).
#
# A boolean mask replaces `df_annotation.loc[set(...)]`: list-like `.loc`
# lookups containing labels that are missing from the index are deprecated
# (FutureWarning) and raise KeyError in later pandas versions. The mask keeps
# exactly the rows whose seriesuid appears in the results and simply skips
# series that have no annotation; row order within a series is unchanged.
in_results = df_annotation.index.isin(df_results_d.index)
has_section_label = df_annotation['label'].isin([31, 32])
df_annotation_medi = df_annotation[in_results & has_section_label]
print(df_annotation_medi.shape, len(set(df_annotation_medi.index)))

(252, 16) 66


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [18]:
# Match every detection candidate against the annotations of the same series.
#   list_pos  -- (candidate row, matched annotation label) for every hit
#   list_anno -- the annotation row matched by each hit (rows may repeat)
#   list_neg  -- (candidate row, 0) for candidates that hit no annotation
list_pos = []
list_anno = []
list_neg = []
# Index.unique() keeps order of appearance, so the result lists are built in
# the same order on every run (iterating a set of str depends on hash order).
for uid in tqdm(df_results_d.index.unique()):
    if uid not in df_annotation_medi.index:
        continue

    # Both lookups are non-empty here: uid comes from the results index and was
    # just checked against the annotation index.
    predicts = df_results_d.loc[[uid]]
    labels = df_annotation_medi.loc[[uid]]

    # Annotation geometry does not depend on the candidate, so build it once
    # per series: (annotation row, centre, norm of the half-diameter vector).
    targets = []
    for _, l in labels.iterrows():
        centre = np.array([l.vcoordX, l.vcoordY, l.vcoordZ])
        radius = np.linalg.norm(np.array([l.diameterX/2, l.diameterY/2, l.diameterZ/2]))
        targets.append((l, centre, radius))

    for _, c in predicts.iterrows():
        point = np.array([c.vcoordX, c.vcoordY, c.vcoordZ])

        flag_pass = False
        for l, centre, radius in targets:
            # Hit when the candidate is no farther than `radius` from the centre;
            # only the first matching annotation is recorded.
            if (np.linalg.norm(point - centre) - radius) <= 0:
                flag_pass = True
                list_anno.append(l)
                list_pos.append((c, l['label']))
                break

        if not flag_pass:
            list_neg.append((c, 0))
    

100%|██████████| 100/100 [00:17<00:00,  3.62it/s]


In [19]:
# Annotation rows hit by several candidates appear repeatedly in list_anno;
# drop_duplicates() keeps one copy of each identical row.
df_anno_found = pd.DataFrame(list_anno).drop_duplicates()

print('pos/total:', len(list_pos), '/', len(df_results_d))
print('distinct anno:', len(df_anno_found), '/', len(list_anno), '->', len(df_annotation_medi))

pos/total: 243 / 25558
distinct anno: 116 / 243 -> 252


In [20]:
# Detection metrics for this section:
#   recall    = distinct matched annotations / all section annotations
#   precision = candidates that hit an annotation / all candidates
# An empty denominator (no annotations, no candidates, or no hit at all)
# yields 0.0 instead of raising ZeroDivisionError.
# The variable name `precison` and the printed labels are kept as they were.
n_annotations = len(df_annotation_medi)
n_candidates = len(df_results_d)
recall = len(df_anno_found) / n_annotations if n_annotations else 0.0
precison = len(list_pos) / n_candidates if n_candidates else 0.0
f1 = 2*recall*precison/(recall+precison) if (recall + precison) > 0 else 0.0
print('recall:', recall)
print('precison:', precison)
print('f1:', f1)

recall: 0.4603174603174603
precison: 0.009507786211753659
f1: 0.01863075700834511


In [21]:
# Independent copy of the detection-stage matches; `df_anno_found` itself is
# rebound by the classification summary further down.
# NOTE(review): `df` is not read again in the visible cells -- confirm it is still needed.
df = df_anno_found.copy(deep=True)

## Classification

In [22]:
# Classification-stage results for the current `wtype`, indexed by seriesuid (as str).
df_results_c = pd.read_csv(
    ''.join([settings.SUBMISSION_DIR, 'train/submission_', wtype, '.csv']),
    index_col=['seriesuid'],
)
df_results_c.index = df_results_c.index.astype('str')

print('results:', df_results_c.shape, 'distinct lung:', df_results_c.index.nunique())

results: (928, 6) distinct lung: 96


In [23]:
# Keep only classifier outputs whose probability reaches the cut-off.
threshold_probability = 0.7
df_results_c = df_results_c.loc[df_results_c['probability'].ge(threshold_probability)]
print(df_results_c.shape)

(928, 6)


In [24]:
# Compare every retained classification result with the annotations of the
# same series.
#   list_tp     -- prediction rows whose class equals the hit annotation's label
#   list_anno_c -- the annotation row matched by each true positive (may repeat)
#   list_err    -- prediction rows that hit an annotation with a different label
# Predictions that hit no annotation are not recorded in any list.
list_tp = []
list_anno_c = []
list_err = []
# Index.unique() keeps order of appearance, so the lists (and the mismatch
# printout) come out in the same order on every run.
for uid in tqdm(df_results_c.index.unique()):
    if uid not in df_annotation_medi.index:
        continue

    # Both lookups are non-empty here: uid comes from the results index and was
    # just checked against the annotation index.
    predicts = df_results_c.loc[[uid]]
    labels = df_annotation_medi.loc[[uid]]
    meta = df_meta_train.loc[uid] # slice

    # Annotation geometry does not depend on the prediction, so build it once
    # per series: (annotation row, centre, norm of the half-diameter vector).
    targets = []
    for _, l in labels.iterrows():
        centre = np.array([l.vcoordX, l.vcoordY, l.vcoordZ])
        radius = np.linalg.norm(np.array([l.diameterX/2, l.diameterY/2, l.diameterZ/2]))
        targets.append((l, centre, radius))

    for _, c in predicts.iterrows():
        # NOTE(review): assumes coord - origin is in the same frame/units as the
        # annotations' vcoord* columns -- confirm against the preprocessing code.
        point = np.array([c.coordX - meta.originX, c.coordY - meta.originY, c.coordZ - meta.originZ])

        for l, centre, radius in targets:
            if (np.linalg.norm(point - centre) - radius) <= 0:
                # Only the first annotation hit decides the outcome.
                if int(c['class']) == int(l.label):
                    list_anno_c.append(l)
                    list_tp.append(c)
                else:
                    print(c['class'], l['label'])
                    list_err.append(c)
                break
    

 76%|███████▌  | 73/96 [00:00<00:00, 84.06it/s]

32 31.0
32 31.0
32 31.0
32 31.0
31 32.0
32 31.0


100%|██████████| 96/96 [00:01<00:00, 84.60it/s]


In [25]:
# Annotation rows matched by several true positives appear repeatedly in
# list_anno_c; drop_duplicates() keeps one copy of each identical row.
df_anno_found = pd.DataFrame(list_anno_c).drop_duplicates()

print('pos / total:', len(list_tp), '/', len(df_results_c))
print('fpos(label) / total:', len(list_err), '/', len(df_results_c))
print('distinct anno:', len(df_anno_found), '/', len(list_anno_c), '->', len(df_annotation_medi))

pos / total: 225 / 928
fpos(label) / total: 6 / 928
distinct anno: 112 / 225 -> 252


In [26]:
# Classification metrics for this section:
#   recall    = distinct correctly classified annotations / all section annotations
#   precision = true positives / predictions kept after the probability cut-off
# The cut-off can leave no predictions at all, so an empty denominator (or no
# hit at all) yields 0.0 instead of raising ZeroDivisionError.
# The variable name `precison` and the printed labels are kept as they were.
n_annotations = len(df_annotation_medi)
n_predictions = len(df_results_c)
recall = len(df_anno_found) / n_annotations if n_annotations else 0.0
precison = len(list_tp) / n_predictions if n_predictions else 0.0
f1 = 2*recall*precison/(recall+precison) if (recall + precison) > 0 else 0.0
print('recall:', recall)
print('precison:', precison)
print('f1:', f1)

recall: 0.4444444444444444
precison: 0.24245689655172414
f1: 0.31375283249084884


recall: 0.4603174603174603
precison: 0.03868797308662742
f1: 0.07137697637711016