Using the other object as test set 🤪

In [41]:
# Bonus cell for using the different test set

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
from copy import copy
from matplotlib import cm, colors
import cv2

# --- Test set: layout of the object grid inside every layer image (pixel indices) ---
objectwidth = 100
objectheight = 100
xspacing = 116
yspacing = 300
xstart = 293
ystart = 445
xend = 1730
yend = 1770
powderthickness = 80
endlayer = 225

# Read the layer images in file-name order. Only the first `endlayer` layers
# are loaded because the per-object array allocated below has exactly that
# many layers.
paths_sorted = sorted(pathlib.Path('./OT data 80 um/int').glob('*.tif'))
integrals = np.array([np.array(plt.imread(path)) for path in paths_sorted[:endlayer]])

objectinfo = pd.read_csv('Parameters2.csv', names=["Object", "P", "S", "H", "Porosity", "Label"])
# NOTE(review): VED presumably stands for volumetric energy density - confirm units of P, S, H.
objectinfo.insert(1, "VED", objectinfo.P * 1000/(objectinfo.S * objectinfo.H * powderthickness))

# One bounding box per grid position: rows are visited from the largest y to
# the smallest, and x ascends within each row.
objectCoordinates = [
    [x, x + objectwidth, y, y + objectheight]
    for y in reversed(range(ystart, yend, objectheight + yspacing))
    for x in range(xstart, xend, xspacing + objectwidth)
]
coorddf = pd.DataFrame(objectCoordinates, columns=['xstart', 'xend', 'ystart', 'yend'])
# Index-aligned join: the n-th bounding box is paired with the n-th CSV row.
objectinfo = coorddf.join(objectinfo)

del coorddf
del objectCoordinates
del paths_sorted

# Objects labelled 'KH' are excluded; the remaining labels become 0 (GOOD) / 1 (LOF).
objectinfo.drop(objectinfo[objectinfo.Label == 'KH'].index, inplace=True)
objectinfo.replace('GOOD', 0, inplace=True)
objectinfo.replace('LOF', 1, inplace=True)
objectinfo.reset_index(drop=True, inplace=True)

objects = np.full((len(objectinfo), endlayer, objectheight, objectwidth), np.nan)

# Cut every object's bounding box out of the full layer stack.
# (`row` instead of `object` so the builtin is not shadowed in the notebook.)
for row in objectinfo.itertuples():
    objects[row.Index] = integrals[:, row.ystart:row.yend, row.xstart:row.xend]
del integrals

# Summed over objects only, so `aggregate` (and therefore `testmask`) has one
# entry per (layer, y, x).
# NOTE(review): the training cell sums over objects AND layers and repeats one
# 2-D mask for every layer; confirm the per-layer mask here is intended.
aggregate = np.sum(objects, axis=(0))

emptyRatio = 30
limit = np.percentile(aggregate, emptyRatio)
testmask = aggregate >= limit

testobjects = objects
testobjectinfo = objectinfo
del objects
del objectinfo

In [42]:
# Read train data

# Layout of the object grid inside every layer image (pixel indices).
emptyRatio = 47
objectwidth = 83
objectheight = 122
xspacing = 133
yspacing = 270
xstart = 293
ystart = 268
xend = 1730
yend = 1770
powderthickness = 80
endlayer = 187

# Only the first `endlayer` layers (in file-name order) are used, so only
# those are read from disk.
paths_sorted = sorted(pathlib.Path('./OT data 80 um/int').glob('*.tif'))
integrals = np.array([np.array(plt.imread(path)) for path in paths_sorted[:endlayer]])

del paths_sorted
objectinfo = pd.read_csv('Parameters.csv', names=["Object", "P", "S", "H", "Porosity", "Label"])

objectsplit = 3
upsamplingratio = 4
layersPerObject = endlayer // objectsplit
positive_multiplier = 1

# Training windows may start anywhere up to the last loaded layer.
testEnd = endlayer

# NOTE(review): VED presumably stands for volumetric energy density - confirm units of P, S, H.
objectinfo.insert(1, "VED", objectinfo.P * 1000/(objectinfo.S * objectinfo.H * powderthickness))

# One bounding box per grid position: rows are visited from the largest y to
# the smallest, and x ascends within each row.
objectCoordinates = [
    [x, x + objectwidth, y, y + objectheight]
    for y in reversed(range(ystart, yend, objectheight + yspacing))
    for x in range(xstart, xend, xspacing + objectwidth)
]
coorddf = pd.DataFrame(objectCoordinates, columns=['xstart', 'xend', 'ystart', 'yend'])
# Index-aligned join: the n-th bounding box is paired with the n-th CSV row.
objectinfo = coorddf.join(objectinfo)
objectinfo.drop(objectinfo[objectinfo.Label == 'KH'].index, inplace=True)
objectinfo.replace('GOOD', 0, inplace=True)
objectinfo.replace('LOF', 1, inplace=True)

# Every object is split into windows of `layersPerObject` layers whose start
# advances by half a window, so consecutive windows overlap.
zs = [objectinfo.copy().assign(zstart=z, zend=z+layersPerObject) for z in range(0, testEnd-layersPerObject, layersPerObject//2)]
trainobjectinfo = pd.concat(zs, ignore_index=True)

# Keep every window labelled 1 (LOF); other windows are kept only when their
# zstart is a multiple of positive_multiplier.
trainobjectinfo = trainobjectinfo[
    (trainobjectinfo['Label'] == 1) | (trainobjectinfo['zstart'] % (positive_multiplier) == 0)
].reset_index(drop=True)

del zs
del coorddf
del objectCoordinates
del objectinfo

trainobjects = np.full((len(trainobjectinfo), layersPerObject, objectheight, objectwidth), np.nan)

# Cut every (layer window, bounding box) sample out of the layer stack.
# (`row` instead of `object` so the builtin is not shadowed in the notebook.)
for row in trainobjectinfo.itertuples():
    trainobjects[row.Index] = integrals[row.zstart:row.zend, row.ystart:row.yend, row.xstart:row.xend]

# Summed over samples and layers -> one value per (y, x) position.
aggregate = np.sum(trainobjects, axis=(0,1))

# Positions below the emptyRatio-th percentile are masked out; the 2-D mask is
# then repeated so it can index a whole (layers, y, x) sample.
limit = np.percentile(aggregate, emptyRatio)
trainmask = aggregate >= limit
trainmask = np.repeat([trainmask], layersPerObject, 0)
del aggregate
del limit
del integrals

In [43]:
from sklearn import neighbors, metrics
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn import preprocessing
from datetime import datetime

def preprocess(objects, type, sharpening, backgroundmask):
    """Sharpen every layer of every object and blank out the background.

    Parameters
    ----------
    objects : array
        Iterated along its first axis; each element is a stack of 2-D images
        that are filtered one by one with ``cv2.filter2D``.
    type : str
        ``'scatter'`` or ``'spatstat'`` keep the sharpened values as they are;
        ``'moran'`` additionally rescales each object to ``(x - mean) / mean``,
        the mean being taken over the unmasked positions.
    sharpening : str
        ``'diagonal'`` selects the 8-neighbour sharpening kernel; any other
        value selects the 4-neighbour kernel.
    backgroundmask : boolean array
        Indexed against one object's stack; positions where it is False are
        set to NaN in the result.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``objects`` holding the processed stacks.

    Raises
    ------
    ValueError
        If ``type`` is not one of the three supported values (previously such
        a call silently returned an all-NaN array).
    """
    if type not in ('scatter', 'spatstat', 'moran'):
        raise ValueError(f"unknown preprocessing type: {type!r}")

    # The kernel depends only on the setting, so build it once, not per object.
    if sharpening == 'diagonal':
        sharpeningKernel = np.array([[-1, -1, -1],
                                     [-1,  9, -1],
                                     [-1, -1, -1]])
    else:
        sharpeningKernel = np.array([[0, -1,  0],
                                     [-1, 5, -1],
                                     [0, -1,  0]])

    rtn = np.full(objects.shape, np.nan)
    for index, layers in enumerate(objects):
        sharpened = np.array([cv2.filter2D(src=image, ddepth=-1, kernel=sharpeningKernel) for image in layers])
        xs = np.array(sharpened, copy=True, dtype=np.float32)
        if type == 'moran':
            avg = np.mean(xs, where=backgroundmask)
            # NOTE(review): the original also computed the masked standard
            # deviation without using it; confirm that dividing by the mean
            # (rather than the standard deviation) is the intended scaling.
            xs = (xs - avg) / avg
        xs[~backgroundmask] = np.nan
        rtn[index] = xs
    return rtn


def calculateoutliers(objects, type, neighbourhoodSetting, windowSize):
    """Score every pixel by how much it deviates from its neighbourhood.

    For each object the layers are smoothed with a 3x3 kernel and then
    averaged over ``windowSize`` consecutive layers. Each pixel of the layer
    in the middle of a window is compared with that window's average.

    Parameters
    ----------
    objects : 4-D array
        Unpacked as ``(object, layer, y, x)``; NaN marks masked-out pixels.
    type : str
        ``'spatstat'``: the difference ``pixel - neighbourhood`` is
        standardised with the mean/std of the finite differences of that
        object. Any other value: the residual of a degree-1 least-squares fit
        of pixel value against neighbourhood value.
    neighbourhoodSetting : str
        ``'grid'`` selects the uniform 3x3 kernel; any other value selects the
        centre-weighted 3x3 kernel.
    windowSize : int
        Number of consecutive layers averaged into one neighbourhood value.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(object, layer + 1 - windowSize, y, x)``.

    Raises
    ------
    AssertionError
        If an object has no finite or no non-finite entries, or if 40 % or
        less of all outlier values are finite.
    """
    c, z, y, x = objects.shape

    # Everything below depends only on the settings, so it is computed once
    # instead of once per object.
    if neighbourhoodSetting == 'grid':
        neighbourkernel = np.array(
            [[1, 1, 1],
             [1, 1, 1],
             [1, 1, 1]]
        )/9
    else:
        neighbourkernel = np.array(
            [[1, 2, 1],
             [2, 4, 2],
             [1, 2, 1]]
        )/16
    # Layers dropped at the start / end so the compared layer sits in the
    # middle of its averaging window.
    offset = windowSize // 2
    endoffset = windowSize - offset - 1

    outlierValues = np.full((c, z + 1 - windowSize, y, x), np.nan)
    for index, layers in enumerate(objects):
        # Step 1: calculate neighbourhood
        flatNeighbourhood = np.array([cv2.filter2D(src=layer, ddepth=-1, kernel=neighbourkernel) for layer in layers])
        neighbourhoodValues = np.array([
            np.sum(flatNeighbourhood[layerIndex-windowSize:layerIndex], axis=0)/windowSize
            for layerIndex in range(windowSize, z+1)
        ])

        # Step 2: calculate outlier
        xs = layers[offset:z-endoffset]
        ys = neighbourhoodValues[0:z-windowSize+1]
        valid = np.logical_and(np.isfinite(xs), np.isfinite(ys))

        # Both usable and masked positions must be present. any()/all() give
        # the same answer as building a Python set of every element, without
        # hashing millions of values.
        assert valid.any() and not valid.all()
        if type == 'spatstat':
            outliers = xs - ys
            avg = np.mean(outliers[valid])
            std = np.std(outliers[valid])
            outlierValues[index] = (outliers - avg) / std
        else:
            # NOTE(review): original author's open question - is the axis
            # right for moran/scatter?
            p = np.poly1d(np.polyfit(ys[valid], xs[valid], 1))
            fitted = p(ys)
            outlierValues[index] = xs - fitted
            assert(outlierValues[index].shape == xs.shape == fitted.shape)
        # The result must contain finite and non-finite entries; this also
        # guarantees more than one distinct value.
        finite = np.isfinite(outlierValues[index])
        assert finite.any() and not finite.all()
    assert(np.average(np.isfinite(outlierValues)) > 0.4)
    return outlierValues

def encode(outlierobjects, type, buckets, histnormalise, minval=None, maxval=None):
    """Turn every object's outlier values into a fixed-length histogram.

    Parameters
    ----------
    outlierobjects : 4-D array
        One entry per object along the first axis; non-finite values are
        ignored when building the histograms.
    type : str
        Unused; kept so existing calls keep working.
    buckets : int
        Number of histogram bins.
    histnormalise : str
        ``'column'`` / ``'row'`` pass the feature matrix through
        ``preprocessing.normalize`` with ``axis=0`` / ``axis=1``; any other
        value leaves the histograms untouched.
    minval, maxval : float, optional
        Histogram range. A bound left at ``None`` is taken from the finite
        values of ``outlierobjects``. Pass the bounds returned for the
        training data when encoding test data so both share the same bins.
        ``None`` replaces the former ``0`` sentinel, so a bound of exactly 0
        is now honoured instead of being silently recomputed.

    Returns
    -------
    tuple
        ``(X, minval, maxval)`` where ``X`` has shape
        ``(number of objects, buckets)``.
    """
    numberOfObjects, _, _, _ = outlierobjects.shape

    if minval is None or maxval is None:
        finiteValues = outlierobjects[np.isfinite(outlierobjects)]
        if minval is None:
            minval = np.min(finiteValues)
        if maxval is None:
            maxval = np.max(finiteValues)

    X = np.full((numberOfObjects, buckets), np.nan)
    for index, xs in enumerate(outlierobjects):
        hist, _ = np.histogram(xs[np.isfinite(xs)], bins=buckets, range=(minval, maxval), density=True)
        X[index] = hist

    if histnormalise == 'column':
        X = preprocessing.normalize(X, axis=0)
    elif histnormalise == 'row':
        X = preprocessing.normalize(X, axis=1)
    return X, minval, maxval

def classify(Xtrain, Ytrain, Xtest, Ytest, n_neighbors):
    """Fit a distance-weighted k-NN classifier and score it.

    Parameters
    ----------
    Xtrain, Ytrain : training features and labels.
    Xtest, Ytest : held-out features and labels.
    n_neighbors : int
        Number of neighbours used by the classifier.

    Returns
    -------
    tuple
        ``(cv_score, test_auc, test_precision, test_recall)`` where
        ``cv_score`` is the mean minus the standard deviation of the 5-fold
        cross-validated ROC-AUC on the training data.
    """
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
    # NOTE(review): if several training rows stem from the same physical
    # object, plain 5-fold CV can put them in different folds - consider
    # grouped folds.
    cvs = cross_val_score(clf, Xtrain, Ytrain, cv=5, scoring='roc_auc', n_jobs=-1)
    clf.fit(Xtrain, Ytrain)
    yfit = clf.predict(Xtest)

    # ROC-AUC needs a continuous score. Feeding it the hard 0/1 predictions
    # collapses the curve to a single operating point and is not comparable
    # with the 'roc_auc' scorer used for cross-validation above.
    classes = list(clf.classes_)
    if 1 in classes:
        yscore = clf.predict_proba(Xtest)[:, classes.index(1)]
    else:
        # The positive class never occurred in training: score it as 0.
        yscore = np.zeros(len(Xtest))

    return (
        cvs.mean() - cvs.std(),
        metrics.roc_auc_score(Ytest, yscore),
        metrics.precision_score(Ytest, yfit, zero_division=0),
        metrics.recall_score(Ytest, yfit, zero_division=0),
    )

In [48]:
# parameter settings
# Candidate values for each stage of the pipeline.

# Values of the `type` argument understood by preprocess / calculateoutliers.
types = ['scatter', 'spatstat', 'moran', ]
# 'diagonal' selects the 8-neighbour sharpening kernel in preprocess; any
# other value (here 'direct') selects the 4-neighbour kernel.
sharpening = ['direct', 'diagonal']
# Number of consecutive layers averaged into one neighbourhood value in
# calculateoutliers.
windowsizes = [1, 3, 5, 7]
# 'grid' selects the uniform 3x3 neighbourhood kernel; any other value (here
# 'euclidean') selects the centre-weighted 3x3 kernel.
neighbourhoodSetting = ['grid', 'euclidean']
# Histogram bin counts - presumably meant as the `buckets` argument of encode
# (TODO confirm against the search loop).
bins = [30, 60, 90, 120, 150]
# 'row' runs preprocessing.normalize(axis=1) on the histograms in encode;
# 'none' matches no branch there, so the histograms stay unchanged.
histnormalise = ['none', 'row']
# n_neighbors values for the k-NN classifier in classify.
k = [2,3,4,5]

In [49]:
# Now to tie it all together...
# Exhaustive search over all parameter combinations: preprocess -> outlier
# scores -> histogram features -> k-NN. Each stage is computed once per
# combination of the settings it depends on and reused by the inner loops.
columns = ['type', 'sharpening', 'windowSize', 'neighbourhood', 'buckets', 'histnormalise', 'k-nearest', 'cv-auc', 'test-auc', 'test-precision', 'test-recall']
# Explicit integer labels: the 'GOOD'/'LOF' -> 0/1 replacement can leave the
# column with object dtype, which scikit-learn does not accept as a target.
Ytrain = trainobjectinfo["Label"].to_numpy().astype(int)
Ytest = testobjectinfo["Label"].to_numpy().astype(int)
# Rows are collected in a list; growing a DataFrame with pd.concat per row
# copies all previous rows every time.
rows = []
results = pd.DataFrame(columns=columns)
# `method` rather than `type`, so the builtin is not shadowed in the notebook.
for method in types:
    for sharpSetting in sharpening:
        print(datetime.now().strftime("%H:%M:%S"), "Processing type: ", method, " sharpening: ", sharpSetting)
        trainpreprocessed = preprocess(trainobjects, method, sharpSetting, trainmask)
        testpreprocessed = preprocess(testobjects, method, sharpSetting, testmask)
        for windowsize in windowsizes:
            for nSetting in neighbourhoodSetting:
                print(datetime.now().strftime("%H:%M:%S"), "Processing windowSize: ", windowsize, " neighbourhood: ", nSetting)
                trainoutliers = calculateoutliers(trainpreprocessed, method, nSetting, windowsize)
                testoutliers = calculateoutliers(testpreprocessed, method, nSetting, windowsize)
                for histnorm in histnormalise:
                    # The grid of bin counts is defined as `bins`; the name
                    # `buckets` used here before is not defined anywhere in
                    # the notebook and only worked with stale kernel state.
                    for bucket in bins:
                        print(datetime.now().strftime("%H:%M:%S"), "Processing histnorm: ", histnorm, " bins: ", bucket)
                        Xtrain, minval, maxval = encode(trainoutliers, method, bucket, histnorm)
                        # Test histograms reuse the training range so both share the same bins.
                        Xtest, _, _ = encode(testoutliers, method, bucket, histnorm, minval=minval, maxval=maxval)
                        for n_neighbors in k:
                            cvscore, score, precision, recall = classify(Xtrain, Ytrain, Xtest, Ytest, n_neighbors)
                            rows.append([method, sharpSetting, windowsize, nSetting, bucket, histnorm, n_neighbors, cvscore, score, precision, recall])
                        # Checkpoint after every bin count so partial results survive an interruption.
                        results = pd.DataFrame(rows, columns=columns)
                        results.to_csv('out14.csv', index=False, header=True)


09:10:17 Processing type:  scatter  sharpening:  direct
09:10:19 Processing windowSize:  1  neighbourhood:  grid
09:10:36 Processing histnorm:  none  bins:  30
09:10:39 Processing histnorm:  none  bins:  60
09:10:41 Processing histnorm:  none  bins:  90
09:10:42 Processing histnorm:  none  bins:  120
09:10:44 Processing histnorm:  none  bins:  150
09:10:45 Processing histnorm:  row  bins:  30
09:10:46 Processing histnorm:  row  bins:  60
09:10:48 Processing histnorm:  row  bins:  90
09:10:49 Processing histnorm:  row  bins:  120
09:10:50 Processing histnorm:  row  bins:  150
09:10:52 Processing windowSize:  1  neighbourhood:  euclidean
09:11:09 Processing histnorm:  none  bins:  30
09:11:10 Processing histnorm:  none  bins:  60
09:11:12 Processing histnorm:  none  bins:  90
09:11:13 Processing histnorm:  none  bins:  120
09:11:14 Processing histnorm:  none  bins:  150
09:11:15 Processing histnorm:  row  bins:  30
09:11:17 Processing histnorm:  row  bins:  60
09:11:18 Processing histnor