In [39]:
from sklearn import tree
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, LeaveOneOut, train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn import neighbors, metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import copy
from matplotlib import cm, colors
import cv2
import warnings
import collections


# Takes a 2D numpy array as input (i.e. a numpy representation of an image)
# Different from tree since the empty ratio is different. Could implement a generic
# solution down the line
def processNextHouseImage(image): 
    objectwidth = 100
    objectheight = 100
    xspacing = 116
    yspacing = 300
    xstart = 293
    ystart = 445
    xend = 1730
    yend = 1770
    powderthickness = 80
    objectCoordinates = [[x, x+objectwidth, y, y+objectheight] for y in reversed(range(
        ystart, yend, objectheight + yspacing)) for x in range(xstart, xend, xspacing + objectwidth)]
    # Rest NYI

# Returns a list of images of each object, with the background filtered out
def processNextTreeImage(image):
    objectwidth = 83
    objectheight = 122
    xspacing = 133
    yspacing = 270
    xstart = 293
    ystart = 268
    xend = 1730
    yend = 1770

    # objectinfo = pd.read_csv('Parameters.csv', names=["Object", "P", "S", "H", "Porosity", "Label"])

    objectCoordinates = [[x, x+objectwidth, y, y+objectheight] for y in reversed(range(
        ystart, yend, objectheight + yspacing)) for x in range(xstart, xend, xspacing + objectwidth)]
    # coorddf = pd.DataFrame(objectCoordinates, columns=['xstart', 'xend', 'ystart', 'yend'])
    # objectinfo = coorddf.join(objectinfo)

    objects = np.full((len(objectCoordinates), objectheight, objectwidth), np.nan)

    for index, object in objectCoordinates.iterrows():
        objects[index] = image[object.ystart:object.yend, object.xstart:object.xend]

    # for object in objects:
    #     object[~mask] = np.nan

    return objects

class backgroundRemover:
    def __init__(self, mask):
        self.mask = mask

    def removeBackground(self, images):
        rtn = np.copy(images)
        for object in rtn:
            object[~self.mask] = np.nan
        return rtn

# Conducts preprocessing. OldObjectLayers is a list of objects, with each object being a 3d numpy array
# Lol, ta bort preprocess...
# Ok så basically, vill

class outlierCalculator:
    def __init__(self, type, neighbourhoodDistance, windowSize):
        self.type = type
        self.nbhd = neighbourhoodDistance
        self.ws = windowSize
        self.currentLayer = -1
        self.init = False

    def reset(self):
        self.init = False
        self.currentLayer = -1

    def calculate(self, objects):
        outlierValues = []
        for idc, objectLayers in enumerate(objects):
            objectLayers = np.copy(objectLayers)
            if self.type == 'moran':
                avg = np.nanmean(objectLayers)
                stddev = np.nanstd(objectLayers)
                objectLayers = (xs - avg) / stddev
            index = 0
            z, y, x = objectLayers.shape
            # Step 1: calculate neighbourhood
            neighbourkernel = np.ones((self.nbhd, self.nbhd)) / self.nbhd**2
            flatNeighbourhood = np.array([cv2.filter2D(src=layer, ddepth=-1, kernel=neighbourkernel) for layer in objectLayers])
            neighbourhoodValues = np.array([
                np.sum(flatNeighbourhood[layerIndex-self.ws:layerIndex], axis=0)/self.ws
                for layerIndex in range(self.ws, z+1)
            ])
            # Step 2: calculate outlier
            # This is different from batch processing (we're moving the center)

            ys = neighbourhoodValues
            xs = objectLayers[self.ws-1:z+1]
            filter = np.logical_and(np.isfinite(xs), np.isfinite(ys))

            numberOfFilterValues = len(np.unique(filter))
            assert numberOfFilterValues == 2, f"Expected filter to have two values, got: {numberOfFilterValues}"
            if self.type == 'spatstat':
                outliers = xs - ys
                avg = np.mean(outliers[filter])
                std = np.std(outliers[filter])
                outliers = (outliers - avg) / std
                outlierValues.append(outliers)
            else:
                with warnings.catch_warnings():
                    line = np.polyfit(xs[filter].flatten(), ys[filter].flatten(), 1)
                    p = np.poly1d(line)
                    outlierValues.append(p(xs) - ys)
                assert(xs.shape == p(ys).shape)
            assert(len(np.unique(outlierValues[index])) > 1)
            assert(len(np.unique(np.isfinite(outlierValues[index]))) == 2)
            # if(idc == 0 or idc == 1 or idc == 2):
            #     plt.hist(outlierValues[idc][0].flatten())
            #     plt.title("Histogram of outlier values for object " + str(idc))
            #     plt.figure()
        return outlierValues

class encoder:
    def __init__(self, noOfBins, minval=0, maxval=0):
        self.min = minval
        self.max = maxval
        self.buckets = noOfBins
        self.noOfLayers = 0
    
    # Vad returnerar den här? Ett histogram för hela objektet hittils.
    def encode(self, outlierobjects):
        self.noOfLayers += 1
        numberOfObjects = len(outlierobjects)
        # If not locked, store all values and recalculate histogram
        # If locked, store only histogramstuff
        # What do we emit here? I'd say we emit the complete histogram
        # Why raw? What I doing?
        returnData = np.zeros((numberOfObjects, self.buckets))
        for index in range(0, numberOfObjects):
            xs = outlierobjects[index]
            filter = np.isfinite(xs)
            hist, edges = np.histogram(xs[filter], bins=self.buckets, range=(self.min, self.max), density=True)
            returnData[index] = hist

        return returnData
    
class blockAccumulator:
    def __init__(self, windowSize, windowOffset):
        self.ws = windowSize
        self.wo = windowOffset
        self.init = False
        self.currentIndex = 0
        self.data = []

    def reset(self):
        self.init = False
        self.currentIndex = 0
        self.data = []
    
    # Nextlayer är ett lager från varje objekt i en lista
    def next(self, nextLayer):
        for index, objlayer in enumerate(nextLayer):
            if not self.init:
                (x, y) = np.shape(objlayer)
                self.data.append(np.empty((self.ws, x, y)))
            self.data[index][self.currentIndex % self.ws] = objlayer
        self.currentIndex += 1
        self.init = True
        if (self.currentIndex % self.wo == 0 and self.currentIndex >= self.ws):
            # Returnera de ws senaste lagren, roterat så att senaste lager ligger sist
            return [np.roll(obj, -(self.currentIndex+1 % self.ws), axis=0) for obj in self.data]
        else:
            return False


class accumulator:
    def __init__(self, windowSize, windowOffset):
        self.ws = windowSize
        self.wo = windowOffset
        self.data = collections.deque(maxlen=windowSize)
        self.currentIndex = 0

    def reset(self):
        self.data = collections.deque(maxlen=self.ws)
        self.currentIndex = 0
    
    def next(self, nextData):
        self.data.append(nextData)
        self.currentIndex += 1
        if (self.currentIndex % self.wo) == 0:
            return np.sum(self.data, axis=0)
        else:
            return False
        
    

def classify(Xtrain, Ytrain, Xtest, Ytest, n_neighbors, ):
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform", n_jobs=-1)
    scaler = StandardScaler()
    clf = Pipeline([('scaler', scaler), ('classifier', clf)])
    cvs = cross_val_score(clf, Xtrain, Ytrain, cv=5, scoring='roc_auc', n_jobs=-1)
    clf.fit(Xtrain, Ytrain)
    yfit = clf.predict_proba(Xtest)[:,1]
    return cvs.mean(), metrics.roc_auc_score(Ytest, yfit)

In [40]:
import pathlib

# Returns a list of 
def readRawTreeImages():
    objectwidth = 83
    objectheight = 122
    xspacing = 133
    yspacing = 270
    xstart = 293
    ystart = 268
    xend = 1730
    yend = 1770
    endlayer = 187

    paths = pathlib.Path('./OT data 80 um/int').glob('*.tif')
    paths_sorted = [x for x in paths]
    paths_sorted.sort()
    block = np.array([np.array(plt.imread(path)) for path in paths_sorted])
    integrals = block[0:endlayer]

    del paths_sorted

    objectCoordinates = [[x, x+objectwidth, y, y+objectheight] for y in reversed(range(
        ystart, yend, objectheight + yspacing)) for x in range(xstart, xend, xspacing + objectwidth)]
    coorddf = pd.DataFrame(objectCoordinates, columns=['xstart', 'xend', 'ystart', 'yend'])

    objects = np.full((len(coorddf), endlayer, objectheight, objectwidth), np.nan)

    for index, object in coorddf.iterrows():
        objects[index] = integrals[:, object.ystart:object.yend, object.xstart:object.xend]
    
    return objects

# objects är en 4d numpy array
def makeMask(objects, emptyRatio):
    aggregate = np.sum(objects, axis=(0,1))
    limit = np.percentile(aggregate, emptyRatio)
    mask = aggregate >= limit
    return mask

def getLabels():
    objectinfo = pd.read_csv('Parameters.csv', names=["Object", "P", "S", "H", "Porosity", "Label"])
    objectinfo.replace('GOOD', 0, inplace=True)
    objectinfo.replace('LOF', 1, inplace=True)
    return np.array(objectinfo.loc[:,"Label"])

In [41]:
from sklearn.model_selection import train_test_split
# Training cell

# High level picture
# We can batch prepare

# Start with the training
emptyRatio = 47
objectwidth = 83
objectheight = 122
layers = 187

odtype = 'scatter'
neighbourhoodDistance = 3
neighbourhoodZ = 1
windowSize = layers
windowOffset = layers
noOfBins = 20
n_neighbors = 3
# Fraction, 0-1
test_size = 0.4
hsegments = [0,26,50,74,98,122]
mask = np.zeros((objectwidth, objectheight))

allImages = readRawTreeImages()
mask = makeMask(allImages, emptyRatio)
allImages = allImages[(np.arange(len(allImages))!=20) & (np.arange(len(allImages))!=27)]
labels = getLabels()
labels = np.int16(labels[(np.arange(len(labels))!=20) & (np.arange(len(labels))!=27)])
X_train, X_test, y_train, y_test = train_test_split(allImages, labels, test_size=test_size)
X_train = allImages
y_train = labels
# trainObjects = allImages[[1,2,3,5,6,8,9,11,12,14,15,17,18,21,23,24,26], :, :, :]
# testObjects = allImages[[0,4,7,10,13,16,19,22,25], :, :, :]

# testMask = np.array([1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0])
# trainMask = testMask == 0
# trainMask[20] = False
# trainMask[27] = False
# ytrain = np.extract(trainMask, labels)
# ytest = np.extract(testMask, labels)

# Todo: Uncomment this line
# mask = makeMask(X_train, emptyRatio)

bgr = backgroundRemover(mask)
ab = blockAccumulator(windowSize, windowOffset)
oc1 = outlierCalculator(odtype, neighbourhoodDistance, neighbourhoodZ)
# a1 = accumulator(windowSize, windowOffset)


# Hur skapar vi masken? Vad är lättast?
# Dela inläsningen i två steg: en som bara delar, en som tar bort bakgrund. Skicka mask som input till dess constructor


allTrainingHists = []

# images är en lista av bildstackar?
# hur gör vi splitten här?
# - per föremål
# - per segment
# - per både våg- och horizontella segment
# Med all sannolikhet kommer vi ändra det senare, eller fokusera på att lära mellan geometrier
# lägst effort: lägg åt sidan 28*0.33 objekt som inte är med för att ha en prototyp uppe
# högst intresse: använd alla trädbilder för träning, använd alla husbilder för utvärdering, sätt automatisk gräns för
# bakgrund mha typ clustering
# medelväg: behandla bakgrundseliminering utanför streamingapproachen
# OK, steg 1: enbart träd. Ta 0.33 av dem och bara sätt åt sidan. Välj vilka själv. Så images är en lista av bilder...

# Ändra indexering ifrån [objektnummber, lager, x, y] till [lager, objektnummer, x, y]
def makesegments(image, hsegments):
    segments = []
    for obimg in image:
        for i in range(len(hsegments)-1):
            segments.append(obimg[hsegments[i]:hsegments[i+1],:])
    return segments

trainObjects = np.moveaxis(X_train, 1, 0)
testObjects = np.moveaxis(X_test, 1, 0)

allOcvals = []
allPresValues = []
i = 0
for image in trainObjects:
    # Ta bilden, omvandla till en array av bilder (en bild per obj)
    # Vilken struktur behöver preprocess? Lista med numpy array
    # image är 26x122x83
    # segments ska vara (26*5)x122x83
    i +=1 
    noback = bgr.removeBackground(image)
    segments = makesegments(noback, hsegments)
    segmentblocks = ab.next(segments)
    if(segmentblocks is False):
        continue
    # plt.hist(trainoutliers[0][1].flatten(), bins=bincount, range=(minval, maxval))
    ocvals = oc1.calculate(segmentblocks)
    allOcvals.append(ocvals)
    # if(i == 0 or i == 1 or i == 2):
    #     plt.imshow(pres[0])
    #     plt.figure()
    #     plt.hist(pres[0].flatten())
    #     plt.yscale('log')
    #     plt.figure()
    # ocVals: List[layers][objects][matrix]
ocmin = np.nanmin(np.concatenate([np.concatenate([o.flatten() for o in oo]) for oo in allOcvals]))
ocmax = np.nanmax(np.concatenate([np.concatenate([o.flatten() for o in oo]) for oo in allOcvals]))
e1 = encoder(noOfBins, ocmin, ocmax)

for ocvals in allOcvals:
    if len(ocvals) > 0:
        # Print a chart here
        hists = e1.encode(ocvals)
        for acc in hists:
            allTrainingHists.append(acc)

In [46]:
(ocmin, ocmax)

(-10754.713146378836, 10105.027903301081)

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.87809186e-08,
       2.32883390e-06, 2.70464008e-04, 5.89345224e-04, 9.06367129e-05,
       5.54037097e-06, 4.13180208e-07, 3.75618371e-08, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [47]:
repetitions = len(allTrainingHists) / len(y_train)
knn = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform", n_jobs=-1)
scaler = StandardScaler()
clf = Pipeline([('scaler', scaler), ('classifier', knn)])
ytrain2 = np.tile(y_train.astype('int'), [np.int32(repetitions)])
clf.fit(allTrainingHists, ytrain2)

In [48]:
# Testing cell
from sklearn.metrics import f1_score, roc_auc_score
oc1.reset()
y_test = y_test.astype('int')

testx = []
predictions = []
letsPrint = False
for image in testObjects:
    # Start of timing
    noback = bgr.removeBackground(image)
    ocvals = oc1.calculate(pres)
    if len(ocvals) > 0:
        if(letsPrint):
            plt.imshow(image[0])
            plt.title("background 0")
            plt.figure()
            plt.imshow(image[1])
            plt.title("background 1")
            plt.figure()
            plt.imshow(noback[0])
            plt.title("nobackground 0")
            plt.figure()
            plt.imshow(noback[1])
            plt.title("nobackground 1")
            plt.figure()
            plt.imshow(pres[0])
            plt.title("pres 0")
            plt.figure()
            plt.imshow(pres[1])
            plt.title("pres 1")
            plt.figure()
            plt.imshow(ocvals[0])
            plt.title("ocvals 0")
            plt.figure()
            plt.imshow(ocvals[1])
            plt.title("ocvals 1")
            plt.figure()
            plt.hist(ocvals[0].flatten(), bins=noOfBins, range=(ocmin, ocmax))
            plt.yscale('log')
            plt.figure()
            plt.hist(ocvals[1].flatten(), bins=noOfBins, range=(ocmin, ocmax))
            plt.yscale('log')
            plt.figure()
            break
        hists = e1.encode(ocvals)
        pred = clf.predict_proba(hists)[:,1]
        # End of timing
        testx.append(hists)
        predictions.append(pred)

# investigate accuracy etc here. Potentially batch it based on layer
ysvalue = []
for index, o in enumerate(predictions):
    print(roc_auc_score(y_test, o))
    ysvalue.append(roc_auc_score(y_test, o))
plt.plot(np.arange(0, len(ysvalue)), ysvalue)

NameError: name 'a1' is not defined

In [55]:
print(predictions)

[array([0.33333333, 0.        , 0.33333333, 0.        , 0.33333333,
       0.        , 0.33333333, 0.33333333, 0.        , 0.        ,
       0.        ]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0.        , 0.        , 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.33333333, 0.        , 0.        ,
       0.        ]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0.        , 0.        , 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), arra

: 