In [None]:
# Experiments to run
# 1: Show the problem in treating images as independent
# 1.1 Use the C set. Mix the images randomly, establish a training/validation/test set (60%/20%/20%)
# Show that just picking images randomly is a bad idea, because our algorithm will learn to generalise
# from the object/location to the class, instead of interesting attributes
# 1.2 Use the C set. Establish a training/validation/test set (60%/20%/20%), but assign objects instead of images

# 2: Show the effect on generalisation between geometries/builds
# 2.1 Same as 1.2
# 2.2 Same as 2.1, but include the A and B set in the training data

# 3: Show the effect of using corners (or not)
# 3.1 Same as 1.2
# 3.2 Use corners, show the result is worse. There is a need to account for the location of the outlier - outliers near the edge are expected.
# Can we do some form of higher-order outlier detection? (e.g. if there are 2 outliers, and they are close to each other, they are probably not outliers)
# Realistically, because of the size of the edges that may be tricky. 
# Before making grandiose conclusions, validate that the outlier value image makes sense and that it's not some bug or arithmetic error because of my method.
# Never assume the code works :)

# 4: For the best performing setting, show the execution time/accuracy tradeoff and do the cost calculations

# Each cell should (ideally) be self-contained, to facilitate re-running parts of the experiments

In [None]:
# 1.1
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
import time
# Settings for a single pipeline run
threshold = 10000  # placeholder; the proper value is still to be determined
windowSize = windowAdvance = 1
odtype = 'moran'
neighbourhoodDistance = neighbourhoodZ = 1
emptyRatioB = 30

# Hyperparameter grids iterated over by the search loops further down
neighbourhoodDistances = [1, 3, 5]
binOptions = [10, 20, 40]

# Constants
bend = 225  # upper bound used when the label windows are generated
objectwidth = objectheight = 100
# Aliases of the window settings, used when building the label windows
twindowSize, twindowAdvance = windowSize, windowAdvance

%run classes.ipynb

X = readCSet().reshape(-1, objectwidth, objectheight)
labelRanges = [(i, i+twindowSize) for i in range(0, bend-twindowSize+1, twindowAdvance)]
y = [getLabelsC(objIndex, 0.5, start, end) for (start, end) in labelRanges for objIndex in range(1,26)]

# Show images, ensure they look all right
plt.imshow(X[0])
plt.figure()
plt.imshow(X[-1])
plt.figure()
# Do the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)
# Cross-validated hyperparameter search on the training part of the split.
# NOTE: this cell is still work in progress; the TODO comments below describe
# the steps that remain to be implemented.
skf = StratifiedKFold(n_splits=5)

# train_test_split hands back plain lists when it is given a list, and a list
# cannot be indexed with the integer index arrays produced by the splitter.
y_train_arr = np.asarray(y_train)

# Candidate models: KNN, random forest, DT, LR?
for nd in neighbourhoodDistances:
    for bo in binOptions:  # TODO: `bo` is not used yet (histogram bin count)
        # TODO: add more hyperparameters
        cvalscores = []
        # The folds must be generated from the training data itself, because
        # the returned indices are used to index X_train / y_train_arr.
        for train, validation in skf.split(X_train, y_train_arr):
            crosstrainX = X_train[train]
            crossvalX = X_train[validation]
            crosstrainY = y_train_arr[train]
            crossvalY = y_train_arr[validation]
            # Train with a bunch of different hyperparameters, with
            # cross-validation on 60% of the data
            calc_threshold = np.percentile(crosstrainX, 30)
            bgr = backgroundRemover2(calc_threshold)
            #ab = blockAccumulator(windowSize, windowAdvance)
            # NOTE(review): the first argument was missing here (a syntax
            # error). `odtype` is assumed from the order of the parameter
            # declarations (odtype, neighbourhoodDistance, neighbourhoodZ) --
            # confirm against outlierCalculator's signature. The second
            # argument is now the loop value `nd` rather than the whole list.
            oc1 = outlierCalculator(odtype, nd, neighbourhoodZ)
            # TODO: adapt the implementation to take single images instead.
            # Basically:
            #   - Compute the ocval for every image in the training set
            #   - Compute the min and max of the ocvals
            #   - Create the histograms for the training set
            #   - Run the ML models (can be made more efficient by only
            #     iterating over their hyperparameters here)
            #   - Put the fold's result in cvalscores (it will need to be
            #     extended to hold more ML models...)
        # TODO: compute the average performance for this set of
        # hyperparameters and save it.


# TODO: take the best set of hyperparameters for each model and run it on the
# test set.


def test(windowSize, windowAdvance):
    """Run the fitted pipeline over the test data and score each window.

    Uses names from the surrounding notebook namespace rather than
    parameters: ``oc1`` (outlier calculator), ``e1`` (encoder), ``clf``
    (fitted classifier), ``X_test``, ``y_test`` and ``emptyRatioB``.

    Args:
        windowSize: Window size handed to ``blockAccumulator``.
        windowAdvance: Window advance handed to ``blockAccumulator``.

    Returns:
        A tuple ``(ysvalue, times, predictions, y_test)`` where ``ysvalue``
        holds one ROC AUC score per prediction batch, ``times`` the elapsed
        seconds for each batch that produced a prediction, ``predictions``
        the positive-class probabilities per batch, and ``y_test`` the
        labels the scores were computed against.
    """
    oc1.reset()
    ab = blockAccumulator(windowSize, windowAdvance)

    # X_test = readRawHouseImages()
    # Original author's note: the shape of testObjects is
    # layers x objects x width x height.
    # NOTE(review): that implies a 4-D X_test (objects first) -- confirm that
    # the X_test in scope has this layout before running.
    testObjects = np.moveaxis(X_test, 1, 0)
    maskB = makeMaskB(X_test, emptyRatioB)
    bgrb = backgroundRemover(maskB)

    predictions = []
    times = []
    for image in testObjects:
        # Start of timing. perf_counter is a monotonic, high-resolution
        # clock, which makes it suited to measuring short intervals.
        start_time = time.perf_counter()
        noback = bgrb.removeBackground(image)
        segmentblocks = ab.next(noback)
        if segmentblocks is False:
            # The accumulator returned no block for this step; skip it.
            continue
        ocvals = oc1.calculate(segmentblocks)
        hists = e1.encode(ocvals)
        pred = clf.predict_proba(hists)[:, 1]
        times.append(time.perf_counter() - start_time)
        # End of timing
        predictions.append(pred)

    # investigate accuracy etc here. Potentially batch it based on layer
    # y_test = [getLabelsB(objIndex, 0.5, start, end) for (start, end) in labelRanges for objIndex in range(1,29)]
    # Score each batch against the matching run of labels. The slice width
    # follows the length of the prediction instead of a fixed 25, so data sets
    # with a different number of objects per batch are handled as well.
    ysvalue = []
    offset = 0
    for pred in predictions:
        count = len(pred)
        ysvalue.append(roc_auc_score(y_test[offset:offset + count], pred))
        offset += count
    return ysvalue, times, predictions, y_test