In [None]:
import skimage
import skimage.io as io
import scipy.ndimage as ndi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn import metrics
from scipy.optimize import minimize_scalar
import cv2
from sklearn.model_selection import train_test_split

# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated since IPython 7.14.
from IPython.display import display, HTML
# Make scrollable output areas taller (60em) so long outputs are easier to read.
display(HTML("<style>div.output_scroll { height: 60em; }</style>"))

# When True, data_prep() rescales every image before centring it
# (and show_plot() skips the PCA scatter plot).
SCALE = False
# Number of rows in each random evaluation subset drawn by get_subsets().
TEST_SIZE = 5000

# conserve_memory=False keeps every loaded image cached instead of only one at a time.
ic = io.imread_collection('data/*.png', conserve_memory=False)
print(len(ic))
#io.imshow_collection(ic)

In [None]:
# Ground-truth cluster labels, used only to score the clustering.
# NOTE(review): labels are paired with the images purely by position when
# initial_df is built -- confirm the CSV row order matches the order in which
# imread_collection loads the files.
correct_df = pd.read_csv('data/@0CLUSTERING.csv')
print(correct_df)

# One label per CSV row, in file order.
correct = [label for label in correct_df["Cluster"]]

In [None]:
# Working table: column 0 holds the images, column 1 the ground-truth cluster
# label taken from `correct` (paired with the images by position).
initial_df = pd.DataFrame(dtype=object)
initial_df[0] = ic
initial_df[1] = correct

# Replace every image in column 0 with the result of skimage.color.rgb2gray.
initial_df[0] = pd.DataFrame(initial_df.apply(lambda row: skimage.color.rgb2gray(row[0]), axis=1))

In [None]:
def data_prep(df, scale):
    """Centre, pad and flatten the images stored in column ``0`` of ``df``, in place.

    Every image goes through these steps:

    1. (only when ``scale`` is true) zoom by ``sqrt(max_sum / own_sum)`` along
       each axis, where ``max_sum`` is the largest pixel sum in the set;
    2. pad so that the rounded intensity-weighted centre of mass sits on the
       middle pixel;
    3. pad symmetrically up to the largest height and width in the set;
    4. flatten to a 1-D vector.

    All padding uses the constant value 1.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Column ``0`` must hold 2-D arrays; it is overwritten with the
        flattened, equally sized vectors. Other columns are left alone.
    scale : bool
        Whether to run step 1.

    Returns
    -------
    None
        The frame is modified in place. An empty frame is left untouched.
    """
    if len(df) == 0:
        # Nothing to prepare; max() below would fail on an empty sequence.
        return

    def pad_with_ones(img, top, bot, left, right):
        # Constant border of 1.0 around a 2-D image.
        return np.pad(img, ((top, bot), (left, right)),
                      mode="constant", constant_values=1.)

    def update_images(func):
        # dtype=object keeps one array per cell even when all shapes are equal.
        df[0] = pd.Series([func(img) for img in df[0]], index=df.index, dtype=object)

    if scale:
        # Only needed for scaling, so it is not computed otherwise.
        max_sum = max(ndi.sum_labels(img) for img in df[0])

        def img_scale(img):
            my_sum = ndi.sum_labels(img)
            if my_sum == 0:
                # No mass to normalise against; leave the image as it is.
                return img
            return ndi.zoom(img, np.sqrt(max_sum / my_sum), cval=1.)

        update_images(img_scale)

    def img_pre(img):
        # A zero-sum image has no defined centre of mass (NaN/inf); skip centring.
        with np.errstate(invalid="ignore", divide="ignore"):
            cy, cx = ndi.center_of_mass(img)
        if not (np.isfinite(cy) and np.isfinite(cx)):
            return img
        # int(...) guarantees plain integers for the pad widths.
        cy = int(round(cy))
        cx = int(round(cx))
        sy, sx = img.shape
        # Pad only the shorter side so the centre ends up with the same number
        # of rows/columns on either side.
        top = max(sy - 1 - cy - cy, 0)
        bot = max(cy - (sy - 1 - cy), 0)
        left = max(sx - 1 - cx - cx, 0)
        right = max(cx - (sx - 1 - cx), 0)
        return pad_with_ones(img, top, bot, left, right)

    update_images(img_pre)

    max_shape_y = max(img.shape[0] for img in df[0])
    max_shape_x = max(img.shape[1] for img in df[0])

    def img_post(img):
        sy, sx = img.shape
        # When the difference is odd, the extra row/column goes to bottom/right.
        top = (max_shape_y - sy) // 2
        bot = (max_shape_y - sy + 1) // 2
        left = (max_shape_x - sx) // 2
        right = (max_shape_x - sx + 1) // 2
        return pad_with_ones(img, top, bot, left, right)

    update_images(img_post)

    update_images(lambda img: np.reshape(img, -1))

# Mutates initial_df in place: column 0 ends up holding flattened 1-D vectors,
# so rebuild initial_df before running this cell again.
data_prep(df=initial_df, scale=SCALE)



In [None]:
def get_subsets(df, test_size=None):
    """Draw two random row subsets of ``df``.

    The index is shuffled once; the first subset takes the first ``test_size``
    labels and the second takes the last ``test_size`` labels. The subsets are
    disjoint while ``2 * test_size <= len(df)`` and overlap otherwise. If
    ``test_size`` exceeds ``len(df)``, both subsets contain every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    test_size : int, optional
        Rows per subset. Defaults to the module-level ``TEST_SIZE``.

    Returns
    -------
    list of pandas.DataFrame
        ``[first_subset, second_subset]``, rows in random order.

    Raises
    ------
    ValueError
        If ``test_size`` is negative.
    """
    if test_size is None:
        test_size = TEST_SIZE
    if test_size < 0:
        raise ValueError(f"test_size must be non-negative, got {test_size}")

    ind = list(df.index)
    np.random.shuffle(ind)
    ind1 = ind[:test_size]
    # Clamp the start at 0: a negative start would count from the end and
    # silently shrink the second subset when test_size > len(df).
    ind2 = ind[max(len(ind) - test_size, 0):]
    # Slices of a shuffled list are already in random order; no re-shuffle needed.
    return [pd.DataFrame(df, index=ind1, copy=False),
            pd.DataFrame(df, index=ind2, copy=False)]

def get_subset(df):
    """Return one random subset of ``df``: the first of the pair from get_subsets()."""
    first, _second = get_subsets(df)
    return first

In [None]:
def clustering(eps, data):
    """Cluster ``data`` with DBSCAN and return the label of every sample.

    ``min_samples=1`` is passed to DBSCAN together with the given ``eps``.
    """
    model = DBSCAN(eps=eps, min_samples=1)
    return model.fit(data).labels_

In [None]:
def show_plot(data_x, y):
    """Scatter-plot ``data_x`` projected onto its first two principal components.

    Points are coloured by the labels in ``y``. When the module-level ``SCALE``
    flag is set, nothing is plotted and a short notice is printed instead.
    """
    if SCALE:
        print(f"{SCALE=}, plot not shown")
        return

    projected = PCA(n_components = 2).fit_transform(data_x)
    print(f"nr of classes: {len(set(y))}")
    # Labels are passed as strings so the colour is categorical.
    colours = [str(v) for v in y]
    figure = px.scatter(x=projected[:, 0], y=projected[:, 1], color=colours,
                        width=900, height=600)
    figure.show()

In [None]:
def objective(eps):
    """Mean adjusted Rand score of DBSCAN(``eps``) over two random subsets.

    The subsets come from ``get_subsets(initial_df)``, so every call draws
    fresh random rows and the returned value varies between calls.
    """
    scores = [
        metrics.adjusted_rand_score(
            list(subset[1]),                      # ground-truth labels
            clustering(eps, list(subset[0])),     # labels found by DBSCAN
        )
        for subset in get_subsets(initial_df)
    ]
    return np.mean(scores)

def show_objective_plot(up_bound, num):
    """Plot ``objective(eps)`` for ``num`` evenly spaced eps in [0.001, ``up_bound``].

    Parameters
    ----------
    up_bound : float
        Largest eps value to evaluate.
    num : int
        Number of eps values.
    """
    x = np.linspace(0.001, up_bound, num=num)
    # A plain loop evaluates the (expensive) objective exactly once per point;
    # np.vectorize without `otypes` makes an extra probing call on the first value.
    y = [objective(eps) for eps in x]
    fig = px.scatter(x=x, y=y, width=900, height=600,
                     labels={"x": "eps", "y": "mean adjusted Rand score"})
    fig.show()

# Sweep eps over [0.001, 4] at 100 points to see how the score behaves.
show_objective_plot(up_bound=4, num=100)


In [None]:
def check_clustering(eps):
    """Cluster one random subset with DBSCAN(``eps``) and report how well it did.

    Prints the Rand score and adjusted Rand score against the ground truth,
    plus an average of the adjusted score over five random subsets, and shows
    the ground-truth and found clusterings via show_plot().

    Parameters
    ----------
    eps : float
        DBSCAN neighbourhood radius to evaluate.
    """
    test_df = get_subset(initial_df)
    data = list(test_df[0])
    true_labels = list(test_df[1])
    res = clustering(eps, data)
    acc = metrics.adjusted_rand_score(true_labels, res)

    print("correct clustering")
    show_plot(data, true_labels)
    print("\n")

    print("found clustering")
    print(f"eps: {float(eps)}")
    print(f"ACCURACY: {metrics.rand_score(true_labels, res)}")
    print(f"BALANCE ACCURACY: {acc}")

    acc1, acc2 = objective(eps), objective(eps)

    # Each objective() value is already a mean over two subsets, so it is
    # counted twice; together with `acc` this averages five subset scores.
    print(f"AVERAGE BALANCED ACCURACY: {np.mean([acc1, acc1, acc2, acc2, acc])}")

    show_plot(data, res)


In [None]:
#print(initial_df)
def get_best_eps():
    """Search eps in [0.01, 4] for the value that maximises ``objective``.

    Uses scipy's bounded scalar minimiser on the negated objective. The
    objective draws new random subsets on every call, so the function being
    minimised is noisy and the result can differ between runs.
    """
    def negated_objective(eps):
        return -objective(eps)

    outcome = minimize_scalar(negated_objective, bounds=(0.01, 4), method="bounded")
    return outcome.x
#print(res)

# Runs the bounded eps search; every objective evaluation clusters two random subsets with DBSCAN.
best_eps = get_best_eps()


In [None]:
#from scipy.optimize import shgo, dual_annealing, differential_evolution, basinhopping

#for f in [dual_annealing, differential_evolution]:
#    res = f(lambda eps: -objective(eps), [(1.5, 3.5)], maxiter=100)
#    print(res)


In [None]:
# Report scores and plots for the eps found by the search.
check_clustering(eps=best_eps)


In [None]:
# Hard-coded reference value, presumably from an earlier run -- it is not recomputed here.
print("old balanced acc: 0.8369728801063364")