In [1]:
import os
import pickle
import threading
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import scipy.ndimage as ndi
import skimage
import skimage.io as io
from IPython.core.display import display, HTML
from scipy.optimize import minimize_scalar
from scipy.spatial.distance import pdist
from sklearn import cluster
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
# Inject CSS so that scrollable notebook output areas are 60em tall.
display(HTML("<style>div.output_scroll { height: 60em; }</style>"))

# Passed to data_prep() as its `scale` flag; show_plot() skips plotting when it is truthy.
SCALE = False
# Number of rows in each random subset drawn by get_subsets().
TEST_SIZE = 500

# Lower / upper bound of the DBSCAN `eps` range suggested to Optuna trials.
EPS_MIN = 0.01
EPS_MAX = 4
# Budget handed to study.optimize(timeout=...); Optuna documents this value in seconds.
TIMEOUT = 120

# Optional pinned values: when not None, the parameter is fixed through
# optuna.samplers.PartialFixedSampler instead of being searched.
Q = None
EPS = None


# Load every PNG under data/ as one image collection and report how many were found.
ic = io.imread_collection('data/*.png', conserve_memory=False)
print(len(ic))
#io.imshow_collection(ic)

7618


In [2]:
#For test
# Ground-truth table: one row per image file with its expected cluster name.
correct_df = pd.read_csv('data/@0CLUSTERING.csv')
print(correct_df)

# Plain Python list of the expected cluster label of every row, in file order.
correct = correct_df["Cluster"].tolist()

#correct

           Filename      Cluster
0       1_18-41.png        1like
1       1_18-96.png  @toolong129
2      1_18-174.png            T
3      1_19-204.png            S
4      1_19-229.png            r
...             ...          ...
7613  7_521-166.png            r
7614  7_521-241.png            S
7615  7_522-189.png            t
7616   7_523-19.png       0PLAMY
7617   7_523-95.png       0PLAMY

[7618 rows x 2 columns]


In [None]:
# Working frame: column 0 holds the images from `ic`, column 1 the expected
# cluster labels from `correct`.
# NOTE(review): images and labels are paired purely by position, which assumes the
# order of `ic` matches the row order of the CSV — confirm.
initial_df = pd.DataFrame(dtype=object)
initial_df[0] = ic
initial_df[1] = correct

# Replace every image in column 0 by the result of skimage.color.rgb2gray.
initial_df[0] = pd.DataFrame(initial_df.apply(lambda row: skimage.color.rgb2gray(row[0]), axis=1))

In [None]:
def data_prep(df, scale):
    """Turn the images in ``df[0]`` into equally sized, flattened vectors, in place.

    Steps, each overwriting ``df[0]``:
      1. (only if ``scale`` is truthy) zoom every image by
         ``sqrt(max_sum / own_sum)``, where the sums are the pixel sums of the images;
      2. pad every image with the constant 1.0 so that its rounded centre of mass
         ends up at the centre of the padded image;
      3. pad every image (again with 1.0) to the largest height / width found
         after step 2;
      4. flatten every image to a 1-D array.

    Parameters:
        df: frame whose column ``0`` holds 2-D image arrays; it is modified in place.
        scale: whether to run step 1.

    Returns:
        None.

    NOTE(review): the function is not safe to run twice on the same frame —
    after the first call column 0 holds 1-D arrays, while the helpers unpack
    two-element shapes.
    """
    #print(df.apply(lambda row: ndi.sum_labels(row[0]),axis=1))
    # Largest pixel sum over all images; only used by img_scale below.
    max_sum = max(df.apply(lambda row: ndi.sum_labels(row[0]),axis=1))
    #print(max_sum)
    
    def get_max_shape(df):
        # Largest height and largest width over the images currently in column 0.
        shape_y = max(df.apply(lambda row: row[0].shape[0], axis=1))
        shape_x = max(df.apply(lambda row: row[0].shape[1], axis=1))
        return shape_y, shape_x
    
    if scale:
        def img_scale(img):
            # Zoom factor grows as the image's own pixel sum falls below the maximum.
            # presumably meant to equalise the pixel sums across images — TODO confirm
            my_sum = ndi.sum_labels(img)
            return ndi.zoom(img, np.sqrt(max_sum/my_sum), cval=1.)

        df[0] = df.apply(lambda row: img_scale(row[0]), axis=1)
        #print(df.apply(lambda row: ndi.sum_labels(row[0]),axis=1))

    def img_pre(img):
        # Pad only on the side that is closer to the centre of mass, by exactly the
        # difference between the two distances, so the rounded centre of mass
        # becomes the middle row / column of the result.
        cy, cx = ndi.center_of_mass(img)
        cy = round(cy)
        cx = round(cx)
        sy, sx = img.shape
        top = max(sy - 1 - cy - cy, 0)
        bot = max(cy - (sy - 1 - cy), 0)
        left = max(sx - 1 - cx - cx, 0)
        right = max(cx - (sx - 1 - cx), 0)
        return cv2.copyMakeBorder(img, top, bot, left, right, cv2.BORDER_CONSTANT, None, value=1.)

    df[0] = df.apply(lambda row: img_pre(row[0]), axis=1)
    #print(get_max_shape(df))

    # Must be measured after img_pre, because that step changes the image sizes.
    max_shape_y, max_shape_x = get_max_shape(df)

    def img_post(img):
        # Split the missing rows / columns evenly between both sides; when the
        # difference is odd the extra one goes to the bottom / right.
        sy, sx = img.shape
        top = (max_shape_y - sy) // 2
        bot = (max_shape_y - sy + 1) // 2
        left = (max_shape_x - sx) // 2
        right = (max_shape_x - sx + 1) // 2
        return cv2.copyMakeBorder(img, top, bot, left, right, cv2.BORDER_CONSTANT, None, value=1.)

    df[0] = df.apply(lambda row: img_post(row[0]), axis=1)
    #print(get_max_shape(df))
    
    # Flatten every image into a 1-D vector.
    df[0] = df.apply(lambda row: np.reshape(row[0], -1), axis=1)

# Prepare the global frame in place; SCALE decides whether images are zoomed first.
data_prep(initial_df, SCALE)



In [None]:
def get_subsets(df, test_size=None):
    """Draw two random row subsets of ``df``.

    The index of ``df`` is shuffled with ``np.random.shuffle``; the first subset
    takes the leading ``test_size`` labels and the second the trailing
    ``test_size`` labels. The subsets are disjoint as long as ``df`` has at
    least ``2 * test_size`` rows.

    Parameters:
        df: frame to sample from.
        test_size: number of rows per subset; defaults to the module-level
            ``TEST_SIZE`` when omitted.

    Returns:
        A two-element list of DataFrames re-indexed to the chosen labels. When
        ``df`` has fewer than ``test_size`` rows, both subsets contain every row.
    """
    if test_size is None:
        test_size = TEST_SIZE

    ind = list(df.index)
    np.random.shuffle(ind)
    ind1 = ind[:test_size]
    # Clamp at 0: a negative start would be read as "count from the end" and
    # silently return a shorter tail when df has fewer than test_size rows.
    ind2 = ind[max(len(ind) - test_size, 0):]
    # Kept from the original so the global RNG stream is consumed identically.
    np.random.shuffle(ind1)
    np.random.shuffle(ind2)
    return [pd.DataFrame(df, index=ind1, copy=False),
            pd.DataFrame(df, index=ind2, copy=False)]

def get_subset(df):
    """Return the first of the two random subsets produced by get_subsets(df)."""
    first_subset = get_subsets(df)[0]
    return first_subset

In [None]:
def clustering(eps, q, data):
    """Run DBSCAN on ``data`` and return the label of every sample.

    Parameters:
        eps: neighbourhood radius passed to ``cluster.dbscan``.
        q: reciprocal of the order ``p`` passed to ``cluster.dbscan``; a value
            close to 0 selects ``p = inf``.
        data: samples to cluster, passed through unchanged.

    Returns:
        The second element of the ``cluster.dbscan`` result (the labels).

    Prints a start message and a finish message with the elapsed wall time.
    """
    print(f"started clustering({eps}, {q})")

    # q ~ 0 cannot be inverted, so it is mapped to the p = inf limit instead.
    p = np.inf if np.isclose(q, 0) else 1 / q
    if np.isclose(p, 2):
        # Snap to the exact integer 2 when p is only approximately 2.
        p = 2

    t_begin = time.time()
    labels = cluster.dbscan(data, eps=eps, p=p, min_samples=1)[1]
    elapsed = time.time() - t_begin
    print(f"clustering({eps}, {q}) finished, time: {elapsed}")
    return labels

In [None]:
def show_plot(data_x, y):
    """Show a 2-D PCA scatter plot of ``data_x`` coloured by the labels ``y``.

    When the module-level ``SCALE`` flag is truthy nothing is plotted and a
    short notice is printed instead.
    """
    if SCALE:
        print(f"{SCALE=}, plot not shown")
        return

    # Project the samples onto their first two principal components.
    projected = PCA(n_components=2).fit_transform(data_x)
    print(f"nr of classes: {len(set(y))}")

    # Labels are stringified so that plotly treats them as discrete categories.
    colours = [str(label) for label in y]
    figure = px.scatter(x=projected[:, 0], y=projected[:, 1], color=colours, width=900, height=600)
    figure.show()

In [None]:
def objective(eps, q, trial_number=None):
    """Score a DBSCAN parameter pair on two random subsets of ``initial_df``.

    For each subset returned by ``get_subsets(initial_df)`` the images
    (column 0) are clustered with ``clustering(eps, q, ...)`` and compared with
    the expected labels (column 1) using the adjusted Rand score.

    Parameters:
        eps: DBSCAN radius forwarded to ``clustering``.
        q: reciprocal Minkowski order forwarded to ``clustering``.
        trial_number: optional identifier; when given, the list of label arrays
            (one per subset) is pickled to ``"<trial_number>.pickle"`` in the
            working directory. When omitted nothing is written.

    Returns:
        The mean adjusted Rand score over the subsets.
    """
    acc = []
    found_labels = []
    for test_df in get_subsets(initial_df):
        data = list(test_df[0])
        correct = list(test_df[1])
        res = clustering(eps, q, data)
        found_labels.append(res)
        acc.append(metrics.adjusted_rand_score(correct, res))

    # The original line referenced an undefined `trial`, multiplied the bound
    # `format` method instead of calling it, and used the invalid mode "wb)".
    if trial_number is not None:
        with open("{}.pickle".format(trial_number), "wb") as fout:
            pickle.dump(found_labels, fout)
    return np.mean(acc)

def show_objective_plot(up_bound, num, q=0.5):
    """Plot ``objective(eps, q)`` for ``num`` evenly spaced ``eps`` values.

    Parameters:
        up_bound: largest ``eps`` to evaluate; the sweep starts at 0.001.
        num: number of ``eps`` values.
        q: value of ``q`` held fixed during the sweep. ``objective`` requires
            it; the original call omitted it and raised ``TypeError``.
    """
    x = np.linspace(0.001, up_bound, num=num)
    # otypes is given explicitly so np.vectorize does not make an extra probe
    # call to `objective` just to discover the output type.
    objective_vec = np.vectorize(objective, otypes=[float])
    y = objective_vec(x, q)
    fig = px.scatter(x=x, y=y, width=900, height=600)
    fig.show()

#show_objective_plot(4, 100)

In [None]:
def check_clustering(eps, q):
    """Cluster one random subset with ``(eps, q)``, print scores and plot it.

    Shows the expected clustering first, then prints the parameters, the Rand
    score and the adjusted Rand score of the found clustering, then an average
    that combines this run with ``objective(eps, q)``, and finally plots the
    found clustering.
    """
    sample = get_subset(initial_df)
    features = list(sample[0])
    truth = list(sample[1])
    found = clustering(eps, q, features)
    adjusted = metrics.adjusted_rand_score(truth, found)

    print("correct clustering")
    show_plot(features, truth)
    print("\n")

    print("found clustering")
    print(f"eps: {float(eps)}")
    print(f"q: {float(q)}")
    print(f"ACCURACY: {metrics.rand_score(truth, found)}")
    print(f"BALANCE ACCURACY: {adjusted}")

    # objective() already averages two subsets, so it is counted twice to give
    # each of the three runs the same weight.
    two_subset_mean = objective(eps, q)
    overall = np.mean([two_subset_mean, two_subset_mean, adjusted])

    print(f"AVERAGE BALANCED ACCURACY: {overall}")

    show_plot(features, found)


In [None]:
#test
#%time
#_, labels = cluster.dbscan(list(initial_df.loc[:10, 0]), eps=1, p=2, min_samples=1)


In [None]:
#check_clustering(2.3, 1/3) #test

In [None]:
def show_all_data():
    """Plot every sample of ``initial_df`` coloured by its expected label."""
    print("all data")
    samples, labels = (list(initial_df[column]) for column in (0, 1))
    show_plot(samples, labels)

show_all_data()

In [None]:
def objective_optuna(trial):
    """Optuna objective: sample ``eps`` and ``q`` from ``trial`` and score them.

    ``eps`` is drawn from [EPS_MIN, EPS_MAX] and ``q`` from [0, 1]; the result
    of ``objective`` for that pair is returned.
    """
    # Dict values are evaluated in order, so "eps" is still suggested before "q".
    suggested = {
        "eps": trial.suggest_float("eps", EPS_MIN, EPS_MAX),
        "q": trial.suggest_float("q", 0, 1),
    }
    return objective(suggested["eps"], suggested["q"])

In [None]:
def get_best_params_optuna(timeout):
    """Search ``eps`` and ``q`` with Optuna and return the best pair found.

    Parameters:
        timeout: passed to ``study.optimize`` as its ``timeout``.

    Returns:
        ``(eps, q)`` of the best trial.

    Also shows the slice, contour and parameter-importance plots of the study.
    """
    study = optuna.create_study(direction="maximize")

    # Pin whichever of Q / EPS was set to a concrete value in the config cell.
    pinned = {}
    for name, value in (("q", Q), ("eps", EPS)):
        if value is not None:
            pinned[name] = value
    study.sampler = optuna.samplers.PartialFixedSampler(pinned, study.sampler)

    study.optimize(objective_optuna, timeout=timeout)

    viz = optuna.visualization
    viz.plot_slice(study).show()
    viz.plot_contour(study, params=["q", "eps"]).show()
    viz.plot_param_importances(study).show()

    best = study.best_trial.params
    return best["eps"], best["q"]

In [None]:
#print(initial_df)
def get_best_params():
    """Run the Optuna search with the configured TIMEOUT and return (eps, q)."""
    eps_and_q = get_best_params_optuna(timeout=TIMEOUT)
    return eps_and_q

best_eps, best_q = get_best_params()


In [None]:
def create_study(storage=None):
    """Create (or re-open) the ``ml_clustering`` Optuna study.

    Parameters:
        storage: value forwarded to ``optuna.create_study(storage=...)``, e.g.
            a database URL. ``None`` (the default) keeps the study in memory.
            The original left this argument empty, which was a syntax error.

    Returns:
        The study, with any non-None ``Q`` / ``EPS`` pinned through
        ``PartialFixedSampler``. The original returned ``None``, so callers
        could never use the study it built.
    """
    study = optuna.create_study(
        direction="maximize",
        study_name="ml_clustering",
        storage=storage,
        # Re-open instead of failing when the named study already exists in a
        # persistent storage (e.g. when the cell is run again).
        load_if_exists=True,
    )

    fixed_params = {key: val for key, val in [("q", Q), ("eps", EPS)] if val is not None}
    study.sampler = optuna.samplers.PartialFixedSampler(fixed_params, study.sampler)
    return study

In [None]:
# Serialises the one-time sampler wrapping below across worker threads.
_SAMPLER_SETUP_LOCK = threading.Lock()


def thread_optimization(study, timeout=TIMEOUT):
    """Optimise ``study`` with ``objective_optuna``; meant as a thread target.

    Parameters:
        study: Optuna study shared between the worker threads.
        timeout: passed to ``study.optimize`` as its ``timeout``.

    Any non-None ``Q`` / ``EPS`` is pinned through ``PartialFixedSampler``. The
    wrapping is applied at most once per study: the original re-wrapped the
    shared sampler from every thread, nesting one ``PartialFixedSampler``
    inside another without synchronisation.
    """
    with _SAMPLER_SETUP_LOCK:
        if not isinstance(study.sampler, optuna.samplers.PartialFixedSampler):
            fixed_params = {key: val for key, val in [("q", Q), ("eps", EPS)] if val is not None}
            study.sampler = optuna.samplers.PartialFixedSampler(fixed_params, study.sampler)

    study.optimize(objective_optuna, timeout=timeout)



In [None]:
# Run the optimisation on one worker thread per available CPU, all sharing a
# single in-memory study. The original cell had a stray "]", looped over an
# undefined `CPU`, used an undefined `study`, and never started its threads.
study = optuna.create_study(direction="maximize", study_name="ml_clustering")

procs = [
    threading.Thread(target=thread_optimization, args=[study])
    # os.cpu_count() may return None when the count cannot be determined.
    for _ in range(os.cpu_count() or 1)
]
for proc in procs:
    proc.start()
# Wait for every worker so the study is complete before later cells run.
for proc in procs:
    proc.join()


In [None]:
#from scipy.optimize import shgo, dual_annealing, differential_evolution, basinhopping

#for f in [dual_annealing, differential_evolution]:
#    res = f(lambda eps: -objective(eps), [(1.5, 3.5)], maxiter=100)
#    print(res)


In [None]:
# Evaluate and plot the clustering for the (eps, q) pair returned by get_best_params().
check_clustering(best_eps, best_q)
