In [4]:
import csv
import sys
import warnings
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

# Silence every warning for the rest of the session. NOTE(review): this also
# hides library deprecation warnings, so API removals show up as hard errors.
warnings.filterwarnings('ignore')
# Print numpy arrays in full instead of abbreviating long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

In [5]:
def _parse_int_vector(text):
    """Parse a bracketed vector string such as ``'[12, 34]'`` into a list of ints.

    Square brackets are stripped and commas are treated as separators, so
    ``'[12, 34]'``, ``'[12,34]'`` and ``'[12 34]'`` all yield ``[12, 34]``.

    Raises:
        ValueError: if any token is not a valid integer literal.
    """
    # Commas become spaces (not empty strings) so that "12,34" does not
    # collapse into the single number 1234.
    cleaned = text.replace('[', '').replace(']', '').replace(',', ' ')
    return [int(token) for token in cleaned.split()]


def get_values(df):
    """Extract the identifier column and the two coordinate columns from ``df``.

    Columns are addressed by position:

    * column 3 -> returned unchanged as a list (``columns_idx``)
    * column 1 -> string vectors parsed into lists of ints (``top_left``)
    * column 2 -> string vectors parsed into lists of ints (``bottom_right``)

    Args:
        df: a pandas DataFrame with at least four columns; columns 1 and 2
            must contain strings (``str.replace`` is called on every cell).

    Returns:
        A tuple ``(columns_idx, top_left, bottom_right)`` of three lists with
        one entry per row of ``df``.
    """
    columns_idx = list(df.iloc[:, 3])
    top_left = [_parse_int_vector(vec) for vec in df.iloc[:, 1]]
    bottom_right = [_parse_int_vector(vec) for vec in df.iloc[:, 2]]
    return columns_idx, top_left, bottom_right

def get_edge_centres(top_left, bottom_right):
    """Compute the centre and the four edge midpoints of every box.

    Each box is described by one entry of ``top_left`` and the entry at the
    same position in ``bottom_right``; index 0 of an entry is used as the
    x coordinate and index 1 as the y coordinate.

    Returns:
        A tuple ``(X, top, left, bottom, right)`` of five lists, each holding
        one ``[x, y]`` pair per box: the box centre, then the midpoint of the
        top, left, bottom and right edge respectively.
    """
    boxes = list(zip(top_left, bottom_right))

    # Box centres: the average of the two corners on each axis.
    centres = [
        [(tl[0] + br[0]) / 2, (tl[1] + br[1]) / 2]
        for tl, br in boxes
    ]

    # Every edge midpoint shares one coordinate with the centre and takes
    # the other one from the corner lying on that edge.
    top = [[c[0], tl[1]] for c, (tl, _) in zip(centres, boxes)]
    left = [[tl[0], c[1]] for c, (tl, _) in zip(centres, boxes)]
    bottom = [[c[0], br[1]] for c, (_, br) in zip(centres, boxes)]
    right = [[br[0], c[1]] for c, (_, br) in zip(centres, boxes)]

    return centres, top, left, bottom, right

def euclidean(a, b):
    """Return the Euclidean (L2) distance between the points ``a`` and ``b``."""
    delta = np.array(a) - np.array(b)
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

def get_neighbours(arr, arr2, cluster, threshold=50):
    """Find, for each point in ``arr``, the closest same-cluster point in ``arr2``.

    ``cluster`` is indexed both by the position in ``arr`` and by the position
    in ``arr2``, so both sequences must be aligned with it (element ``k`` of
    either sequence carries the label ``cluster[k]``).

    Args:
        arr: sequence of query points.
        arr2: sequence of candidate points.
        cluster: sequence of cluster labels, one per index.
        threshold: maximum distance for a candidate to count as a neighbour.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        A list with one ``[distance, index]`` pair per element of ``arr``:
        the distance (rounded to 2 decimals) to the nearest candidate and that
        candidate's index in ``arr2``. If no same-cluster candidate lies within
        ``threshold`` the entry is ``[-1, 0]``.

    NOTE(review): the candidate at the same index as the query (``j == i``) is
    not excluded, so an element can be matched with its own counterpart in
    ``arr2`` -- confirm this is intended.
    """
    result = []
    for i, vec1 in enumerate(arr):
        # Start from +inf so any real distance wins, whatever the threshold.
        best_dist, best_idx = float('inf'), 0
        for j, vec2 in enumerate(arr2):
            if cluster[i] != cluster[j]:
                continue  # only points from the same cluster are candidates
            new_dist = euclidean(vec1, vec2)
            # Strict "<" keeps the first candidate on ties.
            if new_dist < best_dist:
                best_dist, best_idx = new_dist, j
        if best_dist > threshold:
            result.append([-1, 0])
        else:
            result.append([round(best_dist, 2), best_idx])
    return result

In [6]:
# Tunable settings for this cell.
INPUT_CSV = '4/SMAI-AQ2/CSV/TLBR_159.csv'
N_CLUSTERS = 5

df = pd.read_csv(INPUT_CSV)
columns_id, top_left, bottom_right = get_values(df)
X, top, left, bottom, right = get_edge_centres(top_left, bottom_right)

# Cluster the box centres. Ward linkage only works with Euclidean distance,
# which is also the default, so the metric is not passed explicitly: the old
# `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
agglomerative = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='ward')
agglomerative.fit(X)
clusters = agglomerative.labels_

# For every edge midpoint, look up the nearest opposite-edge midpoint that
# belongs to the same cluster (e.g. a box's top edge vs. the bottom edges).
top_box = get_neighbours(top, bottom, clusters)
bottom_box = get_neighbours(bottom, top, clusters)
left_box = get_neighbours(left, right, clusters)
right_box = get_neighbours(right, left, clusters)

data = list(zip(columns_id, top_left, bottom_right, top, left, bottom, right, top_box, bottom_box, left_box, right_box))
header = ['columns Id', 'Top-Left', 'Bottom-Right', 'Top', 'Left', 'Bottom', 'Right', 'Top_box', 'Bottom_box', 'Left_box', 'Right_box']

file = 'output.csv'
# Use a separate name for the handle so `file` keeps pointing at the path
# and the cell can be re-run safely.
with open(file, mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(data)