# Optimizer 2022 R43 Solutions

### imports

In [1]:
%matplotlib inline
import math
import matplotlib.pyplot as plt
import seaborn as sns
import random
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sympy import symbols, solve
import subprocess
from tqdm import tqdm
import networkx as nx
import networkx.algorithms.community as nx_comm

### useful functions

In [4]:
def factorial(n):
    if n == 1:
        return 1
    return factorial(n-1)*n


def combinations(n, k):
    return factorial(n)/(factorial(n-k)*factorial(k))


class Point:
    def __init__(self, label, coords, deg=0, adj_list=None):
        if adj_list is None:
            adj_list = set()
        self.label = label
        self.coords = coords
        self.deg = deg
        self.adj_list = adj_list
        
    def __repr__(self):
        return "point: %s\ndeg: %s" % (str(self.label), str(self.deg))


class Network:
    def __init__(self, nnodes=0, node_dict=None, links=None):
        if node_dict is None:
            node_dict = dict()
        if links is None:
            links = set()
        self.nnodes = nnodes
        self.node_dict = node_dict
        self.links = links
    
    
    
    
    def louvain(self, edges, resolution):
        G = nx.Graph()
        for i in range(1, self.nnodes+1):
            G.add_node(i)
        G.add_weighted_edges_from(edges)
        components = nx_comm.louvain_communities(G, resolution=resolution)
        return components
    
    
    
    def modularity(self):
        self.net_out('dfs_in.txt')
        cpp_run('./dfs')
        lb = dfs_read('./dfs_out.txt')
        nc = len(np.unique(np.asarray(lb)))
        components = [[] for i in range(nc)]
        for i in range(1, self.nnodes+1):
            components[lb[i-1]-1].append(i)
        M = 0
        for i in range(nc):
            kc = 0
            for j in components[i]:
                kc += self.node_dict[j].deg
            lc = kc/2
            L = len(self.links)
            mc = (lc/L)-(kc/2*L)**2
            M += mc
        return M
        
    
    def add_node(self, node):
        self.nnodes = self.nnodes + 1
        self.node_dict[node.label] = node


    def add_undir_link(self, node1, node2):
        self.node_dict[node1].adj_list.add(node2)
        self.node_dict[node1].deg += 1       
        self.node_dict[node2].adj_list.add(node1)
        self.node_dict[node2].deg += 1
        self.links.add((node1, node2))
              
    
    def find_distance(self):
        mx = 0
        mn = math.inf
        distances = []
        for i in range(1, self.nnodes+1):
            for j in range(i, self.nnodes+1):
                d = math.dist(self.node_dict[i].coords, self.node_dict[j].coords)
                distances.append(d)
                if d > mx:
                    mx = d
                if d < mn:
                    mn = d
        return mx, mn, np.mean(np.asarray(distances)), np.std(np.asarray(distances))
     
    
    def create_epsilon_neighbour_graph(self, epsilon):
        for i in range(1, self.nnodes+1):
            for j in range(i+1, self.nnodes+1):
                d = math.dist(self.node_dict[i].coords, self.node_dict[j].coords)
                if d < epsilon:
                    self.add_undir_link(i, j)
    
    
    def create_k_neighbour_graph(self, k):
        for i in range(1, self.nnodes+1):
            nodes = []
            for j in range(1, self.nnodes+1):
                d = dist(np.asarray(self.node_dict[i].coords), np.asarray(self.node_dict[j].coords))
                nodes.append((j, d))
            nodes.sort(key=lambda x:x[1])
            for j in range(1, k+1):
                self.add_undir_link(i, nodes[k][0])
    
    
    def create_complete_graph(self):
        for i in range(1, self.nnodes+1):
            for j in range(i+1, self.nnodes+1):
                self.add_undir_link(i, j)
                
    
    def find_minmax(self):
        mx = 0
        mn = math.inf
        for link in self.links:
            node1 = link[0]
            node2 = link[1]
            d = math.dist(self.node_dict[node1].coords, self.node_dict[node2].coords)
            if d > mx:
                mx = d
            if d < mn:
                mn = d
        return mx, mn
                  
    
    def add_edge_weights(self):
        edges = []
        mx, mn = self.find_minmax()
        for link in self.links:
            node1 = link[0]
            node2 = link[1]
            d = math.dist(self.node_dict[node1].coords, self.node_dict[node2].coords)
            edges.append((node1, node2, (d-mn)/(mx-mn)))
        edges.sort(key=lambda x:x[2])
        return edges
     
    
    def add_edge_distance(self):
        edges = []
        for link in self.links:
            node1 = link[0]
            node2 = link[1]
            d = math.dist(self.node_dict[node1].coords, self.node_dict[node2].coords)
            edges.append((node1, node2, d))
        edges.sort(key=lambda x:x[2])
        return edges
    
    
    def edge_list_out(self, file_path):
        edges = self.add_edge_distance()
        with open(file_path, 'w') as f:
            f.write("%s\n" % str(len(edges)))
            for edge in edges:
                node1 = edge[0]
                node2 = edge[1]
                weight = edge[2]
                f.write("%s %s %s\n" %(str(node1), str(node2), str(weight)))
    
    
    def net_out(self, file_path):
        with open(file_path, 'w') as f:
            f.write("%s\n" % str(self.nnodes))
            for k in self.node_dict.keys():
                f.write("%s " % str(self.node_dict[k].deg))
                f.write(" ".join(str(e) for e in self.node_dict[k].adj_list))
                f.write("\n")
    
       
def read_input(input_dir):
    point_list = []
    with open(input_dir, 'r') as f:
        lines = f.readlines()
        d, n, m, k, rho = lines[0].strip().split()
        k_i = lines[1].strip().split()
        for line in lines[2:]:
            new = np.float32(line.strip().split())
            point_list.append(list(new))
    return [int(d), int(n) , m, int(k), int(rho), [int(e) for e in k_i]], create_df(point_list)


def show_input_params(input_params):
    print("d: %i" % input_params[0])
    print("n: %i" % input_params[1])
    print("m: %s" % input_params[2])
    print("k: %i" % input_params[3])
    print("k_i: ", end="")
    print(" ".join(str(e) for e in input_params[5]))
    print("rho: %i" % input_params[4])
    return
    

def gng(points, max_iter, epsilon_b, epsilon_n, a_max, lambda_, alpha, D):
    w = []
    w_ab = random.sample(points, 2)
    for i in w_ab:
        w.append(i)
    it = 0
    while it < max_iter:
        x = random.sample(points, 1)
        x = x[0]
        min_dist = math.inf
        min_dist2 = math.inf
        w_s1_idx = -1
        w_s2_idx = -1
        w_s1 = 'a'
        w_s2 = 'a'
        for i in range(len(w)):
            d = math.dist(w[i].coords, x.coords)
            if d < min_dist:
                min_dist2 = min_dist
                w_s2_idx = w_s1_idx
                w_s2 = w_s1
                min_dist = d
                w_s1_idx = i
                w_s1 = w[i]
            elif d < min_dist2:
                min_dist2 = d
                w_s2_idx = i
                w_s2 = w[i]
        w[w_s1_idx].error += math.dist(w_s1.coords, x.coords)**2
        w[w_s1_idx].coords = tuple(np.asarray(w_s1.coords) + epsilon_b*(np.asarray(x.coords)-np.asarray(w_s1.coords)))
        for i in w[w_s1_idx].adj_list:
            w[i[0]].coords = tuple(np.asarray(w[i[0]].coords)+epsilon_n*(np.asarray(x.coords)-np.asarray(w[i[0]].coords)))
        if w_s2_idx not in w[w_s1_idx].adj_list:
            w[w_s1_idx].adj_list.append([w_s2_idx, 0])
            w[w_s2_idx].adj_list.append([w_s1_idx, 0])
            w[w_s1_idx].deg += 1
            w[w_s2_idx].deg += 1
        for p in w:
            i = 0
            while i < len(p.adj_list):
                if p.adj_list[i][1] > a_max:
                    del p.adj_list[i]
                i += 1
        i = 0
        while i < len(w):
            if w[i].deg == 0:
                del w[i]
            i += 1
        if it % lambda_ == 0:
            w_q_idx = -1
            max_err = 0
            for i in range(len(w)):
                if w[i].error < max_err:
                    max_err = w[i].error
                    w_q_idx = i
            w_r_idx = -1
            max_err = 0
            for p in w[w_q_idx].adj_list:
                if w[p[0]].error > max_err:
                    max_err = w[p[0]].error
                    w_r_idx = p[0]
            w_s_coords = tuple((np.asarray(w[w_q_idx].coords)+np.asarray(w[w_r_idx].coords))/2)
            w.append(Point(label=len(w)+1, coords=w_s_coords))
            w[-1].adj_list.append([w_q_idx, 0])
            w[-1].adj_list.append([w_r_idx, 0])
            w[1].deg += 2
            w[w_r_idx].adj_list.append([len(w)-1, 0])
            w[w_q_idx].adj_list.append([len(w)-1, 0])
            w[w_r_idx].deg += 1
            w[w_q_idx].deg += 1
            i = 0
            while i < len(w[w_q_idx].adj_list):
                if w[w_q_idx].adj_list[i][0] == w_r_idx:
                    del w[w_q_idx].adj_list[i]
                    break
                i += 1
            i = 0
            while i < len(w[w_r_idx].adj_list):
                if w[w_r_idx].adj_list[i][0] == w_q_idx:
                    del w[w_q_idx].adj_list[i]
                    break
                i += 1
            w[w_q_idx].error *= alpha
            w[w_r_idx].error *= alpha
            w[-1].error = w[w_q_idx].error

            for p in w:
                p.error *= D

#             if termination:
#                 break
        it += 1
    return w


def vector_quantization(method, points, n_output, miter):
    if method.lower() == 'kmeans' or method.lower() == 'k-means':
        k = KMeans(n_clusters=n_output, max_iter=miter)
        k.fit(points)
        centers = k.cluster_centers_
        labels = k.labels_
        return centers, labels
    
    
    
def vector_quantization_mini(method, points, n_output, miter):
    if method.lower() == 'minibatchkmeans' or method.lower() == 'mini-batch-kmeans':
        k = MiniBatchKMeans(n_clusters=n_output, batch_size=3072, max_iter=miter)
        k.fit(points)
        centers = k.cluster_centers_
        labels = k.labels_
        return centers, labels


def dimensionality_reduction(points, old_dim, new_dim):
    pca = PCA(n_components=new_dim)
    new_points = pca.fit_transform(X=points)
    return create_df(new_points)


def kmeans_clustering(points, k):
    k = KMeans(n_clusters=k)
    k.fit(points)
    centers = k.cluster_centers_
    labels = k.labels_
    return centers, labels


def attach_label_to_point(points, labels):
    df = create_df(points)
    df['label'] = labels
    return df.values


def quantized_to_origianl(org_df, qdf):
    final_labels = []
    for i in range(len(labels)):
        final_labels.append(clustered_quantized_points[labels[i], clustered_quantized_points.shape[1]-1])
    clustered = np.hstack(points, final_labels)
    return clustered
    
    
def points_to_centers_map(points, centers):
    labels = []
    for i in points:
        mn = math.inf
        l = -1
        for j in range(len(centers)):
            d = math.dist(i, centers[j])
            if d < mn:
                mn = d
                l = j
        labels.append(l)
    return labels


def lower_dimension(cluster:np.ndarray, dim, error):
    points = []
    for i in range(dim):
        points.append(cluster[i])
    A = np.asarray(points)
    b = np.ones((dim,))
    coefs = np.linalg.inv(A).dot(b)
    rnd = random.choice(cluster[dim:])
    if abs(coefs.dot(rnd) - 1) < error:
        return True, coefs/np.linalg.norm(coefs), rnd.dot(coefs/np.linalg.norm(coefs))
    return False, coefs/np.linalg.norm(coefs), rnd.dot(coefs/np.linalg.norm(coefs))


def lower_dimension_2(cluster:np.ndarray, dim, error):
    x = symbols('x')
    p1 = cluster[dim+10]
    p2 = cluster[dim+11]
    vec1 = p1 - p2
    vec2 = np.ones((dim-1,))
    vec2 = np.append(vec2, x)
    f = vec1.dot(vec2)
    s = solve(f)
    nor = np.ones((dim-1,))
    nor = np.append(nor, np.asarray(s))
    D = nor.dot(p1)
    rnd = random.choice(cluster[dim:])
    if abs(nor.dot(rnd) - D) < error:
        return True, nor, D
    return False, nor, D
    

def find_center_radius(cluster):
    diameter = 0
    for i in range(cluster.shape[0]):
        for j in range(i+1, cluster.shape[0]):
            p1 = cluster[i]
            p2 = cluster[j]
            d = math.dist(p1, p2)
            if d > diameter:
                diameter = d
                center = (p1+p2)/2
    return center, diameter/2


def index_list(df, k):
    total = [[] for i in range(k)]
    for i in range(len(df)):
        total[df['label'].iloc[i]].append(i+1)
    return total


def write_manifold(output_path, manifold_type, dim, df):
    if manifold_type.lower() == 'complex':
        k_i = len(df['cluster'].unique())
        with open(output_path, 'a') as f:
            f.write("%i %i Complex\n" % (dim, k_i))
            for i in range(k_i):
                indexes = df[df['cluster']==i].index
                f.write("%i " % len(indexes))
                f.write(" ".join(str(e) for e in indexes))
                f.write("\n")
        

def write_output(output_path, input_params, dim:dict, df):
    d, n, m, k, rho, k_i = input_params
    with open(output_path, 'w') as f:
        f.write("%i %i\n" % (n, m))
    for i in range(m):
        man_df = df[df['manifold']==i]
        write_manifold(output_path, 'complex', dim[i], man_df)

        
        
def create_df(point_list):
    return pd.DataFrame(point_list, index=pd.RangeIndex(1, len(point_list)+1, 1))


def find_initial_components(df, m):
    points = df.values
    net = Network()
    for i in range(1, points.shape[0]+1):
        p = Point(i, tuple(points[i-1]))
        net.add_node(p)
    for i in net.node_dict.keys():
        for j in net.node_dict.keys():
            d = math.dist(net.node_dict[i].coords, net.node_dict[j].coords)
            if d < thr:
                net.add_undir_link()
                
                

def cpp_run(file_path):

    process = subprocess.Popen(file_path, shell=False)

    out, err = process.communicate()
    errcode = process.returncode
    
    process.kill() 
    process.terminate()
                
                
def mst_read(input_dir, k):
    edges = []
    with open(input_dir, 'r') as f:
        lines = f.readlines()
        if k != 1:
            for line in lines[:-(k-1)]:
                node1, node2, weight = line.strip().split()
                edges.append((int(node1), int(node2), float(weight)))
        else:
            for line in lines:
                node1, node2, weight = line.strip().split()
                edges.append((int(node1), int(node2), float(weight)))
    return edges


def dfs_read(input_dir):
    label = []
    with open(input_dir, 'r') as f:
        lines = f.readlines()
        for line in lines[1:]:
            label.append(int(line))
    return label
            

def manifold(df, clusters=None):
    print("Creating Network ...")
    net = Network()
    for i in range(1, len(df)+1):
        coords = []
        for j in range(len(df.columns)):
            coords.append(df.iloc[i-1][j])
        p = Point(i, coords=coords)
        net.add_node(p)
    net.create_complete_graph()
    print("Finding Minimal Spanning Tree ...")
    net.edge_list_out('kruskal_in.txt')
    cpp_run('./kruskal')
    eds = mst_read('kruskal_out.txt', 1)
    resolution = 1
    while True:
        print("Applying Louvain algorithm with resolution %f ..." % (resolution))
        manifolds = net.louvain(eds, resolution)
        print("%i manifolds detected." % (len(manifolds)))
        if clusters == None:
            clusters = math.inf
        if len(manifolds) > clusters:
            resolution -= 0.1
        else:
            break
    print("Creating new dataframe with detected manifolds ...")
    mans = []
    for i in range(len(df)):
        for j, manifold in enumerate(manifolds):
            if (i+1) in manifold:
                mans.append(j+1)
                break
        
    new_df = df.copy()
    new_df['manifold'] = mans
    return new_df    
    
    
def cluster(df, m, k):
    if k == 1:
        df['manifold'] = [m for i in range(len(df))]
        df['cluster'] = [1 for i in range(len(df))]
        return df
    net = Network()
    for i in range(1, len(df)+1):
        coords = []
        for j in range(len(df.columns)):
            coords.append(df.iloc[i-1][j])
        p = Point(i, coords=coords)
        net.add_node(p)

    net.create_complete_graph()
    net.edge_list_out('kruskal_in.txt')
    cpp_run('./kruskal')
    eds = mst_read('kruskal_out.txt', k)
    net2 = Network()
    for i in range(1, len(df)+1):
        coords = []
        for j in range(len(df.columns)):
            coords.append(df.iloc[i-1][j])
        p = Point(i, coords=coords)
        net2.add_node(p)
    for i in eds:
        net2.add_undir_link(i[0], i[1])
    net2.net_out('dfs_in.txt')
    cpp_run('./dfs')
    lb = dfs_read('dfs_out.txt')
    df['manifold'] = [m for i in range(len(df))]
    df['cluster'] = lb
    return df
            
            
def manifold_to_cluster(df, k_i):
    dim = len(df.columns)-1
    m = len(df['manifold'].unique())
    manifolds = []
    clustered_manifolds = []
    for i in range(m):
        man = df[df['manifold']==i+1]
        manifolds.append(man)
    manifolds.sort(key=lambda x:len(x), reverse=True)
    k_i.sort(reverse=True)
    dic = dict()
    for i in range(dim):
        dic[i] = []
    dic['manifold'] = []
    dic['cluster'] = []
    last_df = pd.DataFrame(dic)
    for i in range(len(manifolds)):
        new_df = cluster(manifolds[i].drop('manifold', axis=1), i+1, k_i[i])
        last_df = last_df.append(new_df)
    return last_df
    
    
def prepare_output(first_df, last_df):
    data = []
    for i in tqdm(first_df.index):
        man = -1
        cl = -1
        coords = first_df.loc[i].values
        mn = math.inf
        for k in last_df.index:
            w = last_df.loc[k].values[:-2]
            d = math.dist(coords, w)
            if d < mn:
                mn = d
                man = last_df['manifold'].loc[k]
                cl = last_df['cluster'].loc[k]
        data.append((i, man, cl))
    return data


def outlier(points, thr):
    outlier_idxs = set()
    for col in tqdm(range(points.shape[1])):
        dim = []
        for row in range(points.shape[0]):
            dim.append(((row+1), points[row,col]))
        dim.sort(key=lambda x:x[1])
        for i in range(1, len(dim)-1):
            if dim[i][1] - dim[i-1][1] > thr and dim[i+1][1] - dim[i][1] > thr:
                outlier_idxs.add(dim[i][0])
    return outlier_idxs
#                 print("%i %f %f %f" % (i+1, first_dim[i-1], first_dim[i], first_dim[i+1]))

def find_outliers(points, a, b, key):
    res = 'a'
    it = 0
    indexes = []
    results = []
    thresholds = []
    while res != key:
        c = (a + b)/2
        thresholds.append(c)
        print("iteration: %i\nthreshold: %f" % (it+1, c))
        idxs = outlier(points, c)
        indexes.append(idxs)
        res = len(idxs)
        results.append(res)
        if res > key:
            a = c
        elif res < key:
            b = c
        it += 1
        if it > 20:
            for i in range(len(indexes)-1, -1, -1):
                if results[i] >= key:
                    return thresholds[i], indexes[i], results[i]
        print("outliers: %i" % res)
        print("==================================")
    return c, idxs, res



def write_manifold(output_path, manifold_type, dim, df):
    if manifold_type.lower() == 'complex':
        k_i = len(df['cluster'].unique())
        with open(output_path, 'a') as f:
            f.write("%i %i Complex\n" % (dim, k_i))
            for i in range(k_i):
                indexes = df[df['cluster']==i+1].index
                f.write("%i " % len(indexes))
                f.write(" ".join(str(e) for e in indexes))
                f.write("\n")
        


def write_outlier(output_path, df):
    out = df[df['manifold'] == -1]
    out_idx = out.index
    with open(output_path, 'a') as f:
        f.write("%i "% len(out))
        f.write(" ".join(str(e) for e in out_idx))
        f.write("\n")

        
def write_output(output_path, input_params, dim:dict, df):
    d, n, m, k, rho, k_i = input_params
    with open(output_path, 'w') as f:
        f.write("%i %i\n" % (n, m))
    for i in range(m):
        man_df = df[df['manifold']==i+1]
        write_manifold(output_path, 'complex', dim[i+1], man_df)
#     write_outlier(output_path, df)
        

### Reading input

In [3]:
input_params, df = read_input("./R43.txt")
show_input_params(input_params)

d: 100
n: 83732
m: _
k: 49
k_i: 
rho: 600


### Outlier detection

In [11]:
thr, outlier_indexes, res = find_outliers(df.values, 0.4, 0.5, 600)

iteration: 1
threshold: 0.450000


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:49<00:00,  2.03it/s]


outliers: 633
iteration: 2
threshold: 0.475000


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:50<00:00,  1.99it/s]


outliers: 608
iteration: 3
threshold: 0.487500


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:48<00:00,  2.06it/s]


outliers: 595
iteration: 4
threshold: 0.481250


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:49<00:00,  2.02it/s]


outliers: 595
iteration: 5
threshold: 0.478125


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:50<00:00,  2.00it/s]


outliers: 608
iteration: 6
threshold: 0.479687


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:49<00:00,  2.01it/s]


outliers: 608
iteration: 7
threshold: 0.480469


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:49<00:00,  2.03it/s]


outliers: 595
iteration: 8
threshold: 0.480078


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:48<00:00,  2.05it/s]


outliers: 595
iteration: 9
threshold: 0.479883


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:50<00:00,  1.98it/s]


outliers: 608
iteration: 10
threshold: 0.479980


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:50<00:00,  1.98it/s]


outliers: 608
iteration: 11
threshold: 0.480029


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:50<00:00,  1.98it/s]


outliers: 595
iteration: 12
threshold: 0.480005


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:52<00:00,  1.91it/s]


outliers: 596
iteration: 13
threshold: 0.479993


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:48<00:00,  2.04it/s]


outliers: 608
iteration: 14
threshold: 0.479999


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:50<00:00,  2.00it/s]


outliers: 604
iteration: 15
threshold: 0.480002


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:48<00:00,  2.04it/s]

outliers: 600





In [17]:
clean_df = df.drop(outlier_indexes, axis=0)

### Vector quantisation

In [18]:
%%time
n_out = 2000
max_iter = 40
# points, labels = vector_quantization_mini('kmeans', df.values, n_out, max_iter)
points, labels = vector_quantization_mini('mini-batch-kmeans', clean_df.values, n_out, max_iter)
qdf = create_df(points)
# quantized_points = gng(points=points, max_iter=10000, epsilon_b=0.5, epsilon_n=0.1, a_max=10, lambda_=10, alpha=0.01, D=0.01)


CPU times: total: 2min 43s
Wall time: 2min 16s


### Clustering manifolds

In [25]:
%%time
# max_man = 49
manifold_df = manifold(qdf, 49)

Creating Network ...
Finding Minimal Spanning Tree ...
Applying Louvain algorithm with resolution 1.000000 ...
60 manifolds detected.
Applying Louvain algorithm with resolution 0.900000 ...
57 manifolds detected.
Applying Louvain algorithm with resolution 0.800000 ...
53 manifolds detected.
Applying Louvain algorithm with resolution 0.700000 ...
54 manifolds detected.
Applying Louvain algorithm with resolution 0.600000 ...
47 manifolds detected.
Creating new dataframe with detected manifolds ...
CPU times: total: 1min 12s
Wall time: 1min 27s


### Clustering inside manifolds

In [27]:
%%time
k_i = [1 for i in range(45)]
k_i.append(2)
k_i.append(2)
last = manifold_to_cluster(manifold_df, k_i)

CPU times: total: 6.45 s
Wall time: 7.26 s


### Finding original labels of points

In [30]:
mans = []
clus = []
for i in tqdm(range(len(clean_df))):
    mans.append(last.loc[labels[i]+1]['manifold'])
    clus.append(last.loc[labels[i]+1]['cluster'])


100%|███████████████████████████████████████████████████████████████████████████| 83132/83132 [01:25<00:00, 974.05it/s]


In [31]:
input_params[2] = 47
input_params[5] = k_i

[100,
 83732,
 47,
 49,
 600,
 [2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]]

### Exporting output

In [32]:
clean_df['manifold'] = mans
clean_df['cluster'] = clus
dim = dict()
for i in range(47):
    dim[i+1] = 1
write_output('./output_R43.txt', input_params, dim, clean_df)

In [33]:
with open('./output_R43.txt', 'a') as f:
    f.write("%i " % 600)
    f.write(" ".join(str(e) for e in outlier_indexes))
    f.write("\n")