# 5. Medoids

We create medoids from all representation vectors.

## Imports

In [5]:
from glob import glob
from tqdm.auto import tqdm
import json
import numpy as np
import os

from pathlib import Path

from picto2vec.sense import Sense

from sklearn.metrics import pairwise_distances
from sklearn_extra.cluster import KMedoids

In [3]:
from constants import *

## Amassing representations

In [4]:
# Collect every path directly inside the SENSE_EX_VEC directory
# (SENSE_EX_VEC is presumably provided by `from constants import *` — confirm).
sense_files = glob(f"{SENSE_EX_VEC}/*")

In [76]:
def create_output_filenames(medoid_count, layer_index, sense_file_stem):
    """Build the output locations for one sense/layer/medoid-count combination.

    Args:
        medoid_count: Number of medoids; selects the ``medoid_<count>`` directory.
        layer_index: Layer index; selects the ``layer_<index>`` directory.
        sense_file_stem: Name of the sense, used as the JSON file's base name.

    Returns:
        A ``(layer_path, json_path)`` tuple of strings: the layer directory
        (with a trailing slash) under ``VECTORS`` and the JSON file inside it.
    """
    layer_path = f"{VECTORS}/medoid_{medoid_count}/layer_{layer_index}/"
    # Use the argument rather than a global `sense` object, so the function
    # works regardless of what the calling cell has defined.
    json_path = f"{layer_path}{sense_file_stem}.json"

    return layer_path, json_path

We iterate over each sense file and, for each medoid count (3, 5, 7 and 10) and each layer (0–12), compute that many medoids from the sense's representations. The medoid representations are then written to a separate JSON file per sense. The directory structure is as follows:
* vector directory root
    * medoid_3
        * layer_0
            * sense1.json
            * sense2.json
            * ...
        * ...
        * layer_12
    * medoid_5
    * medoid_7
    * medoid_10
    
Each JSON file holds all medoid representations as a simple list.

In [None]:
# For every sense file, compute 3/5/7/10 medoids for each of the 13 layers and
# write the medoid representations to one JSON file per (count, layer, sense).
for sense_file in tqdm(sense_files):
    try:
        with open(sense_file, "rt") as reader:
            raw = reader.read()
        if raw:
            data = json.loads(raw)
    except (OSError, ValueError):
        # Unreadable file or invalid JSON/encoding (json.JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses). Report the
        # offending file and stop, as before, but no longer swallow
        # KeyboardInterrupt/SystemExit the way a bare `except:` did.
        print(sense_file)
        break

    # Empty files carry no data; skip them.
    if not raw:
        continue

    sense = Sense(Path(sense_file).stem, data)

    # Only senses with more than 30 representations are processed.
    if sense.total_representation_count() <= 30:
        continue

    for medoid_count in [3, 5, 7, 10]:
        for layer_index in range(13):
            layer_path, json_path = create_output_filenames(medoid_count, layer_index, sense.name)

            # Skip outputs that already exist. Checking every layer (not just
            # layer 0) lets an interrupted run fill in the layers it missed.
            if os.path.exists(json_path):
                continue

            representations = sense.get_representations(layer_index)
            medoid_indices = sense.get_medoid_indices(representations, medoid_count)

            centroid_representations = [
                list(representations[medoid_index]) for medoid_index in medoid_indices
            ]

            # Serialise before opening the file, so a serialisation error does
            # not leave an empty file that a later run would treat as done.
            payload = json.dumps(centroid_representations)

            os.makedirs(layer_path, exist_ok=True)

            with open(json_path, "wt") as writer:
                writer.write(payload)

In [4]:
# Number of paths collected in sense_files.
len(sense_files)

1824