# Evaluation of hyperparameters

In [None]:
import numpy as np
from sklearn.metrics import pairwise_distances
from ripser import ripser
import matplotlib.pyplot as plt

## Betti numbers

In [None]:
def plot_betti_numbers_dynamics(diagrams, space_type):
    """Plot and save the Betti-number curve of every homology dimension.

    For each persistence diagram the number of bars alive at a filtration
    value is obtained by adding one at every birth, subtracting one at every
    death and taking the running sum over the sorted distinct endpoints.

    Args:
        diagrams: Sequence of 2-D arrays, one per homology dimension, whose
            rows are ``(birth, death)`` pairs.
        space_type: Name of the sub-directory of ``parameters/`` that
            receives the outputs.

    Side effects:
        For every dimension ``dim`` writes
        ``parameters/{space_type}/H{dim}_row.npy`` (two columns: filtration
        value, Betti number), saves the figure to
        ``parameters/{space_type}/H{dim}`` and shows it.
    """
    for dim, segments in enumerate(diagrams):
        # Sorted distinct endpoints: the only places where the count changes.
        epsilons = np.unique(segments)

        # +1 at each birth, -1 at each death. ``np.add.at`` is unbuffered,
        # so several bars sharing an endpoint are all accounted for.
        deltas = np.zeros(len(epsilons))
        np.add.at(deltas, np.searchsorted(epsilons, segments[:, 0]), 1)
        np.add.at(deltas, np.searchsorted(epsilons, segments[:, 1]), -1)
        homologies_num = np.cumsum(deltas)

        # Draw on a fresh figure so the curve of a previous dimension can
        # never leak into this plot, whatever backend ``plt.show`` runs on.
        fig = plt.figure()
        plt.plot(epsilons, homologies_num)

        plt.title(f'$H_{dim}$ dynamics', fontsize=14)
        # Raw string: ``\e`` is not a valid escape sequence in a normal
        # string literal and triggers a warning on recent Python versions.
        plt.xlabel(r'$\epsilon$', fontsize=12)
        plt.ylabel(f'$H_{dim}$    ', rotation='horizontal', fontsize=12)
        plt.grid()

        np.save(
            f'parameters/{space_type}/H{dim}_row.npy',
            np.vstack((epsilons, homologies_num)).T
        )
        plt.savefig(f'parameters/{space_type}/H{dim}', dpi=1200)
        plt.show()
        plt.close(fig)

def compute_persistence(embeddings, space_type):
    """Run ripser on the embeddings, store the result and plot Betti curves.

    Args:
        embeddings: Collection of vectors accepted by ``pairwise_distances``.
        space_type: Name of the sub-directory of ``parameters/`` that
            receives ``persistence.npy`` and the plots.
    """
    # ripser is given an explicit Euclidean distance matrix rather than the
    # raw points, and is limited to homology dimensions 0 and 1.
    distances = pairwise_distances(embeddings, metric='euclidean')
    persistence = ripser(distances, distance_matrix=True, maxdim=1)

    output_path = f'parameters/{space_type}/persistence.npy'
    np.save(output_path, persistence)

    diagrams = persistence['dgms']
    plot_betti_numbers_dynamics(diagrams, space_type)

In [None]:
# Compute persistence for both kinds of 8-dimensional Russian embeddings.
for embeddings_type in ('SVD', 'CBoW'):
    dict_path = f'data/Russian/Russian_dict_{embeddings_type}_8.npy'
    # The file holds a pickled Python object wrapped in a 0-d array;
    # ``.item()` unwraps it.
    embeddings_dict = np.load(dict_path, allow_pickle=True).item()

    embedding_matrix = np.array(list(embeddings_dict.values()))
    compute_persistence(embedding_matrix, embeddings_type)

## CBoW lower bound

In [None]:
# Both files hold a pickled Python object wrapped in a 0-d array;
# ``.item()`` unwraps it.
embedding_dict = np.load(
    'data/Russian/Russian_dict_CBoW_8.npy', allow_pickle=True
).item()
synonym_dict = np.load(
    'data/Russian_synonym_dict.npy', allow_pickle=True
).item()

# Tokens that have a CBoW embedding.
dict_tokens = set(embedding_dict)

In [6]:
# Each group is the lower-cased head word followed by its listed synonyms.
synonym_gropus = [
    [key.lower()] + synonyms for key, synonyms in synonym_dict.items()
]

max_dist_in_synonym_groups = []
for synonym_group in synonym_gropus:
    # Keep only the lexemes that actually have an embedding.
    embeddings = [
        embedding_dict[lexeme]
        for lexeme in synonym_group
        if lexeme in embedding_dict
    ]
    if not embeddings:
        continue

    # Largest pairwise distance between members of the group.
    max_dist_in_synonym_groups.append(pairwise_distances(embeddings).max())

print(len(synonym_gropus), np.max(max_dist_in_synonym_groups).round(3))

(2658, np.float64(1.017))
