In [None]:
# Switch the working directory to the parent folder so that the relative paths used
# below ('src', 'lib', 'data', 'models', 'voc_embeddings') resolve from there.
# NOTE(review): this is not idempotent - re-running the cell moves up one more level.
%cd ..

In [None]:
import sys

# Make the `src` and `lib` directories importable (the project modules imported
# below are expected to live there). The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
for extra_dir in ('src', 'lib'):
    if extra_dir not in sys.path:
        sys.path.append(extra_dir)

In [None]:
from pathlib import Path
from importlib import reload
import csv
import math
import itertools

import numpy as np
import torch
from tqdm.auto import tqdm
import voyager
import cv2
from matplotlib import pyplot as plt
from pytorch_metric_learning import losses

import voc, fcom
import data, utils, predictors, embedding, embedding_img

In [None]:
# Project directories, all relative to the current working directory.
data_dir = Path('data')  # datasets, vocabulary and keyboard grids
models_dir = Path('models')  # model checkpoints (*.pt)
embeddings_dir = Path('voc_embeddings')  # precomputed word embeddings (*.npz)

In [None]:
# Load the three dataset splits. Each split pairs a binary `.npz` dataset with the
# reference words read from the matching `.ref` file.
# Every path is derived from `data_dir` so that relocating the data folder only
# requires changing one variable (train/valid previously hard-coded 'data/...').
train_ds = data.Dataset(
    data.BinaryDataset.load(data_dir / 'train.npz'),
    data.read_words(data_dir / 'train.ref'),
)
val_ds = data.Dataset(
    data.BinaryDataset.load(data_dir / 'valid.npz'),
    data.read_words(data_dir / 'valid.ref'),
)
val_500_ds = data.Dataset(
    data.BinaryDataset.load(data_dir / 'valid_500.npz'),
    data.read_words(data_dir / 'valid_500.ref'),
)

# Shared resources used by the preprocessor and the candidate generator below.
vocabulary = utils.Vocabulary.load(data_dir / 'vocabulary.csv')
keyboard_grids = utils.load_grids(data_dir / 'grids.json')

In [None]:
# One preprocessor, built from the keyboard grids and the vocabulary, is shared by
# all pair datasets.
preprocessor = embedding.SeqPreprocessor(keyboard_grids, vocabulary)
# Wrap every split in a TargetPairDataset for embedding training / validation.
# NOTE(review): the trailing 500 is positional; its meaning is defined by
# TargetPairDataset's signature - confirm and consider a named constant.
train_pair_ds = embedding.TargetPairDataset(train_ds, vocabulary, preprocessor, 500)
val_pair_ds = embedding.TargetPairDataset(val_ds, vocabulary, preprocessor, 500)
val_500_pair_ds = embedding.TargetPairDataset(val_500_ds, vocabulary, preprocessor, 500)

In [None]:
# Development helper: re-import the project modules after editing their source.
# NOTE(review): objects created before this cell (datasets, preprocessor, ...) keep
# referring to the classes from the previous module version; re-create them after
# reloading. `%autoreload 2` is the usual alternative to manual reload cells.
reload(data)
reload(utils)
reload(embedding)

In [None]:
# Seed both RNGs before the model is constructed so that any random
# initialisation is repeatable between runs.
torch.manual_seed(42)
np.random.seed(42)

# Checkpoint location for this experiment.
model_name = 'emb_tf_v0_best'
model_path = models_dir / f'{model_name}.pt'

# Transformer embedder placed on the GPU.
model = embedding.TransformerEmbedder(emb_dim=64, n_heads=4, n_layers=4)
model = model.cuda()

# Adam optimiser; the learning rate is halved every 10 scheduler steps.
# NOTE(review): recent PyTorch releases deprecate the schedulers' `verbose`
# argument - check against the installed version.
opt = torch.optim.Adam(model.parameters(), lr=8e-5)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5, verbose=True)

In [None]:
# Train the embedder on the training pairs, validating on the validation pairs,
# with a contrastive loss and `embedding.center_dist_penalty`.
# NOTE(review): the two 100s and the 0.0 are positional; their meaning is defined
# by `embedding.train`'s signature - confirm and prefer keyword arguments.
embedding.train(
    model,
    opt,
    scheduler,
    train_pair_ds,
    val_pair_ds,
    100,
    100,
    losses.ContrastiveLoss(),
    embedding.center_dist_penalty,
    0.0,
    model_path,
)

In [None]:
# Persist the model's current weights to `model_path`.
# NOTE(review): `model_path` was also handed to `embedding.train`; if training
# already checkpoints the best model there (as the `_best` suffix suggests), this
# overwrites it with the final weights - confirm that this is intended.
torch.save(model.state_dict(), model_path)

In [None]:
# Restore the weights stored at `model_path` and switch the model to evaluation
# mode before computing embeddings.
model.load_state_dict(torch.load(model_path))
model = model.eval()

In [None]:
# Read the full word list and display the length of its longest word.
all_words = data.read_words(data_dir / 'voc.txt')
max(len(word) for word in all_words)

In [None]:
# Embed every word of `all_words` with the trained model. The result is later
# unpacked with `**` into `np.savez`, so it is a mapping with string keys.
# NOTE(review): keys are presumably keyboard grid names and 5000 presumably a
# batch size - confirm against `embedding.calculate_word_embeddings`.
grid_word_embeddings = embedding.calculate_word_embeddings(model, all_words, keyboard_grids, preprocessor, 5000)

In [None]:
# Store the embeddings as one array per key in `<embeddings_dir>/<model_name>.npz`.
# Create the output directory first: np.savez does not create missing parent
# directories and would otherwise fail on a fresh checkout.
embeddings_dir.mkdir(parents=True, exist_ok=True)
np.savez(embeddings_dir / f'{model_name}.npz', **grid_word_embeddings)

In [None]:
# Move the model back to the CPU.
# NOTE(review): the next cell rebinds `model` to a freshly built LSTMEmbedder, so
# this presumably only serves to release the transformer's GPU memory.
model = model.cpu()

In [None]:
# Switch to the previously trained LSTM embedder and its precomputed word
# embeddings. Paths are derived from `models_dir` / `embeddings_dir`, consistent
# with the rest of the notebook.
model = embedding.LSTMEmbedder()
# The model lives on the CPU here, so map the checkpoint tensors to the CPU as
# well; this also lets the cell run on machines without a GPU.
model.load_state_dict(torch.load(models_dir / 'emb_lstm_v0.pt', map_location='cpu'))
# Inference only from here on: use evaluation mode, as done for the transformer
# model after loading its checkpoint.
model = model.eval()
grid_word_embeddings = np.load(embeddings_dir / 'emb_lstm_v0.npz')

In [None]:
# Development helper: re-import the project modules after editing their source.
# NOTE(review): instances created earlier (e.g. `model`, `preprocessor`) are not
# updated by a reload and still use the previous class definitions.
reload(data)
reload(utils)
reload(embedding)

In [None]:
# Build the candidate generator from the current model, the per-key word
# embeddings, the preprocessor and the vocabulary.
# NOTE(review): 500, 64 and 8 are positional; a later cell overrides
# `n_candidates`, so presumably one of them is the candidate count - confirm
# against `EmbeddingCandidateGenerator`'s signature.
candidate_gen = embedding.EmbeddingCandidateGenerator(model, grid_word_embeddings, preprocessor, vocabulary, 500, 64, 8)

In [None]:
# Override the generator's `n_candidates` attribute in place for the checks below.
# NOTE(review): mutating the generator makes later cells depend on whether this
# cell was run; consider passing the value at construction time instead.
candidate_gen.n_candidates = 1000

In [None]:
# Spot check on the first 30 samples of the 500-sample validation set: print
# whether the target word is among the generated candidates, then the word itself.
for sample in itertools.islice(val_500_ds, 30):
    candidate_words = [candidate.word for candidate in candidate_gen(sample.trace)]
    print(sample.word in candidate_words, sample.word)

In [None]:
# Fraction of the first 100 validation samples whose target word appears among
# the generated candidates.
flags = [
    sample.word in [candidate.word for candidate in candidate_gen(sample.trace)]
    for sample in itertools.islice(val_500_ds, 100)
]
np.mean(flags)

In [None]:
def default_emedding(word):
    """Return the stored embedding of `word` under the 'default' key.

    The word's position in `vocabulary.words` is used as the row index into
    `grid_word_embeddings['default']`.

    NOTE(review): this assumes the rows of the embedding array are ordered like
    `vocabulary.words` - verify against the word list the embeddings were
    computed from. A word missing from `vocabulary.words` makes `.index` raise
    (ValueError if `words` is a list).
    """
    default_embeddings = grid_word_embeddings['default']
    word_idx = vocabulary.words.index(word)
    return default_embeddings[word_idx]

In [None]:
# Euclidean distance between the embeddings of two similarly spelled words.
embedding_diff = default_emedding('побег') - default_emedding('пробег')
np.linalg.norm(embedding_diff)