In [None]:
# Step up one directory so the relative paths used below ('src', 'lib', 'data', 'models', ...)
# are resolved from the parent directory.
# NOTE(review): this is not idempotent — re-running the cell moves up another level.
%cd ..

In [None]:
import sys

# Add 'src' and 'lib' (relative to the current working directory) to the module search path
# so the modules imported in the next cell can be found.
sys.path.extend(('src', 'lib'))

In [None]:
from pathlib import Path
from importlib import reload
from typing import Callable
import csv
import math
import itertools
import random

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
import voyager
import cv2
from matplotlib import pyplot as plt
from pytorch_metric_learning import losses
from tslearn.metrics import dtw
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRanker, Pool

import data
import utils
import prediction
import metrics
import features
import embeddings

In [None]:
# Directories used by the rest of the notebook; all are relative to the working directory.
data_dir = Path('data')
models_dir = Path('models')
embeddings_dir = Path('voc_embeddings')

In [None]:
# Load the train / validation datasets: each one pairs a binary dataset file (.npz)
# with the reference words read from the matching .ref file.
# Every path goes through `data_dir` so the data location is configured in one place.
train_ds = data.Dataset(
    data.BinaryDataset.load(data_dir / 'train.npz'),
    data.read_words(data_dir / 'train.ref'),
)
val_ds = data.Dataset(
    data.BinaryDataset.load(data_dir / 'valid.npz'),
    data.read_words(data_dir / 'valid.ref'),
)
val_500_ds = data.Dataset(
    data.BinaryDataset.load(data_dir / 'valid_500.npz'),
    data.read_words(data_dir / 'valid_500.ref'),
)

# Vocabulary and keyboard grids shared by the preprocessors and feature extractors below.
vocabulary = utils.Vocabulary.load(data_dir / 'vocabulary.csv')
keyboard_grids = utils.load_grids(data_dir / 'grids.json')

In [None]:
def calc_candidates_recall(candidates_generator, ds, batch_size=500):
    """Compute the fraction of samples whose true word is among the generated candidates.

    Args:
        candidates_generator: callable that takes a list of traces and returns one
            list of candidates per trace; each candidate exposes a `.word` attribute.
        ds: iterable of samples exposing `.trace` and `.word`.
        batch_size: number of samples passed to the generator per call
            (defaults to 500, the previously hard-coded value).

    Returns:
        Mean of the per-sample "found" flags (NaN when `ds` is empty, as produced
        by `np.mean` on an empty list).
    """
    found_flags = []
    for batch in utils.batch_iterable(ds, batch_size):
        batch_candidates = candidates_generator([sample.trace for sample in batch])
        # A sample counts as found when any of its candidates carries the reference word.
        found_flags.extend(
            any(sample.word == candidate.word for candidate in candidates)
            for sample, candidates in zip(batch, batch_candidates)
        )
    return np.mean(found_flags)

In [None]:
# Preprocessors that turn traces into model inputs: `img_preprocessor` is passed to the
# MLP/CNN embedders' consumers below, `seq_preprocessor` to the LSTM one.
img_preprocessor = embeddings.data.ImgPreprocessor(keyboard_grids, vocabulary)
seq_preprocessor = embeddings.data.SeqPreprocessor(keyboard_grids, vocabulary)

In [None]:
# List the contents of the 'models' directory (to pick a checkpoint name for the cells below).
%ls models

In [None]:
# MLP image embedder: weights are read from `models_dir`, the embeddings stored under
# the same name from `embeddings_dir`.
# mlp_model_name = 'emb_mlp_v3_triplet_100'
mlp_model_name = 'emb_mlp_v4_best'
# The model is only used for inference in this notebook, so put it into eval mode
# (the LSTM embedder is already created with .eval()).
mlp_model = embeddings.models.MLPImgEmbedder().cuda().eval()
mlp_model.load_state_dict(torch.load(models_dir / f'{mlp_model_name}.pt'))
mlp_embs = dict(np.load(embeddings_dir / f'{mlp_model_name}.npz'))

In [None]:
# CNN image embedder: weights are read from `models_dir`, the embeddings stored under
# the same name from `embeddings_dir`.
cnn_model_name = 'emb_cnn_v0_best'
# The model is only used for inference in this notebook, so put it into eval mode
# (the LSTM embedder is already created with .eval()).
cnn_model = embeddings.models.ConvImgEmbedder().cuda().eval()
cnn_model.load_state_dict(torch.load(models_dir / f'{cnn_model_name}.pt'))
cnn_embs = dict(np.load(embeddings_dir / f'{cnn_model_name}.npz'))

In [None]:
# LSTM embedder (created on the GPU in eval mode): weights are read from `models_dir`,
# the embeddings stored under the same name from `embeddings_dir`.
lstm_model_name = 'emb_rnn_v0_best'
lstm_model = embeddings.models.LSTMEmbedder().cuda().eval()
lstm_model.load_state_dict(torch.load(models_dir / f'{lstm_model_name}.pt'))
lstm_embs = dict(np.load(embeddings_dir / f'{lstm_model_name}.npz'))

In [None]:
%%time
# Build the candidate generator on top of the CNN embedder and its stored embeddings.
# NOTE(review): the meaning of the trailing positional arguments (2000, 64, 6) is not
# visible in this notebook — confirm against FAISSEmbeddingCandidateGenerator.
# cnn_candidate_gen = embedding.EmbeddingCandidateGenerator(cnn_model, cnn_embs, img_preprocessor, vocabulary, keyboard_grids, 2000, 64, 7)
cnn_candidate_gen = embeddings.utils.FAISSEmbeddingCandidateGenerator(cnn_model, cnn_embs, img_preprocessor, vocabulary, keyboard_grids, 2000, 64, 6)
# calc_candidates_recall(cnn_candidate_gen, val_ds[:1000])

In [None]:
class MergingCandidateGenerator:
    """Candidate generator that concatenates the output of several generators.

    Every wrapped generator is called with the full list of traces. The results are
    combined positionally, so each generator is expected to return one candidate
    list per trace, in the same order as the traces.
    """

    def __init__(self, generators: list[Callable[[list[utils.Trace]], list[list[utils.Candidate]]]]):
        """Store the generators; they are queried in the given order."""
        self.generators = generators

    def __call__(self, traces: list[utils.Trace]) -> list[list[utils.Candidate]]:
        """Return, for each trace, the candidates of all generators chained together.

        Candidates keep the order of `self.generators`; duplicates are not removed.
        With no generators configured, one empty candidate list per trace is returned.
        """
        if not self.generators:
            # zip() over nothing would yield an empty result and lose the
            # one-list-per-trace alignment that callers zip against.
            return [[] for _ in traces]
        per_generator = [generate(traces) for generate in self.generators]
        return [
            list(itertools.chain.from_iterable(per_trace))
            for per_trace in zip(*per_generator)
        ]

In [None]:
# Dict-based feature extractor: feature name -> calculator.
# NOTE(review): `predictors` and `embedding` are not among the modules imported at the top
# of this notebook (`prediction`, `features` and `embeddings` are). This cell looks stale
# and will raise NameError on a fresh kernel unless those names are defined elsewhere — confirm.
features_extractor = predictors.FeaturesExtractor({
    'popularity': predictors.PopularityCalculator(vocabulary),
    'interpolated_dtw': predictors.InterpolatedDTWCalculator(100),
    'target_length': predictors.target_trace_length,
    'candidate_length': predictors.candidate_trace_length,
    'mlp_dist': embedding.EmbeddingDistCalculator(img_preprocessor, mlp_model, mlp_embs, vocabulary),
    'cnn_dist': embedding.EmbeddingDistCalculator(img_preprocessor, cnn_model, cnn_embs, vocabulary),
})

In [None]:
# Build a scorer dataset from `ranker_ds_size` training samples starting at
# `ds_start_offset`, then split it into train and validation parts.
# NOTE(review): `predictors` is not imported in this notebook (only `prediction` is), so
# this cell raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
# NOTE(review): train_test_split is called without a seed in this cell.
ranker_ds_size = 10_000
ds_start_offset = 5_000_000
ranker_x, ranker_y = predictors.make_scorer_ds(
    tqdm(itertools.islice(train_ds, ds_start_offset, ds_start_offset + ranker_ds_size), total=ranker_ds_size), 
    cnn_candidate_gen, 
    features_extractor, 
    keyboard_grids,
    # random.choice,
    predictors.ExpSampler(.001),
    batch_size=1000,
)
ranker_train_x, ranker_val_x, ranker_train_y, ranker_val_y = train_test_split(ranker_x, ranker_y)

In [None]:
# Array-based feature extractor used by the ranking dataset and the scoring ranker.
# The calculators are referenced through the modules this notebook actually imports
# (`prediction`, `features`, `embeddings.utils`); the previous `predictors.` / `embedding.`
# prefixes are not imported and raised NameError on a fresh kernel.
features_extractor_np = prediction.FeaturesExtractorNP([
    features.PopularityCalculator(vocabulary),
    features.InterpolatedDTWCalculator(100),
    features.target_trace_length,
    features.candidate_trace_length,
    features.trace_length_diff,
    features.trace_length_ratio,
    features.keyboard_grid,
    embeddings.utils.EmbeddingDistCalculator(img_preprocessor, mlp_model, mlp_embs, vocabulary),
    embeddings.utils.EmbeddingDistCalculator(img_preprocessor, cnn_model, cnn_embs, vocabulary),
    embeddings.utils.EmbeddingDistCalculator(seq_preprocessor, lstm_model, lstm_embs, vocabulary),
])

In [None]:
# Build the group-wise ranking datasets (features, labels, group ids, pairs) that the
# CatBoostRanker cell consumes. Seeds are fixed so the split and sampling are repeatable.
np.random.seed(42)
random.seed(42)
ranker_ds_size = 100#_000
ds_start_offset = 5_000_000
ranker_ds = train_ds[ds_start_offset:ds_start_offset + ranker_ds_size]
ranker_train_ds, ranker_val_ds = train_test_split(ranker_ds)
sampler = prediction.ExpSampler(.001)
ranker_train_features, ranker_train_labels, ranker_train_groups, ranker_train_pairs = prediction.make_ranking_ds(
    tqdm(ranker_train_ds), cnn_candidate_gen, features_extractor_np, keyboard_grids, sampler, 5, 1000
)
# Same call as for the training split. The callee used to read
# `predictionfeatures.InterpolatedDTWCalculator(100).make_ranking_ds`, a paste accident
# that raised NameError.
ranker_val_features, ranker_val_labels, ranker_val_groups, ranker_val_pairs = prediction.make_ranking_ds(
    tqdm(ranker_val_ds), cnn_candidate_gen, features_extractor_np, keyboard_grids, sampler, 5, 1000
)

In [None]:
# Feature extractors for the pairwise setup, split in two groups:
# `trace_features_extractor` holds the target-trace length and keyboard-grid features,
# `candidates_features_extractor` holds the candidate-related calculators
# (popularity, DTW, length features and the three embedding distances).
trace_features_extractor = prediction.FeaturesExtractorNP([
    features.target_trace_length,
    features.keyboard_grid,
])
candidates_features_extractor = prediction.FeaturesExtractorNP([
    features.PopularityCalculator(vocabulary),
    features.InterpolatedDTWCalculator(100),
    features.candidate_trace_length,
    features.trace_length_diff,
    features.trace_length_ratio,
    embeddings.utils.EmbeddingDistCalculator(img_preprocessor, mlp_model, mlp_embs, vocabulary),
    embeddings.utils.EmbeddingDistCalculator(img_preprocessor, cnn_model, cnn_embs, vocabulary),
    embeddings.utils.EmbeddingDistCalculator(seq_preprocessor, lstm_model, lstm_embs, vocabulary),
])

In [None]:
%%time
# Build the pairwise classification datasets from 200_000 training samples starting at
# `ds_start_offset`. Seeds are fixed before the split and sampling.
# NOTE(review): this rebinds `ranker_ds`, `ranker_train_ds`, `ranker_val_ds` and `sampler`
# from the ranking-dataset cell above.
# NOTE(review): the meaning of the trailing positional arguments (20, 500) is not visible
# here — confirm against prediction.make_pairs_ds.
np.random.seed(42)
random.seed(42)
ranker_ds_size = 200_000
ds_start_offset = 5_000_000
ranker_ds = train_ds[ds_start_offset:ds_start_offset + ranker_ds_size]
ranker_train_ds, ranker_val_ds = train_test_split(ranker_ds)
sampler = prediction.ExpSampler(.001)
pair_train_features, pair_train_labels = prediction.make_pairs_ds(
    tqdm(ranker_train_ds), cnn_candidate_gen, trace_features_extractor, candidates_features_extractor, sampler, 20, 500
)
pair_val_features, pair_val_labels = prediction.make_pairs_ds(
    tqdm(ranker_val_ds), cnn_candidate_gen, trace_features_extractor, candidates_features_extractor, sampler, 20, 500
)

In [None]:
# Persist the pairwise datasets to 'pairs_v0.npz' in the current working directory.
np.savez('pairs_v0.npz', 
    pair_train_features=pair_train_features,
    pair_train_labels=pair_train_labels,
    pair_val_features=pair_val_features,
    pair_val_labels=pair_val_labels,
)

In [None]:
# NOTE(review): imports placed mid-notebook; `torch` is already imported in the import cell.
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

# TabNet classifier trained on the pairwise dataset, with the validation pairs as eval set.
# The learning rate starts at 5e-2 and StepLR multiplies it by 0.5 every 5 scheduler steps.
clf = TabNetClassifier(
    n_d=64, n_steps=2, seed=42, 
    device_name='cuda', optimizer_params={'lr': 5e-2}, 
    scheduler_fn=torch.optim.lr_scheduler.StepLR, scheduler_params={'gamma':.5, 'step_size':5},
)
clf.fit(pair_train_features, pair_train_labels, eval_set=[(pair_val_features, pair_val_labels)], patience=10, batch_size=32768)

In [None]:
# Wrap the classifier's `predict` in a pairwise ranker that uses the two feature extractors.
# NOTE(review): `clf` is whichever classifier cell was executed last.
ranker = prediction.PairwiseRanker(clf.predict, trace_features_extractor, candidates_features_extractor)

In [None]:
# CatBoost classifier on the same pairwise data; this rebinds both `clf` and `ranker`.
# od_type='Iter' with od_wait=200 configures CatBoost's overfitting detector.
clf = CatBoostClassifier(
    loss_function='CrossEntropy', eval_metric='AUC', 
    depth=12, n_estimators=2000, od_type='Iter', od_wait=200,
    random_state=42,
)
clf.fit(pair_train_features, pair_train_labels, eval_set=(pair_val_features, pair_val_labels))
ranker = prediction.PairwiseRanker(clf.predict, trace_features_extractor, candidates_features_extractor)

In [None]:
# CatBoost classifier on the scorer dataset split; the ranker scores with column 0 of predict_proba.
# NOTE(review): depends on `ranker_train_x` / `ranker_val_x` and `features_extractor` from the cells
# that use `predictors`, which is not imported in this notebook — this cell looks stale; confirm.
clf = CatBoostClassifier(eval_metric='Accuracy', depth=6, n_estimators=2000)
clf.fit(ranker_train_x, ranker_train_y, eval_set=(ranker_val_x, ranker_val_y))
ranker = predictors.ScoringRanker(lambda x: clf.predict_proba(x)[:, 0], features_extractor)

In [None]:
# Same scoring setup, fitted on the whole scorer dataset (no eval set) with 1000 estimators.
# NOTE(review): depends on `ranker_x` / `ranker_y` and `features_extractor` from the cells that use
# `predictors`, which is not imported in this notebook — this cell looks stale; confirm.
clf = CatBoostClassifier(eval_metric='Accuracy', depth=6, n_estimators=1000)
clf.fit(ranker_x, ranker_y)
ranker = predictors.ScoringRanker(lambda x: clf.predict_proba(x)[:, 0], features_extractor)

In [None]:
%%time
# Group-wise CatBoost ranker (QuerySoftMax loss) trained on the ranking dataset on the GPU.
# NOTE(review): training passes pairs=None while the eval Pool is given `ranker_val_pairs`
# — confirm the asymmetry is intended.
clf = CatBoostRanker(
    depth=4, n_estimators=2000, loss_function='QuerySoftMax', 
    od_type='Iter', od_wait=100,
    learning_rate=.2, task_type='GPU',
    random_seed=42
)
clf.fit(
    ranker_train_features, ranker_train_labels,
    pairs=None, group_id=ranker_train_groups,
    eval_set=Pool(ranker_val_features, ranker_val_labels, pairs=ranker_val_pairs, group_id=ranker_val_groups),
)
# ranker = predictors.ScoringRanker(lambda x: -clf.predict(x.reshape(x.shape[0] * x.shape[1], x.shape[2])), features_extractor_np)

In [None]:
def _ranker_func(x):
    # print(x.shape)
    ranks = np.arange(len(x)).reshape((-1, 1)) % 2000
    return -clf.predict(np.concatenate((x, ranks), axis=1))
# NOTE(review): `predictors` is not imported in this notebook (only `prediction` is) — this line
# raises NameError on a fresh kernel unless the name is defined elsewhere; confirm.
ranker = predictors.ScoringRanker(_ranker_func, features_extractor_np)

In [None]:
%%time
# Quick check: run the predictor on the first 1000 validation traces and compute MRR
# against the matching reference words.
# NOTE(review): `ranker` is whichever ranker cell was executed last; the meaning of the
# third Predictor argument (200) is not visible here — confirm.
predictor = prediction.Predictor(cnn_candidate_gen, ranker, 200)
res = predictor(tqdm([s.trace for s in itertools.islice(val_ds, 1000)]))
metrics.mrr_iterables(res, val_ds.words[:1000])

In [None]:
# Load the test set; unlike train/validation it is not wrapped in data.Dataset with reference words.
test_ds = data.BinaryDataset.load(data_dir / 'test.npz')

In [None]:
# Run the predictor over the test set and save the results.
# NOTE(review): the output file name mentions tabnet, but `predictor` uses whichever ranker
# was active when it was built — confirm they match before submitting.
res = predictor(tqdm(test_ds))
data.save_results('result/test_emb_pw_tabnet.csv', res)

In [None]:
%%time
# Full validation run: predict for every validation trace and compute MRR against all reference words.
# NOTE(review): this rebinds `res`, overwriting the test-set results from the previous cell.
# predictor = predictors.Predictor(cnn_candidate_gen, ranker, 2000)
res = predictor(tqdm([s.trace for s in val_ds]))
metrics.mrr_iterables(res, val_ds.words)