In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import json
import uuid

# Shared Elasticsearch client used by every search in this notebook.
# NOTE(review): the host is hardcoded; assumes a node is reachable on localhost:9200.
es = Elasticsearch(["http://localhost:9200"])

In [2]:
import os
import sys

# Put the sibling `ml_service` directory first on the import path.
# The path is relative, so it depends on the notebook's working directory;
# presumably this is what lets the later `main` / `model` imports resolve.
sys.path.insert(0, os.path.join(os.pardir, 'ml_service'))

In [3]:
# Directory holding the model artifacts: `additional_data`, two levels above
# the current working directory (the path is left un-normalised).
artifacts_dir = os.path.join(os.getcwd(), *([os.pardir] * 2), 'additional_data')

In [4]:
# Export the location of each artifact file through an environment variable.
# Presumably these are read by MLService when it is constructed -- confirm
# against ml_service/main.py.
os.environ.update({
    'EMB_PATH_GLOVE': os.path.join(artifacts_dir, 'glove.6B.50d.txt'),
    'EMB_PATH_KNRM': os.path.join(artifacts_dir, 'embeddings.bin'),
    'MLP_PATH': os.path.join(artifacts_dir, 'knrm_mlp.bin'),
    'VOCAB_PATH': os.path.join(artifacts_dir, 'vocab.json'),
})

In [6]:
from main import MLService
from model import RankingDataset, collate_fn

In [16]:
class CandidateModel:
    """Candidate retriever backed by an Elasticsearch index.

    Runs a fuzzy ``match`` query against the ``question`` field of the
    configured index and returns the stored id/text of every hit.
    """

    def __init__(self, es, index_name):
        """Store the search client and the name of the index to query.

        Args:
            es: client object exposing ``search(index=..., body=...)``.
            index_name: name of the index that holds the questions.
        """
        self.es = es
        self.index_name = index_name

    def _fuzzy_search(self, query, size):
        """Return the raw hit list for a fuzzy match of ``query``.

        Args:
            query: text matched against the ``question`` field.
            size: value sent as the request's ``size`` (number of hits asked for).

        Returns:
            The ``["hits"]["hits"]`` list of the search response.
        """
        match_clause = {"query": query, "fuzziness": "AUTO"}
        body = {
            "size": size,
            "query": {"match": {"question": match_clause}},
        }
        hits_section = self.es.search(index=self.index_name, body=body)["hits"]
        return hits_section["hits"]

    def query(self, q, size=10):
        """Search for ``q`` and return ``(index, question)`` pairs.

        Args:
            q: query text.
            size: number of hits to request (default 10).

        Returns:
            A list of tuples built from each hit's ``_source['index']`` and
            ``_source['question']``, in the order the hits were returned.
        """
        pairs = []
        for hit in self._fuzzy_search(q, size=size):
            source = hit['_source']
            pairs.append((source['index'], source['question']))
        return pairs

In [17]:
# Retriever bound to the shared client and the 'question_base' index.
candidate_model = CandidateModel(es, 'question_base')

In [21]:
# Example query used by the step-by-step cells below.
q = 'How to learn to skate?'

In [20]:
from typing import Dict, List, Tuple, Callable

import numpy as np
import torch

class RankingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing one query with each retrieved candidate.

    Item ``idx`` is a dict with two lists of vocabulary indices:
    ``'query'`` (identical for every item) and ``'document'`` (candidate
    number ``idx``).

    Args:
        query: raw query text.
        candidates: candidate texts; stored as a dict keyed by position.
        vocab: mapping from token to integer index.
        oov_val: index used for tokens missing from ``vocab``.
        preproc_func: callable turning a text into a sequence of tokens.
        max_len: stored on the instance; not applied anywhere in this class.
    """

    def __init__(self, query: str, candidates: List,
                 vocab: Dict[str, int], oov_val: int,
                 preproc_func: Callable, max_len: int = 30):
        self.query = query
        # Positional lookup table: 0 -> first candidate, 1 -> second, ...
        self.candidates = dict(enumerate(candidates))
        self.vocab = vocab
        self.oov_val = oov_val
        self.preproc_func = preproc_func
        self.max_len = max_len

    def __len__(self) -> int:
        """Return the number of candidates held by this dataset."""
        # Use the instance's own candidates. Reading a notebook-level
        # `candidates` variable fails on a fresh kernel and reports the wrong
        # length whenever the dataset was built from a different list.
        return len(self.candidates)

    def _tokenized_text_to_index(self, tokenized_text: List[str]) -> List[int]:
        """Map tokens to vocabulary indices, using ``oov_val`` for unknown tokens."""
        return [self.vocab.get(token, self.oov_val) for token in tokenized_text]

    def _convert_text_to_token_idxs(self, text: str) -> List[int]:
        """Preprocess ``text`` with ``preproc_func`` and convert it to indices."""
        tokenized_text = self.preproc_func(text)
        return self._tokenized_text_to_index(tokenized_text)

    def __getitem__(self, idx: int):
        """Return the index-encoded query/document pair for candidate ``idx``.

        Raises:
            KeyError: if ``idx`` is not a valid candidate position.
        """
        return {
            'query': self._convert_text_to_token_idxs(self.query),
            'document': self._convert_text_to_token_idxs(self.candidates[idx]),
        }

In [22]:
# Build the ranking service imported from `main`.
# Presumably it loads its artifacts from the *_PATH environment variables set
# earlier in this notebook -- confirm against ml_service/main.py.
ml_service = MLService()

  embedding_matrix = torch.load(emb_path)
  self.mlp.load_state_dict(torch.load(mlp_path))


In [46]:
# Retrieve up to 20 fuzzy matches for `q` and keep only the question text,
# dropping the stored id that comes first in each (id, question) pair.
candidates = [question for _doc_id, question in candidate_model.query(q, size=20)]

INFO:elastic_transport.transport:POST http://localhost:9200/question_base/_search [status:200 duration:0.065s]


In [47]:
# Display the retrieved candidates.
# NOTE(review): the saved output shows (id, question) tuples, which does not
# match the text-only list built in the previous cell -- it looks stale.
candidates

[('408483',
  'Is it easier to learn to skate on figure skates or hockey skates?'),
 ('178676', 'How long does it take to learn to ice skate?'),
 ('355016', 'What are some tips to learn how to skate backwards?'),
 ('232101', 'Do kids ever learn to ice skate before they learn to walk?'),
 ('49378', 'How do you ice skate?'),
 ('60127', 'I have learnt kathak so should I also learn contemporary now?'),
 ('93189', 'How does one learn to learn?'),
 ('75243', 'How can an individual learn how to learn?'),
 ('499175', 'Learn how to code or learn how to sell?'),
 ('148509', "How did Richard Muller 'learn to learn'?"),
 ('148508', '"How did Richard Muller "" Learn to learn""?"'),
 ('242524', 'How does a person learn to learn?'),
 ('355215', 'How children learn?'),
 ('30999', 'How do you play Skate 3 on PC?'),
 ('347786', 'How do I learn to learn the confident trait?'),
 ('41481', 'How do learn Telepathy?'),
 ('137596', 'How do people learn?'),
 ('237079', 'How to learn piano?'),
 ('14720', 'How t

In [28]:
# Wrap the query and its candidates in a dataset, reusing the service's
# vocabulary, its 'OOV' index for unknown tokens, and its preprocessing function.
ds = RankingDataset(query=q,
                candidates=candidates,
                vocab=ml_service.vocab,
                oov_val=ml_service.vocab['OOV'],
                preproc_func=ml_service.simple_preproc)

In [32]:
# Batch the dataset in its original order (no shuffling, no worker processes)
# using the service's batch size and the collate function imported from `model`.
dl = torch.utils.data.DataLoader(
    dataset=ds,
    batch_size=ml_service.dataloader_bs,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
)

# Score every (query, candidate) pair with the ranking service.
score = ml_service.predict(dl)

In [42]:
def pipeline(query, size=20):
    """Retrieve candidates for ``query`` and reorder them by model score.

    Steps: fuzzy search through ``candidate_model``, index-encoding with
    ``RankingDataset``, scoring with ``ml_service.predict``, then reordering
    with ``np.argsort``.

    Args:
        query: raw query text.
        size: number of candidates to request from the retriever
            (default 20, the value previously hard-coded here).

    Returns:
        A NumPy array of candidate question texts ordered by ``np.argsort``
        of the scores, i.e. ascending score. An empty array is returned when
        the retriever finds nothing.
    """
    hits = candidate_model.query(query, size=size)
    candidates = [question for _doc_id, question in hits]
    if not candidates:
        # Nothing to rank: skip the model instead of handing it an empty loader.
        return np.array(candidates)

    ds = RankingDataset(query=query,
                        candidates=candidates,
                        vocab=ml_service.vocab,
                        oov_val=ml_service.vocab['OOV'],
                        preproc_func=ml_service.simple_preproc)
    # shuffle=False keeps batches aligned with `candidates`, so the scores can
    # be used to index the candidate array below.
    dl = torch.utils.data.DataLoader(
        ds,
        batch_size=ml_service.dataloader_bs,
        num_workers=0,
        collate_fn=collate_fn,
        shuffle=False)

    score = ml_service.predict(dl)
    # NOTE(review): argsort is ascending, so the lowest-scored candidate comes
    # first -- confirm this is the intended ranking direction.
    return np.array(candidates)[np.argsort(score)]

In [45]:
# End-to-end demo of retrieval plus ranking.
# NOTE(review): "learm" looks like a deliberate misspelling to exercise the
# fuzzy match -- confirm before "fixing" it.
pipeline('How to learm python?')

INFO:elastic_transport.transport:POST http://localhost:9200/question_base/_search [status:200 duration:0.028s]


array(['Should I learn Python?', 'Learn Python?',
       'What is the Python conda root on iPython?',
       'What is better to learn first python2x or Python 3x?',
       'How do I learn Python for scripting?',
       'How do I learn Python at home?',
       'Python (programming language): How can I learn Python quickly and efficiently?',
       'How do I learn Python in depth?',
       'How can you learn Python algorithms?',
       'How can I learn advanced Python?',
       'How can I learn python online?', 'How do I learn python online?',
       'How do I learn Python?', 'How do I learn Python?',
       'How do I learn python?', 'How do I learn Python systematically?',
       'How can I learn phython?', 'How important is it to learn Python?',
       'How easy is it to learn Python?',
       'How difficult is it to learn Python?'], dtype='<U78')