# runner

> Classes for running an embedding optimization search and logging its results

In [None]:
#| default_exp runner

In [None]:
#| hide
from nbdev.showdoc import *
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from emb_opt.imports import *
from emb_opt.utils import pack_dataframe
from emb_opt.core import VectorDatabase, Score, Filter, PassThroughFilter
from emb_opt.query_update import QueryUpdate

from emb_opt.backends.hf import HFDatabase
from emb_opt.query_update import RLUpdate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#| export

class SearchLog():
    'Logs results from `Runner`, one entry per search iteration'
    def __init__(self):
        # iteration -> {'queries': query vectors, 'results': results as a pandas DataFrame}
        self.batch_log = {}
        
    def add_entry(self, 
                  iteration: int, # key the entry is stored under
                  query_vectors: np.ndarray, # query vectors used for this iteration
                  query_results: Dataset # results produced for `query_vectors`
                 ):
        'Stores `query_vectors` and `query_results` (converted with `to_pandas`) under `iteration`, replacing any entry already logged for it'
        self.batch_log[iteration] = {'queries' : query_vectors, 'results' : query_results.to_pandas()}
        
    def compile_results(self) -> pd.DataFrame:
        """Merges all logged results into one DataFrame with a single row per `db_idx`.

        When the same `db_idx` was logged more than once, the first logged row is
        kept. The per-query columns `query_idx` and `distance` are dropped and the
        rows are sorted by `score`, highest first. Returns an empty DataFrame when
        no result rows have been logged.
        """
        frames = [entry['results'] for entry in self.batch_log.values()]
        # zero-row frames contribute nothing, and `pd.concat` rejects an empty list
        frames = [frame for frame in frames if len(frame) > 0]
        if not frames:
            return pd.DataFrame()

        combined = pd.concat(frames, ignore_index=True)
        unique = combined.drop_duplicates(subset='db_idx', keep='first')
        # renumber before sorting so the index reflects first-seen order
        unique = unique.drop(columns=['query_idx', 'distance']).reset_index(drop=True)
        return unique.sort_values('score', ascending=False)
    
    def compile_trajectories(self) -> dict:
        """Collects, for every query, its query vector and scores at each logged iteration.

        Returns `{query_idx: {'query_vectors': array, 'scores': list}}`, where
        `query_vectors` stacks that query's vector from every iteration (in
        ascending iteration order) and `scores` holds one entry per iteration.
        The number of queries is taken from the earliest logged iteration.
        Returns an empty dict when nothing has been logged.
        """
        if not self.batch_log:
            return {}

        # walk the keys actually logged instead of assuming they are exactly 0..n-1
        iterations = sorted(self.batch_log)
        n_queries = self.batch_log[iterations[0]]['queries'].shape[0]
        trajectories = {i:{'query_vectors':[], 'scores':[]} for i in range(n_queries)}
        
        for iteration in iterations:
            entry = self.batch_log[iteration]
            queries = entry['queries']
            # presumably groups the `score` values by `query_idx` -- confirm against `emb_opt.utils.pack_dataframe`
            score_dict = pack_dataframe(entry['results'], 'query_idx', ['score'])
            
            for query_idx, trajectory in trajectories.items():
                trajectory['query_vectors'].append(queries[query_idx])
                # NOTE(review): assumes every query has at least one result row in each iteration -- confirm
                trajectory['scores'].append(score_dict[query_idx]['score'])

        for trajectory in trajectories.values():
            trajectory['query_vectors'] = np.stack(trajectory['query_vectors'])
            
        return trajectories

## Runner

The `Runner` class combines a `VectorDatabase`, a `Score`, a `QueryUpdate` and an optional `Filter` to search a vector database for high-scoring items. The search loop, repeated for a set number of iterations:

1. Start with `query_vectors`
2. Query `VectorDatabase` with `query_vectors` to get `query_results`
3. Filter `query_results` with `Filter`
4. Score `query_results` with `Score`
5. Use `QueryUpdate` to generate a new set of `query_vectors`

In [None]:
#| export

class Runner():
    'Runs embedding optimization search'
    def __init__(self, 
                 vector_db: VectorDatabase, # vector database backend
                 score: Score, # score function
                 query_update: QueryUpdate, # query update
                 filter: Optional[Filter]=None # optional filter
                ):
        
        self.vector_db = vector_db
        self.score = score
        self.query_update = query_update
        # compare against None so a supplied filter is never discarded for being falsy
        self.filter = PassThroughFilter() if filter is None else filter
        
    def step(self, 
             iteration: int, # key the results are logged under
             query_vectors: np.ndarray, # query vectors for this step
             log: Optional[SearchLog]=None # optional log that receives this step's queries and results
            ):
        'Runs one query -> filter -> score -> update round and returns the updated query vectors'
        query_results = self.vector_db.query(query_vectors)
        query_results = self.filter(query_results)
        query_results = self.score(query_results)

        # `is not None` rather than truthiness: a log object must not be skipped for evaluating falsy
        if log is not None:
            log.add_entry(iteration, query_vectors, query_results)

        return self.query_update(query_vectors, query_results)
        
    def search(self, 
               query_vectors: np.ndarray, # initial query vectors
               iterations: int # number of `step` rounds to run
              ) -> SearchLog:
        'Runs `iterations` steps, feeding each step\'s output into the next, and returns the log of every step'
        log = SearchLog()
        
        for i in range(iterations):
            query_vectors = self.step(i, query_vectors, log)
            
        return log

In [None]:
def dummy_score(row):
    'Toy score: the norm of the row\'s `embedding`, as computed by `np.linalg.norm` with default arguments'
    embedding = row['embedding']
    return np.linalg.norm(embedding)

# Build a toy dataset of 128 random 256-dimensional vectors and index the `embedding` column with faiss
vectors = np.random.randn(128, 256)
vector_dataset = Dataset.from_list([{'embedding' : i} for i in vectors])
vector_dataset.add_faiss_index('embedding')

# presumably 10 is the number of results returned per query -- confirm against `HFDatabase`
db = HFDatabase(vector_dataset, 'embedding', 10)
score = Score(dummy_score)
# presumably 0.5 is the update's learning rate -- confirm against `RLUpdate`
update_strategy = RLUpdate(0.5)

# no filter is passed, so `Runner` falls back to `PassThroughFilter`
runner = Runner(db, score, update_strategy)

# 4 small random starting queries, searched for 5 iterations
query_vectors = np.random.randn(4, 256)/10
log = runner.search(query_vectors, 5)

res = log.compile_results()
traj = log.compile_trajectories()

# exercise `step` directly, first without a log and then with one
log = SearchLog()
_ = runner.step(0, query_vectors)
_ = runner.step(0, query_vectors, log)

100%|████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2007.80it/s]
                                                                                             

In [None]:
#| hide
# regenerate the library modules from the notebooks' exported cells
import nbdev
nbdev.nbdev_export()