# Advanced Tutorial

> Tutorial on advanced updates, pruning and gradient queries

## Overview

The previous tutorial gave an overview of the main abstractions in `emb_opt` for basic hill climbing. This notebook goes over some more advanced query-update strategies.

In [None]:
from emb_opt.imports import *
from emb_opt.schemas import Query, Item, Batch, ScoreResponse, FilterResponse
from emb_opt.plugins.huggingface import HugggingfaceDataPlugin
from emb_opt.update import RLUpdate
from emb_opt.runner import Runner
from emb_opt.utils import build_batch_from_embeddings

import string
from datasets import Dataset

import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


## Setup

To start, we'll set up the same dataset, filter function, and score function from the previous notebook.

In [None]:
def get_dataset(n_vectors: int=10000, size: int=64, seed: int=42):
    """
    Build a synthetic dataset of random embedding vectors with a FAISS index.

    Each row contains:
      - 'index'     : the row number
      - 'item'      : a random 10-character lowercase string
      - 'rand'      : a random float drawn with `np.random.rand`
      - 'embedding' : a vector of `size` values drawn with `np.random.randn`

    Parameters
    ----------
    n_vectors : number of rows to generate
    size : length of each embedding vector
    seed : value passed to `np.random.seed`. The default (42) reproduces the
        dataset this function generated when the seed was hard-coded.

    Returns
    -------
    The `Dataset` built from the rows, with a FAISS index added on the
    'embedding' column.

    Note: this seeds NumPy's global random state as a side effect.
    """
    np.random.seed(seed)
    vectors = np.random.randn(n_vectors, size)
    alphabet = list(string.ascii_lowercase)

    rows = []
    for row_id in range(n_vectors):
        # Draw order matters for reproducibility: for every row the 'item'
        # characters are sampled first, then the 'rand' value.
        item = ''.join(np.random.choice(alphabet, size=10).tolist())
        rand = np.random.rand()
        rows.append({
            'index' : row_id,
            'item' : item,
            'rand' : rand,
            'embedding' : vectors[row_id],
        })

    vector_dataset = Dataset.from_list(rows)
    vector_dataset.add_faiss_index('embedding')

    return vector_dataset

def get_data_plugin(dataset: Dataset, k: int=10, distance_cutoff: Optional[float]=None):
    """
    Wrap `dataset` in a `HugggingfaceDataPlugin`.

    The plugin is configured to use the 'embedding' index, with 'item' as the
    item key and 'index' as the id key. `k` and `distance_cutoff` are passed
    through to the plugin unchanged.
    # NOTE(review): `k` is presumably the number of results returned per query
    # and `distance_cutoff` a maximum allowed distance - confirm against the
    # plugin's documentation.
    """
    plugin_kwargs = dict(
        k=k,
        dataset=dataset,
        index_name='embedding',
        item_key='item',
        id_key='index',
        distance_cutoff=distance_cutoff,
    )
    return HugggingfaceDataPlugin(**plugin_kwargs)

In [None]:
def score_embeddings(embeddings: np.ndarray, sigma: float=5.) -> np.ndarray:
    """
    Score each embedding by its closeness to the point with every coordinate 0.75.

    The Euclidean distance from each row to the target is divided by the square
    root of the embedding dimension, then mapped through a Gaussian kernel
    `exp(-0.5 * (distance / sigma)**2)`, so an embedding exactly on the target
    scores 1.0 and scores decay towards 0 with distance.

    Parameters
    ----------
    embeddings : 2-D array with one embedding per row
    sigma : width of the Gaussian kernel

    Returns
    -------
    1-D array with one score per row of `embeddings`.
    """
    n_dims = embeddings.shape[1]
    target = np.full(n_dims, .75)

    offsets = embeddings - target
    normalized = np.linalg.norm(offsets, axis=1) / np.sqrt(n_dims)

    return np.exp(-0.5 * np.square(normalized/sigma))

def score_plugin(inputs: List[Item]) -> List[ScoreResponse]:
    """
    Score a list of items from their embeddings.

    Stacks each item's `embedding` into one array, scores it with
    `score_embeddings`, and wraps every score in a `ScoreResponse` marked
    valid with no extra data.

    Parameters
    ----------
    inputs : items to score; each must expose an `embedding` attribute

    Returns
    -------
    One `ScoreResponse` per input item, in input order. An empty input
    yields an empty list.
    """
    # An empty list would stack into a 1-D array of shape (0,), and
    # `score_embeddings` would then fail reading `shape[1]`.
    if len(inputs) == 0:
        return []

    embeddings = np.array([item.embedding for item in inputs])
    scores = score_embeddings(embeddings)
    return [ScoreResponse(valid=True, score=score, data=None) for score in scores]

In [None]:
def filter_plugin(inputs: List[Item]) -> List[FilterResponse]:
    """
    Filter items on the 'rand' value stored in their `data`.

    An item is marked valid when its 'rand' value is below 0.9. The 'rand'
    value is echoed back in each response's `data`.

    Returns one `FilterResponse` per input item, in input order.
    """
    responses = []
    for item in inputs:
        rand = item.data['rand']
        responses.append(FilterResponse(valid=rand<0.9, data={'rand':rand}))
    return responses

In [None]:
def get_input_batch(n_queries: int=5, size: int=64, seed: int=40):
    """
    Create a batch of random initial queries.

    Parameters
    ----------
    n_queries : number of query embeddings to generate
    size : length of each query embedding
    seed : value passed to `np.random.seed`. The default (40) reproduces the
        queries this function generated when the seed was hard-coded.

    Returns
    -------
    The result of `build_batch_from_embeddings` applied to an
    `(n_queries, size)` array drawn with `np.random.randn`.

    Note: this seeds NumPy's global random state as a side effect.
    """
    np.random.seed(seed)
    initial_queries = np.random.randn(n_queries, size)
    return build_batch_from_embeddings(initial_queries)

Here we create the dataset and an input batch.

In [None]:
# Generate the 10,000-row synthetic dataset, FAISS-indexed on 'embedding'
dataset = get_dataset(n_vectors=10000)

100%|█████████████████████████████████████████| 10/10 [00:00<00:00, 1100.84it/s]


In [None]:
# Build the starting batch of random queries with the default settings;
# the trailing expression displays the batch's type as the cell output
input_batch = get_input_batch()
type(input_batch)

emb_opt.schemas.Batch

## Batch



In [None]:
# filter