In [None]:
import os

import torch
from datasets import load_dataset
from elasticsearch import Elasticsearch
from transformers import T5Model, T5Tokenizer

# Create an Elasticsearch index

A local Elasticsearch cluster must be running before you continue with this part.

In [None]:
# Connection settings for the local cluster.
# The password is read from the environment so the credential never lives in the
# notebook (or its git history). Set it before starting Jupyter, e.g.
#   export ELASTIC_PASSWORD=...
# NOTE: a password was previously committed in this cell and should be rotated.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "")
if not ELASTIC_PASSWORD:
    print("ELASTIC_PASSWORD is not set; authentication against the cluster will fail.")
ES_HOST = "https://localhost:9200/"
index_name = "semantic-search"

In [None]:
# Build the Elasticsearch client: authenticate as the "elastic" user and verify
# the server certificate against the CA file stored next to the notebook.
client = Elasticsearch(
    hosts=ES_HOST,
    basic_auth=("elastic", ELASTIC_PASSWORD),
    ca_certs='./http_ca.crt',
)


In [None]:
# Request basic cluster information; a successful response also confirms that the
# host, certificate and credentials configured above are accepted.
client.info()

In [None]:
# Index definition: a full-text field plus a kNN-searchable vector field.
config = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "embeddings": {
                "type": "dense_vector",
                # Must equal the length of the vectors returned by get_embeddings
                # (512 is the hidden size of t5-small).
                "dims": 512,
                "index": True,
                # Spelled out on purpose: Elasticsearch 8.x releases prior to 8.11
                # require `similarity` whenever `index` is true.
                "similarity": "cosine",
            },
        },
    },
    "settings": {
        "number_of_shards": 2,
        "number_of_replicas": 1,
    },
}

# Create the index only when it is missing. This keeps the cell safe to re-run
# and lets genuine failures (invalid mapping, connection or authentication
# errors) surface instead of being reported as "already exists".
if client.indices.exists(index=index_name):
    print(f"Index already exists: {index_name}")
else:
    client.indices.create(
        index=index_name,
        settings=config["settings"],
        mappings=config["mappings"],
    )

# Create embeddings

In [None]:
# Load the '18828_alt.atheism' configuration of the 'newsgroup' dataset.
dataset = load_dataset('newsgroup', '18828_alt.atheism')
# Display the text of the first training example as a quick sanity check.
dataset['train'][0]['text']

# TODO: compute the embeddings in batches

In [None]:
# Load the pretrained t5-small checkpoint: the model first, then its tokenizer.
# Both objects become the default arguments of get_embeddings below.
model = T5Model.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [None]:
def get_embeddings(input_text, model=model, tokenizer=tokenizer, max_length=512):
    """Embed text as the mean of the model's final hidden states over real tokens.

    Args:
        input_text: A string, or a list of strings to embed as one batch.
        model: T5Model used to produce the hidden states.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum number of tokens per input; longer inputs are truncated.

    Returns:
        A list holding one embedding (a list of floats) per input text. A single
        string therefore yields ``[[...]]`` and callers take element ``[0]``.
    """
    # Pad only up to the longest input in the batch; padding positions are masked
    # out below, so padding every text to max_length would only cost compute.
    inputs = tokenizer(
        input_text,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Inference only: skip autograd bookkeeping to save memory and time.
    # NOTE(review): the pooled states come from the decoder fed with the input ids;
    # pooling encoder states is the more common choice for sentence embeddings.
    # Switching would change the vectors and require re-indexing.
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=input_ids,
            decoder_attention_mask=attention_mask,
        )

    hidden_states = outputs[0]

    # Masked mean: padding positions must not contribute to the average.
    token_mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    pooled = (hidden_states * token_mask).sum(dim=1) / token_counts

    return pooled.tolist()

In [None]:
# Work with the first 10 training examples only.
small_dataset = dataset['train'].select(range(10))

In [None]:
# Index each document together with its embedding.
# The text column is read once up front; indexing it inside the loop would rebuild
# the whole column on every access.
# The row position doubles as the document id, so re-running this cell overwrites
# the same documents instead of appending duplicates.
for doc_id, text in enumerate(small_dataset['text']):
    doc = {
        "text": text,
        "embeddings": get_embeddings(text)[0],
    }
    client.index(index=index_name, id=str(doc_id), document=doc)

# Refresh so the documents are visible to the count and search cells that follow;
# otherwise they only show up after the next periodic refresh.
client.indices.refresh(index=index_name)

In [None]:
# Ask Elasticsearch how many documents the index currently holds and print the number.
result = client.count(index=index_name)
print(result.body['count'])

# Search!

After indexing is finished, we can search our data. The `embeddings` field is compared using cosine similarity, and the Elasticsearch Python client provides a wrapper for kNN search. You also have the option to use a custom similarity function.

I have provided a code snippet for KNN with k=5. Remember that you have to provide embeddings for your query term as well.

In [None]:
# Embed the query text with the same function used for the indexed documents.
# Row-first access fetches a single example instead of materialising the whole
# text column just to read one entry.
query_embedding = get_embeddings(dataset['train'][20]['text'])[0]
query_dict = {
    "field": "embeddings",
    "query_vector": query_embedding,
    "k": 5,
    # Candidates are gathered per shard before the top k are kept; a pool larger
    # than k improves the accuracy of the final result.
    "num_candidates": 50,
}
# NOTE(review): knn_search is deprecated in newer Elasticsearch releases in favour
# of the `knn` option of client.search; migrate once the client version is confirmed.
res = client.knn_search(index=index_name, knn=query_dict, source=["text"])