In [1]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.6.1-py3-none-any.whl (385 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.4/385.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting elastic-transport<9,>=8
  Using cached elastic_transport-8.4.0-py3-none-any.whl (59 kB)
Installing collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.4.0 elasticsearch-8.6.1


In [1]:
!curl elasticsearch:9200

{
  "name" : "91afebcfa735",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "ilew_pR7RhqgqmPQv8pb2Q",
  "version" : {
    "number" : "8.5.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "4ed5ee9afac63de92ec98f404ccbed7d3ba9584e",
    "build_date" : "2022-12-05T18:22:22.226119656Z",
    "build_snapshot" : false,
    "lucene_version" : "9.4.2",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


In [2]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch(
    hosts=[{"scheme": "http", "host": "elasticsearch", "port": 9200}],
)

In [3]:
es_client.info()

ObjectApiResponse({'name': '91afebcfa735', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ilew_pR7RhqgqmPQv8pb2Q', 'version': {'number': '8.5.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '4ed5ee9afac63de92ec98f404ccbed7d3ba9584e', 'build_date': '2022-12-05T18:22:22.226119656Z', 'build_snapshot': False, 'lucene_version': '9.4.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Building a dummy dataset

In [4]:
import pandas as pd
df = pd.DataFrame([[1, 'A', [1, 0, 1, 1]],
                   [2, 'B', [1, 0, 1, 1]],
                   [3, 'C', [1, 0, 1, 1]],
                   [4, 'A', [1, 1, 1, 1]],
                   [5, 'A', [1, 0.5, 1, 1]],
                   [6, 'A', [1, 0.1, 1, 1]],
                  ], columns=['shop_id', 'attribute_1', 'vector'])
df

Unnamed: 0,shop_id,attribute_1,vector
0,1,A,"[1, 0, 1, 1]"
1,2,B,"[1, 0, 1, 1]"
2,3,C,"[1, 0, 1, 1]"
3,4,A,"[1, 1, 1, 1]"
4,5,A,"[1, 0.5, 1, 1]"
5,6,A,"[1, 0.1, 1, 1]"


# Index data to Elasticsearch

## Create an index

In [11]:
try:
    es_client.indices.delete(index=index_name)
except Exception as e:
    print(e)

NotFoundError(404, 'index_not_found_exception', 'no such index [shops]', shops, index_or_alias)


In [12]:
index_name = 'shops'
settings = {
    "number_of_shards": 1,
    "number_of_replicas": 1
}
mappings = {
    'properties': {
        'shop_id': {'type': 'long'},
        # 'attribute_1': {'index': 'not_analyzed', 'format': 'dateOptionalTime', 'type': 'date'},
        'attribute_1': {'type': 'keyword'},
        'embedding': {'type': 'dense_vector', 
                      'dims': 4,
                      'index': True,
                      # 'similarity': 'dot_product'
                      'similarity': 'cosine'
                     },
}}
request_body = {
    "settings" : settings,
    'mappings': mappings
}
es_client.indices.create(index=index_name, settings=settings, mappings=mappings)
# es_client.indices.create(index=index_name, body=request_body)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'shops'})

## Bulk Insert of docs

In [13]:
actions = []
for i, row in enumerate(df.iterrows()):
    action = {'index': {'_index': index_name, '_id': i}}
    doc = {
        'shop_id': row[1].shop_id,
        'attribute_1': row[1].attribute_1,
        'embedding': row[1].vector
    }
    actions.append(action)
    actions.append(doc)
es_client.bulk(index=index_name, operations=actions)

ObjectApiResponse({'took': 7, 'errors': False, 'items': [{'index': {'_index': 'shops', '_id': '0', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'shops', '_id': '1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'shops', '_id': '2', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'shops', '_id': '3', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'shops', '_id': '4', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 4, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 's

In [14]:
# check all docs have been inserted
result = es_client.count(index=index_name)
print(result.body['count'])

6


## Query all

In [15]:
results = es_client.search(index=index_name, query={'match_all': {}})
for result in results['hits']['hits']:
    print(result)

{'_index': 'shops', '_id': '0', '_score': 1.0, '_source': {'shop_id': 1, 'attribute_1': 'A', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '1', '_score': 1.0, '_source': {'shop_id': 2, 'attribute_1': 'B', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '2', '_score': 1.0, '_source': {'shop_id': 3, 'attribute_1': 'C', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '3', '_score': 1.0, '_source': {'shop_id': 4, 'attribute_1': 'A', 'embedding': [1, 1, 1, 1]}}
{'_index': 'shops', '_id': '4', '_score': 1.0, '_source': {'shop_id': 5, 'attribute_1': 'A', 'embedding': [1, 0.5, 1, 1]}}
{'_index': 'shops', '_id': '5', '_score': 1.0, '_source': {'shop_id': 6, 'attribute_1': 'A', 'embedding': [1, 0.1, 1, 1]}}


## KNN Search

In [16]:
query_vector = [1, 0.11, 1, 1]

### Brute Force KNN
#### Without Filter

In [17]:
query = {
    'script_score': {
        'query' : {'match_all': {}},
        'script': {
            "source": "cosineSimilarity(params.queryVector, 'embedding')",
            "params": {
              "queryVector": query_vector
            }
    }
    },
    }
results = es_client.search(query=query)
for result in results['hits']['hits']:
    print(result)

{'_index': 'shops', '_id': '5', '_score': 0.99998343, '_source': {'shop_id': 6, 'attribute_1': 'A', 'embedding': [1, 0.1, 1, 1]}}
{'_index': 'shops', '_id': '0', '_score': 0.9979894, '_source': {'shop_id': 1, 'attribute_1': 'A', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '1', '_score': 0.9979894, '_source': {'shop_id': 2, 'attribute_1': 'B', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '2', '_score': 0.9979894, '_source': {'shop_id': 3, 'attribute_1': 'C', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '4', '_score': 0.976416, '_source': {'shop_id': 5, 'attribute_1': 'A', 'embedding': [1, 0.5, 1, 1]}}
{'_index': 'shops', '_id': '3', '_score': 0.89597464, '_source': {'shop_id': 4, 'attribute_1': 'A', 'embedding': [1, 1, 1, 1]}}


#### With Filter

In [18]:
query = {
    'script_score': {
        'query' : {'bool': {'filter': {'term': {'attribute_1': 'A'}}}},
        'script': {
            "source": "cosineSimilarity(params.queryVector, 'embedding')",
            "params": {
              "queryVector": query_vector
            }
    }
    },
    }
results = es_client.search(query=query)
for result in results['hits']['hits']:
    print(result)

{'_index': 'shops', '_id': '5', '_score': 0.99998343, '_source': {'shop_id': 6, 'attribute_1': 'A', 'embedding': [1, 0.1, 1, 1]}}
{'_index': 'shops', '_id': '0', '_score': 0.9979894, '_source': {'shop_id': 1, 'attribute_1': 'A', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '4', '_score': 0.976416, '_source': {'shop_id': 5, 'attribute_1': 'A', 'embedding': [1, 0.5, 1, 1]}}
{'_index': 'shops', '_id': '3', '_score': 0.89597464, '_source': {'shop_id': 4, 'attribute_1': 'A', 'embedding': [1, 1, 1, 1]}}


### Approximate KNN
#### Without filter

In [19]:
TOP_K = 10
NUM_CANDIDATES = 1000 # candidates per shard
knn = {
    'field': 'embedding',
    'query_vector': query_vector,
    'k': TOP_K,
    'num_candidates': NUM_CANDIDATES
}
results = es_client.search(knn=knn)
for result in results['hits']['hits']:
    print(result)

{'_index': 'shops', '_id': '5', '_score': 0.9999918, '_source': {'shop_id': 6, 'attribute_1': 'A', 'embedding': [1, 0.1, 1, 1]}}
{'_index': 'shops', '_id': '0', '_score': 0.9989947, '_source': {'shop_id': 1, 'attribute_1': 'A', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '1', '_score': 0.9989947, '_source': {'shop_id': 2, 'attribute_1': 'B', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '2', '_score': 0.9989947, '_source': {'shop_id': 3, 'attribute_1': 'C', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '4', '_score': 0.98820794, '_source': {'shop_id': 5, 'attribute_1': 'A', 'embedding': [1, 0.5, 1, 1]}}
{'_index': 'shops', '_id': '3', '_score': 0.9479873, '_source': {'shop_id': 4, 'attribute_1': 'A', 'embedding': [1, 1, 1, 1]}}


#### With Filter

In [20]:
TOP_K = 10
NUM_CANDIDATES = 1000 # candidates per shard
knn = {
    'field': 'embedding',
    'query_vector': query_vector,
    'k': TOP_K,
    'num_candidates': NUM_CANDIDATES,
    "filter": {
      "term": {'attribute_1': 'A'}
    }
}
results = es_client.search(knn=knn)
for result in results['hits']['hits']:
    print(result)

{'_index': 'shops', '_id': '5', '_score': 0.9999918, '_source': {'shop_id': 6, 'attribute_1': 'A', 'embedding': [1, 0.1, 1, 1]}}
{'_index': 'shops', '_id': '0', '_score': 0.9989947, '_source': {'shop_id': 1, 'attribute_1': 'A', 'embedding': [1, 0, 1, 1]}}
{'_index': 'shops', '_id': '4', '_score': 0.98820794, '_source': {'shop_id': 5, 'attribute_1': 'A', 'embedding': [1, 0.5, 1, 1]}}
{'_index': 'shops', '_id': '3', '_score': 0.9479873, '_source': {'shop_id': 4, 'attribute_1': 'A', 'embedding': [1, 1, 1, 1]}}


# References on Elastic

* [Dense vectors](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html)

* [KNN search](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html)

* [Vector functions](https://www.elastic.co/guide/en/elasticsearch/reference/master/query-dsl-script-score-query.html#vector-functions)


