# Using Elasticsearch




# 01 Install and set up Elasticsearch

In [None]:

!pip install elasticsearch
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.1-linux-x86_64.tar.gz
!tar -xzf elasticsearch-7.10.1-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.10.1
!elasticsearch-7.10.1/bin/elasticsearch -d

--2024-06-13 10:08:16--  https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.1-linux-x86_64.tar.gz
Resolving artifacts.elastic.co (artifacts.elastic.co)... 34.120.127.130, 2600:1901:0:1d7::
Connecting to artifacts.elastic.co (artifacts.elastic.co)|34.120.127.130|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 318801277 (304M) [application/x-gzip]
Saving to: ‘elasticsearch-7.10.1-linux-x86_64.tar.gz’


2024-06-13 10:08:26 (32.5 MB/s) - ‘elasticsearch-7.10.1-linux-x86_64.tar.gz’ saved [318801277/318801277]

uncaught exception in thread [main]
java.lang.RuntimeException: can not run elasticsearch as root
	at org.elasticsearch.bootstrap.Bootstrap.initializeNatives(Bootstrap.java:111)
	at org.elasticsearch.bootstrap.Bootstrap.setup(Bootstrap.java:178)
	at org.elasticsearch.bootstrap.Bootstrap.init(Bootstrap.java:393)
	at org.elasticsearch.bootstrap.Elasticsearch.init(Elasticsearch.java:170)
	at org.elasticsearch.bootstrap.Elasticsearch.execute(E

In [None]:
import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import time


In [None]:
# Connect to Elasticsearch
es = Elasticsearch(['http://localhost:9200'])

# Wait for Elasticsearch to start.
# Poll the node instead of sleeping a fixed 30 s: the notebook continues as soon
# as the server answers, and fails with a clear message if it never comes up.
_startup_deadline = time.monotonic() + 120
while not es.ping():
    if time.monotonic() > _startup_deadline:
        raise RuntimeError(
            "Elasticsearch did not respond on http://localhost:9200 within 120 s"
        )
    time.sleep(2)

In [None]:
# Synthetic corpus: 1000 random vectors with 128 components each.
embeddings = []
for _ in range(1000):
    embeddings.append(np.random.rand(128))

# Name of the index that will hold the vectors.
index_name = "embeddings"

# Index mapping: a single "embedding" field stored as a 128-dim dense_vector.
mapping = {"mappings": {"properties": {"embedding": {"type": "dense_vector", "dims": 128}}}}



In [None]:
# Create the index, but only when it is not already there, so this cell can be
# re-run without raising an "already exists" error.
index_already_present = es.indices.exists(index=index_name)
if not index_already_present:
    es.indices.create(index=index_name, body=mapping)

def generate_actions(embeddings, target_index=None):
    """Yield one bulk-index action per embedding.

    Args:
        embeddings: Iterable of vectors; NumPy arrays and plain sequences of
            numbers are both accepted.
        target_index: Name of the index to write to. When omitted, the
            notebook-level ``index_name`` is looked up at call time, which is
            what this function always did before the parameter existed.

    Yields:
        dict: An action with ``_index``, ``_id`` (the position of the vector in
        ``embeddings``) and ``_source`` holding the vector as a plain list under
        the ``"embedding"`` key.
    """
    # Only fall back to the global when the caller did not name an index.
    destination = index_name if target_index is None else target_index
    for position, vector in enumerate(embeddings):
        yield {
            "_index": destination,
            "_id": position,
            "_source": {
                # asarray() lets callers pass lists as well as ndarrays.
                "embedding": np.asarray(vector).tolist()
            }
        }



In [1]:
# Use bulk indexing for efficiency
bulk(es, generate_actions(embeddings))

# Refresh so the vectors just written are visible to the search that follows;
# without it, a query issued right after the bulk call can miss documents.
es.indices.refresh(index=index_name)

# Random 128-dim probe vector used as the search key.
query_embedding = np.random.rand(128)

# script_score query: every document matches (match_all) and is scored by the
# cosine similarity between its stored vector and the probe, shifted by 1.0 so
# the resulting score is never negative.
query = dict(
    query=dict(
        script_score=dict(
            query=dict(match_all={}),
            script=dict(
                source="cosineSimilarity(params.query_embedding, 'embedding') + 1.0",
                params=dict(query_embedding=query_embedding.tolist()),
            ),
        )
    )
)

In [None]:
# Execute the search: run the script_score query against the vector index.
# The query body sets no "size", so the server's default number of hits applies
# (10 according to the Elasticsearch search API docs; confirm for your version).
response = es.search(index=index_name, body=query)

In [None]:
# Pull the hit list out of the response and keep only (document id, score) pairs.
results = response["hits"]["hits"]
nearest_neighbors = list(map(lambda hit: (hit["_id"], hit["_score"]), results))

print("Nearest Neighbors:", nearest_neighbors)

In [None]:
# Health check: ask the local node for its banner. "Connection refused" means
# nothing is listening on port 9200, i.e. the server is not running.
!curl http://localhost:9200/


curl: (7) Failed to connect to localhost port 9200 after 0 ms: Connection refused


In [None]:
# Reconnect with the same explicit URL as the first connection cell. A bare
# Elasticsearch() depends on client defaults, and 8.x clients refuse to be
# constructed without a host.
es = Elasticsearch(['http://localhost:9200'])

# Create an index in Elasticsearch

In [None]:
def create_index(index_name="sbert_embeddings", dims=768):
    """Create a single-shard index for text plus a dense embedding vector.

    The index has a ``text`` field and an ``embedding`` field of type
    ``dense_vector``. An index that already exists is left untouched; its
    mapping (including vector size) is NOT changed to match ``dims``.

    Args:
        index_name: Name of the index to create.
        dims: Number of components in each embedding vector. It must match the
            model that produces the vectors, or indexing will be rejected.

    Returns:
        None. A status line is printed instead.
    """
    # Check explicitly rather than passing ignore=400 to create(): that flag
    # hides every bad-request error (for example an invalid mapping), not only
    # "index already exists", and the old code then reported success regardless.
    if es.indices.exists(index=index_name):
        print(f"Index {index_name} already exists.")
        return

    settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "embedding": {"type": "dense_vector", "dims": dims}
            }
        }
    }
    es.indices.create(index=index_name, body=settings)
    print(f"Index {index_name} created.")


# Index documents

In [None]:
def index_documents(documents, embeddings, index_name="sbert_embeddings"):
    """Bulk-index texts together with their embedding vectors.

    Documents and embeddings are paired positionally with ``zip``, so surplus
    items in the longer input are ignored. Document ids are not set here.

    Args:
        documents: Iterable of strings to store in the ``text`` field.
        embeddings: Iterable of vectors (NumPy arrays or plain sequences), one
            per document, stored in the ``embedding`` field.
        index_name: Index to write to.

    Returns:
        None. A status line is printed instead.
    """
    actions = (
        {
            "_index": index_name,
            "_source": {
                "text": doc,
                # asarray() lets callers pass lists as well as ndarrays.
                "embedding": np.asarray(embedding).tolist()
            }
        }
        for doc, embedding in zip(documents, embeddings)
    )
    # The notebook imports `bulk` directly; the name `helpers` is never
    # imported, so the previous `helpers.bulk(...)` raised NameError.
    bulk(es, actions)
    # Make the new documents searchable right away so a query issued straight
    # after indexing does not come back empty.
    es.indices.refresh(index=index_name)
    print("Documents indexed.")


# Query by embedding

In [None]:
def search_by_embedding(query_embedding, index_name="sbert_embeddings", top_n=10):
    """Return the documents whose stored vectors are closest to a query vector.

    Every document in the index is scored with
    ``cosineSimilarity(query, embedding) + 1.0`` and the best ``top_n`` hits
    are requested.

    Args:
        query_embedding: Query vector as a NumPy array or a plain sequence of
            numbers; its length must match the index's ``embedding`` field.
        index_name: Index to search.
        top_n: Maximum number of hits to return.

    Returns:
        list: The raw hit dictionaries found under ``response['hits']['hits']``.
    """
    # Send the vector as a plain list of Python floats, as the first example in
    # this notebook does, instead of relying on the client to serialise ndarrays.
    query_vector = np.asarray(query_embedding).tolist()
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_embedding, 'embedding') + 1.0",
                "params": {"query_embedding": query_vector}
            }
        }
    }
    response = es.search(index=index_name, body={"query": script_query, "size": top_n})
    return response['hits']['hits']


# Example

In [None]:
# Small demo corpus.
documents = [
    "Hello world",
    "Elasticsearch is a search engine based on Lucene",
    "Annoy is a C++ library with Python bindings",
    "SBERT is a modification of the pretrained BERT network"
]

# Placeholder vectors standing in for SBERT output: one random 768-dim row per
# document. They are NOT real sentence embeddings.
embeddings = np.random.random_sample((len(documents), 768))


In [None]:
# Create the "sbert_embeddings" index (its mapping declares 768-dim vectors),
# then load the example documents together with their vectors.
create_index(index_name="sbert_embeddings")
index_documents(documents, embeddings, index_name="sbert_embeddings")


In [None]:
# Example query. The text is illustrative only: the vector actually sent is a
# random 768-dim placeholder (in practice, generate it from query_text using SBERT).
query_text = "search engine technology"
query_embedding = np.random.random_sample(768)

# Ask for the three closest documents.
search_results = search_by_embedding(
    query_embedding,
    index_name="sbert_embeddings",
    top_n=3,
)

# One line per hit: the stored text and its similarity score.
for result in search_results:
    print(f"Document: {result['_source']['text']}, Score: {result['_score']}")


In [None]:
from sentence_transformers import SentenceTransformer

# Initialize SBERT model.
# The "sbert_embeddings" mapping declares 768-dim vectors, so the checkpoint
# must emit 768 values per sentence. 'all-MiniLM-L6-v2' emits 384 and its
# vectors would be rejected at index time; 'all-mpnet-base-v2' emits 768.
model = SentenceTransformer('all-mpnet-base-v2')

documents = [
    "Hello world",
    "Elasticsearch is a search engine based on Lucene",
    "Annoy is a C++ library with Python bindings",
    "SBERT is a modification of the pretrained BERT network"
]

# Generate embeddings in one batched call (one row per document)
embeddings = np.asarray(model.encode(documents))
if embeddings.shape[1] != 768:
    raise ValueError(
        f"Model produced {embeddings.shape[1]}-dim vectors but the index mapping expects 768"
    )

# Start from an empty index so the random placeholder vectors indexed earlier
# (or by a previous run of this cell) cannot appear among the results.
es.indices.delete(index="sbert_embeddings", ignore=[404])

# Create Elasticsearch index
create_index(index_name="sbert_embeddings")

# Index documents with their embeddings
index_documents(documents, embeddings, index_name="sbert_embeddings")

# Make the new documents visible to the search issued right below
es.indices.refresh(index="sbert_embeddings")

# Query for similar documents
query_text = "search engine technology"
query_embedding = model.encode(query_text)
search_results = search_by_embedding(query_embedding, index_name="sbert_embeddings", top_n=3)

print("Search Results:")
for result in search_results:
    print(f"Document: {result['_source']['text']}, Score: {result['_score']}")
