# Initialization

In [22]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment so that
# later cells can read them with os.getenv (e.g. OPENAI_API_KEY).
load_dotenv()

True

In [2]:
import os
import logging

# Configure root logging at INFO first, then create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Question sent to every retrieval strategy in this notebook.
query = "What is Land encroachment?"

# Connection settings are read from the environment (populated by load_dotenv()
# in the first cell) so that no host or credential is stored in the notebook.
# Expected variables: OPENSEARCH_URL, OPENSEARCH_USER, OPENSEARCH_PASSWORD.
# NOTE(review): the password that used to be hard-coded here is exposed in the
# notebook history and should be rotated.
opensearch_url = os.getenv("OPENSEARCH_URL", "http://localhost:9200")
auth = (
    os.getenv("OPENSEARCH_USER", "admin"),
    # An empty password is sent when the variable is unset.
    os.getenv("OPENSEARCH_PASSWORD", ""),
)

# Index searched by every strategy, and the number of hits requested from each.
index_name = "lao"
k = 5

In [4]:
from langchain_openai import OpenAIEmbeddings
# OpenAI embedding model used for the client-side query vectors; the API key
# is taken from the environment rather than written into the notebook.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

# k-NN Search

In [5]:
from langchain_community.vectorstores import OpenSearchVectorSearch
# LangChain vector-store wrapper around the OpenSearch index. Its underlying
# low-level client (client.client) is reused for the raw DSL searches.
client = OpenSearchVectorSearch(
    opensearch_url=opensearch_url,
    index_name=index_name,
    embedding_function=embeddings,
    http_auth=auth,
    timeout=300,
    # TLS and all certificate checks are switched off for this connection.
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

In [6]:
# Vector search through LangChain, using the "painless_scripting" search type
# with the cosineSimilarity space; the query is embedded by `embeddings`.
knndocs = client.similarity_search(
    query,
    k=k,
    space_type="cosineSimilarity",
    search_type="painless_scripting",
)
# Log only the first two documents to keep the output short.
logger.info(knndocs[:2])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:0.588s]
INFO:__main__:[Document(page_content='Land Law of Lao PDR No.70/NA dated 21 June 2019 \nUnofficial translation supported by LIWG and GIZ Land Program\n \n \n 48 \n3. Having competent and technical staff with at least three years of \nexperiences with certified practical land-related work. \n \nArticle 158 (new) Land Business Operations \nIndividuals, legal entities or organizations wishing to operate land \nbusinesses shall submit the application together with supporting documents to the \nrelevant sectors as indicated in the Law on Investment Promotion and shall request \ntechnical authorization from the Natural Resources and Environment Sector. \n \nPart XI \nProhibitions \n \nArticle 159. (new) Prohibitions for Individuals, Legal Entities or Organizations \nIndividuals, legal entities or organizations, are prohibited 

# Neural Search

In [7]:
# Request body for a `neural` query on the "bert_embeddings" field: the query
# text is sent as-is together with the id of the model to use.
neural_search_query = {
    "size": k,
    "query": {
        "neural": {
            "bert_embeddings": {
                "query_text": query,
                "model_id": "vERhsZABary_bsUAiOG2",
                "k": k,
            },
        },
    },
}


In [8]:
from langchain.schema.document import Document

# Run the neural query with the low-level OpenSearch client.
neural_response = client.client.search(index=index_name, body=neural_search_query)

# Wrap every hit's stored text and metadata in a LangChain Document.
neuraldocs = [
    Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
    for hit in neural_response['hits']['hits']
]

logger.info(neuraldocs[:2])

INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:3.005s]
INFO:__main__:[Document(page_content='Land Law of Lao PDR No.70/NA dated 21 June 2019 \nUnofficial translation supported by LIWG and GIZ Land Program\n \n \n \n1 \n \nThe State grants the rights to armed forces, The Party and State \norganizations, Lao Front for National Development, Lao Federation of Veterans, \nmass organizations to manage and use State land. \n  \nAliens, stateless persons, foreign individuals, foreign nationals of Lao \nancestry have the rights to lease, receive concession of State land or purchase \nallocated State land use rights with determined timeframe and to lease the land of \nLao citizens. Their organizations that have been established with the authorization \nof the State, have the right only to lease or receive concession of State land and \nlease land of Lao citizens. \n \nThe State acknowledges the right to use surface land only, while all \nunderground and surface natura

# Neural Sparse Search

In [9]:
# Request body for a `neural_sparse` query on the "oss_sparse_embeddings"
# field; unlike the dense neural query it carries no per-query "k".
neural_sparse_search_query = {
    "size": k,
    "query": {
        "neural_sparse": {
            "oss_sparse_embeddings": {
                "query_text": query,
                "model_id": "vURhsZABary_bsUAieFJ",
            },
        },
    },
}


In [10]:
from langchain.schema.document import Document

# Run the neural sparse query with the low-level OpenSearch client.
neural_sparse_response = client.client.search(index=index_name, body=neural_sparse_search_query)

# Wrap every hit's stored text and metadata in a LangChain Document.
neuralsparsedocs = [
    Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
    for hit in neural_sparse_response['hits']['hits']
]

logger.info(neuralsparsedocs[:2])

INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:6.083s]
INFO:__main__:[Document(page_content='Land Law of Lao PDR No.70/NA dated 21 June 2019 \nUnofficial translation supported by LIWG and GIZ Land Program\n \n \n \n5 \n \nArticle 7. (amended) Protection of the Rights and Interests of the Holder of Land  \nUse Rights \nThe State protects the rights and lawful interests of the holder of land use \nrights including assets on the land surface in an effective, peaceful, regular and \nlong-term manner. \n \nArticle 8. (amended) Unauthorized Land Possession and Encroachment \nThe States does not allow individuals, legal entities and organizations to \ntake possession of and to encroach onto land. \n \nThe illegal possession and encroachment of lands are subject to legal \nsanctions and confiscation of the land. In case there are buildings or activities on \nthe land, the buildings shall be demolished and the activities shall be ceased \nwithout any compensation fro

# Keyword Search - BM25(tf-idf)

In [11]:
# Request body for a plain `match` query on the "text" field, with the query
# string analyzed by the "english" analyzer.
keyword_search_query = {
    "size": k,
    "query": {
        "match": {
            "text": {
                "query": query,
                "analyzer": "english",
            },
        },
    },
}


In [12]:
from langchain.schema.document import Document

# Run the keyword query with the low-level OpenSearch client.
keyword_response = client.client.search(index=index_name, body=keyword_search_query)

# Wrap every hit's stored text and metadata in a LangChain Document.
keyworddocs = [
    Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
    for hit in keyword_response['hits']['hits']
]

logger.info(keyworddocs[:2])

INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:0.134s]
INFO:__main__:[Document(page_content='Land Law of Lao PDR No.70/NA dated 21 June 2019 \nUnofficial translation supported by LIWG and GIZ Land Program\n \n \n \n5 \n \nArticle 7. (amended) Protection of the Rights and Interests of the Holder of Land  \nUse Rights \nThe State protects the rights and lawful interests of the holder of land use \nrights including assets on the land surface in an effective, peaceful, regular and \nlong-term manner. \n \nArticle 8. (amended) Unauthorized Land Possession and Encroachment \nThe States does not allow individuals, legal entities and organizations to \ntake possession of and to encroach onto land. \n \nThe illegal possession and encroachment of lands are subject to legal \nsanctions and confiscation of the land. In case there are buildings or activities on \nthe land, the buildings shall be demolished and the activities shall be ceased \nwithout any compensation fro

# Search with Search Pipeline

In [13]:
# Embed the query once on the client side; the resulting vector is passed to
# the script_score clause as its `query_value` parameter.
query_vector = embeddings.embed_query(query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [14]:
# The retrieval strategies are expressed here as bare query clauses (no
# "size"/"query" wrapper) so they can be combined inside one bool query.
# The neural, neural-sparse and keyword clauses use their own *_clause names:
# rebinding neural_search_query / neural_sparse_search_query /
# keyword_search_query to a bare clause would leave those names holding an
# invalid request body for the cells that pass them straight to search().

# Script-scored k-NN over documents of one file. The "1.0 +" offset keeps the
# score non-negative, since cosine similarity lies in [-1, 1].
knn_search_query = {
    "script_score": {
        "query": {
            "match": {
                "metadata.file_name": "lao.pdf"
            }
        },
        "script": {
            "source": "1.0 + cosineSimilarity(params.query_value, doc[params.field])",
            "params": {
                "field": "vector_field",
                "query_value": query_vector
            }
        }
    }
}
neural_clause = {
    "neural": {
        "bert_embeddings": {
            "query_text": query,
            "model_id": "vERhsZABary_bsUAiOG2",
            "k": k
        }
    }
}
neural_sparse_clause = {
    "neural_sparse": {
        "oss_sparse_embeddings": {
            "query_text": query,
            "model_id": "vURhsZABary_bsUAieFJ"
        }
    }
}
keyword_clause = {
    "match": {
        "text": {
            "query": query,
            "analyzer": "english"
        }
    }
}

# Query context handed to the rerank step via the request's "ext" section.
reranker = {
    "rerank": {
        "query_context": {
            "query_text": query
        }
    }
}

# Every hit must match the file-name filter; the nested bool/should then
# combines the four strategies.
compound_query = {
    "size": k,
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "metadata.file_name": "lao.pdf"
                    }
                },
                {
                    "bool": {
                        "should": [
                            knn_search_query,
                            neural_clause,
                            neural_sparse_clause,
                            keyword_clause
                        ]
                    }
                }
            ]
        }
    },
    "ext": reranker
}

In [15]:
from langchain.schema.document import Document

# Send the compound query through the named search pipeline.
# NOTE(review): presumably 'default_search_pipeline' carries the rerank
# processor on the cluster; confirm against the pipeline definition.
response = client.client.search(
    index=index_name,
    body=compound_query,
    params={'search_pipeline': 'default_search_pipeline'},
)

# Wrap every hit's stored text and metadata in a LangChain Document.
reranked_docs = [
    Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
    for hit in response['hits']['hits']
]

logger.info(reranked_docs[:2])

INFO:opensearch:POST http://localhost:9200/lao/_search?search_pipeline=default_search_pipeline [status:200 request:4.458s]
INFO:__main__:[Document(page_content='Land Law of Lao PDR No.70/NA dated 21 June 2019 \nUnofficial translation supported by LIWG and GIZ Land Program\n \n \n \n5 \n \nArticle 7. (amended) Protection of the Rights and Interests of the Holder of Land  \nUse Rights \nThe State protects the rights and lawful interests of the holder of land use \nrights including assets on the land surface in an effective, peaceful, regular and \nlong-term manner. \n \nArticle 8. (amended) Unauthorized Land Possession and Encroachment \nThe States does not allow individuals, legal entities and organizations to \ntake possession of and to encroach onto land. \n \nThe illegal possession and encroachment of lands are subject to legal \nsanctions and confiscation of the land. In case there are buildings or activities on \nthe land, the buildings shall be demolished and the activities shall 

In [16]:
import requests
import json

def unique_by_page_content(docs):
    """Drop documents whose ``page_content`` was already seen.

    The first document for each distinct ``page_content`` is kept and the
    relative order of the kept documents is unchanged.
    """
    seen_contents = set()
    deduped = []
    for doc in docs:
        content = doc.page_content
        if content in seen_contents:
            continue
        seen_contents.add(content)
        deduped.append(doc)
    return deduped

def rerank(query, docs, model_id="wURmsZABary_bsUAq-Hf", timeout=300):
    """Score documents against a query with the model's ``_predict`` endpoint.

    Posts the query text and every document's ``page_content`` to
    ``{opensearch_url}/_plugins/_ml/models/{model_id}/_predict``.

    Args:
        query: Query text sent as ``query_text``.
        docs: Iterable of objects exposing a ``page_content`` attribute.
        model_id: Id of the model to call; defaults to the id this notebook
            originally hard-coded.
        timeout: Seconds to wait for the HTTP response, so a stalled request
            cannot hang the notebook indefinitely.

    Returns:
        A list with one value per entry of the response's
        ``inference_results``, in the order the endpoint returned them.
        An empty list when ``docs`` is empty (no request is made).

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        Exception: Any other request or parsing error, re-raised after logging.
    """
    texts = [d.page_content for d in docs]
    if not texts:
        # Nothing to score; skip the network round trip.
        return []

    url = f"{opensearch_url}/_plugins/_ml/models/{model_id}/_predict"
    body = {
        'query_text': query,
        'text_docs': texts
    }
    try:
        # `json=` serializes the body and sets the JSON Content-Type header.
        # The same credentials as the OpenSearch client are sent along.
        response = requests.post(url, json=body, auth=auth, timeout=timeout)
        # Surface HTTP errors directly instead of as a KeyError on the payload.
        response.raise_for_status()
        inference_result = response.json()['inference_results']
        return [r['output'][0]['data'][0] for r in inference_result]
    except Exception as ex:
        # logger.exception records the traceback along with the message.
        logger.exception("exception: %s", ex)
        raise

In [17]:
import operator
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain.schema.document import Document

def _hits_to_documents(search_response):
    """Convert the hits of an OpenSearch search response into Documents.

    Each hit's ``_source`` is read for its ``text`` and ``metadata`` fields.
    """
    return [
        Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
        for hit in search_response['hits']['hits']
    ]


# Build the vector-store client used by all four retrieval strategies below.
client = OpenSearchVectorSearch(
    embedding_function=embeddings,
    index_name=index_name,
    opensearch_url=opensearch_url,
    http_auth=auth,
    use_ssl = False,
    verify_certs = False,
    timeout=300,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

# 1) Vector search through LangChain (painless script, cosine similarity).
knn_docs = client.similarity_search(
    query,
    search_type="painless_scripting",
    space_type="cosineSimilarity",
    k=k
)

# 2) Neural (dense) search.
neural_search_query = {
    'size': k,
    'query': {
        "neural": {
            "bert_embeddings": {
                "query_text": query,
                "model_id": "vERhsZABary_bsUAiOG2",
                "k": k
            }
        }
    }
}
neural_response = client.client.search(
    body = neural_search_query,
    index = index_name
)
neural_docs = _hits_to_documents(neural_response)

# 3) Neural sparse search.
neural_sparse_search_query = {
    'size': k,
    'query': {
        "neural_sparse": {
            "oss_sparse_embeddings": {
                "query_text": query,
                "model_id": "vURhsZABary_bsUAieFJ"
            }
        }
    }
}
neural_sparse_response = client.client.search(
    body = neural_sparse_search_query,
    index = index_name
)
neurals_parse_docs = _hits_to_documents(neural_sparse_response)

# 4) Keyword search.
keyword_search_query = {
    'size': k,
    'query': {
        "match": {
            "text": {
                "query": query,
                "analyzer": "english"
            }
        }
    }
}
keyword_response = client.client.search(
    body = keyword_search_query,
    index = index_name
)
keyword_docs = _hits_to_documents(keyword_response)

# Pool the candidates of all strategies and drop repeated passages.
all_docs = [*knn_docs, *neural_docs, *neurals_parse_docs, *keyword_docs]
logger.info(f"total docs count - {len(all_docs)}")
unique_docs = unique_by_page_content(all_docs)
logger.info(f"unique docs count - {len(unique_docs)}")

# Score every unique candidate against the query.
reranker_result = rerank(query, unique_docs)
logger.info(f"reranker result - {reranker_result}")

# zip() would silently drop candidates if the counts differed, so fail loudly.
if len(reranker_result) != len(unique_docs):
    raise ValueError(
        f"reranker returned {len(reranker_result)} scores for {len(unique_docs)} documents"
    )

# Highest score first; keep the top k documents.
sorted_result = sorted(zip(unique_docs, reranker_result), key=operator.itemgetter(1), reverse=True)
result = [t[0] for t in sorted_result][:k]
logger.info(result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:0.183s]
INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:0.117s]
INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:0.313s]
INFO:opensearch:POST http://localhost:9200/lao/_search [status:200 request:0.081s]
INFO:__main__:total docs count - 20
INFO:__main__:unique docs count - 14
INFO:__main__:reranker result - [-6.2798567, -0.52399004, 0.102317385, -7.8649077, -9.2079115, -8.035671, -8.764602, -8.35333, -8.473446, -7.233333, -8.706775, -9.509703, -9.403649, -8.588062]
INFO:__main__:[Document(page_content='Land Law of Lao PDR No.70/NA dated 21 June 2019 \nUnofficial translation supported by LIWG and GIZ Land Program\n \n \n \n5 \n \nArticle 7. (amended) Protection of the Rights and Interests of the Holder of Land  \nUse Rights \nThe State protects the rights and lawful interests o