In [2]:
import json
from tqdm import tqdm

In [42]:
# Load the contents of data/all_qa.json into `intents`.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale encoding and could fail or
# garble non-ASCII characters on some systems.
with open("data/all_qa.json", encoding="utf-8") as file:
    intents = json.load(file)

In [31]:
from collections import defaultdict


# Bucket every record under its 'id' value; records that share an id end up
# in the same list.
hashes = defaultdict(list)
for doc in intents:
    hashes[doc['id']].append(doc)

# Number of distinct ids next to the total number of records.
print(len(hashes), len(intents))

322 546


In [4]:
from dotenv import load_dotenv


# Read environment variables from a .env file (python-dotenv).
load_dotenv()

True

In [5]:
import torch


# Report whether PyTorch can use a CUDA device in this environment.
print(torch.cuda.is_available())

False


# Add intents to the Elasticsearch index

In [6]:
from src.client_modules.embeddings.transformer import TransformerEmbeddingModel


# Instantiate the project's embedding model wrapper with its default arguments.
emb_model = TransformerEmbeddingModel()

  from tqdm.autonotebook import tqdm, trange


In [28]:
# Template for the text that is embedded for each intent: all of its question
# patterns followed by all of its answers, each group wrapped in triple quotes.
embedding_text_template = ("""Equivalent questions: \"""{patterns}\"""
Equivalent answers:\"""{responses}\"""
""")

# Both lists are joined with the same blank-line separator. The responses
# separator must not contain a quote character, otherwise a stray '"' is
# injected in front of every response after the first one.
item_separator = '\n\n'

# Attach the rendered text and its embedding to every intent in place, so the
# same dicts can be indexed directly afterwards.
for intent in tqdm(intents):
    embedding_text = embedding_text_template.format(
        patterns=item_separator.join(intent["patterns"]),
        responses=item_separator.join(intent["responses"])
    )
    intent['text'] = embedding_text
    # get_embeddings takes a list of texts; take the single result and convert
    # it to a plain list so it is JSON-serialisable.
    intent['vector_field'] = emb_model.get_embeddings([embedding_text])[0].tolist()

100%|██████████| 546/546 [01:32<00:00,  5.93it/s]


In [8]:
from src.client_modules.elastic_search.elastic_search_client import ElasticSearchClient


# Project wrapper around the Elasticsearch client, configured with port 9201.
es_client = ElasticSearchClient(port=9201)

# Name of the index used by the create/index/search calls in this notebook.
index_name = "cs-theory"

In [11]:
# Sanity check: number of loaded intent records.
len(intents)

546

In [14]:
# Inspect which fields the first intent record carries.
print(intents[0].keys())

dict_keys(['tag', 'patterns', 'responses', 'document', 'text', 'vector_field'])


## Mappings and Index

Elasticsearch has no dedicated array type: any field can hold zero or more values of its mapped type, so fields that contain lists (such as `patterns` and `responses`) are simply mapped as `text`.
https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html#types-array-handling

In [12]:
# All textual fields share the same plain full-text mapping.
text_fields = ("patterns", "responses", "document", "text")
field_mappings = {name: {"type": "text"} for name in text_fields}

# Embedding vector, indexed so it can be searched with cosine similarity.
# NOTE(review): "dims" presumably has to equal the embedding model's output
# length (768 here) - confirm when swapping models.
field_mappings["vector_field"] = {
    "type": "dense_vector",
    "dims": 768,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

es_client.create_index(index_name=index_name, index_settings=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'cs-theory'})

## Add documents into index

In [13]:
# Send every intent (including its text and vector) to the index. Failures are
# reported by printing the exception instead of aborting the notebook.
try:
    es_client.index_documents(documents=intents, index_name=index_name)
except Exception as err:
    print(err)

100%|██████████| 546/546 [00:29<00:00, 18.26it/s]


# Elasticsearch query

In [None]:
from src.client_modules.embeddings.transformer import TransformerEmbeddingModel 

### Elasticsearch connection

In [None]:
from src.client_modules.elastic_search.elastic_search_client import ElasticSearchClient


# Same client construction as in the indexing section (port 9201).
# NOTE(review): presumably repeated so the query section can be run on its
# own; `index_name` is still only defined in the indexing section.
es_client = ElasticSearchClient(port=9201)

## Embeddings

In [18]:
# Free-text question to search for.
question = "Tell me what is a syntax error"
# A fresh embedding model instance is created here (one is also created in the
# indexing section).
emb_model = TransformerEmbeddingModel()
# get_embeddings takes a list of texts; keep the embedding of the single question.
# NOTE(review): unlike the indexing loop, the result is not converted with
# .tolist() here - confirm the search client can serialise it as-is.
vector_search_term = emb_model.get_embeddings([question])[0]

## Query

In [19]:
# kNN clause for the question embedding. The field must be the dense_vector
# field declared in the index mapping ("vector_field"); the mapping defines no
# "text_vector" field, so a search against that name could not match.
query = {
    "field": "vector_field",
    "query_vector": vector_search_term,
    "k": 5,  # number of nearest neighbours to return
    "num_candidates": 10000,  # candidates considered before keeping the top k
}

In [23]:
# Approximate nearest-neighbour clause over the indexed embeddings.
# An optional "filter" entry (e.g. a term filter) could be added here to
# restrict the candidate documents; none is applied.
knn = dict(
    field="vector_field",
    query_vector=vector_search_term,
    k=5,
    num_candidates=10000,
)

# Full search body: the kNN clause plus the document fields to return per hit.
search_query = dict(
    knn=knn,
    _source=["text", "patterns", "responses", "document", "vector_field"],
)

In [24]:
# Run the kNN search against the index (`index_name` comes from the indexing
# section) and display the list of matching hits.
res = es_client.search(index_name=index_name, search_query=search_query)
res["hits"]["hits"]

[{'_index': 'cs-theory',
  '_id': 'EdLmOZIBOaYZkJ3YRIU8',
  '_score': 0.830681,
  '_source': {'patterns': ['What is a syntax error',
    'Explain syntax error',
    'Why syntax error occurs?'],
   'responses': ["A syntax error is an error in the structure of a programming language's code. It occurs when the code does not conform to the rules of the programming language's syntax. Syntax errors can occur for a variety of reasons, such as using the wrong punctuation, omitting required elements, or using an incorrect keyword. Syntax errors are usually detected by a compiler or interpreter when the code is being compiled or executed, and they can prevent the code from running correctly. To fix a syntax error, the code must be revised to conform to the correct syntax of the programming language."],
   'text': 'Equivalent questions: """What is a syntax error\nExplain syntax error\nWhy syntax error occurs?"""\nEquivalent answers:"""A syntax error is an error in the structure of a programming l