In [69]:
import json
from tqdm import tqdm

In [70]:
# Read the QA intents explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open("data/all_qa.json", encoding="utf-8") as file:
    intents = json.load(file)

In [None]:
from collections import defaultdict


# Bucket every intent under its "id" so entries sharing an id land in one list.
hashes = defaultdict(list)
for doc in intents:
    hashes[doc['id']].append(doc)

# Fewer buckets than intents means at least one id occurs more than once.
print(len(hashes), len(intents))

In [None]:
from dotenv import load_dotenv


# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies credentials for the embedding and
# Elasticsearch clients used below — confirm which variables are required.
load_dotenv()

In [None]:
import torch


# Report whether PyTorch can see a CUDA device in this kernel.
print(torch.cuda.is_available())

# Add intents to ElasticSearch Index

In [None]:
from src.client_modules.embeddings.transformer import TransformerEmbeddingModel


# Build the project's transformer embedding model with its default arguments.
# Note: `emb_model` is rebound to an Azure OpenAI model in a later cell.
emb_model = TransformerEmbeddingModel()

In [None]:
# Template for the text embedded for each intent: all question variants
# followed by all answer variants, each group wrapped in triple quotes.
embedding_text_template = ("""Equivalent questions: \"""{patterns}\"""
Equivalent answers:\"""{responses}\"""
""")


for intent in tqdm(intents):
    embedding_text = embedding_text_template.format(
        patterns='\n\n'.join(intent["patterns"]),
        # Use the same blank-line separator as the patterns; a separator with
        # a trailing double quote would inject stray quotes into the text.
        responses='\n\n'.join(intent["responses"])
    )
    # Store the text and its vector on the intent itself so they stay together.
    intent['text'] = embedding_text
    intent['vector_field'] = emb_model.get_embeddings([embedding_text])[0].tolist()

In [80]:
import numpy as np
import tiktoken


def get_text_embedding(emb_model, embedding_text):
    """Embed a single string and return its vector.

    The text is passed to ``emb_model.get_embeddings`` as a one-element batch
    and the first result is taken. A NumPy array result is converted to a
    plain list; any other result type is returned unchanged.
    """
    vectors = emb_model.get_embeddings([embedding_text])
    first = vectors[0]
    return first.tolist() if isinstance(first, np.ndarray) else first

def get_combined_qa_embedding(emb_model, intent):
    """Build the combined question/answer texts and embedding for one intent.

    Args:
        emb_model: Embedding model, passed through to ``get_text_embedding``.
        intent: Mapping with non-empty ``"patterns"`` and ``"responses"``
            sequences of strings.

    Returns:
        Tuple ``(full_text, embedding_text, embedding)``. ``full_text`` is a
        bulleted listing of every pattern and every response,
        ``embedding_text`` pairs only the first pattern with the first
        response, and ``embedding`` is the vector for ``embedding_text``.

    Raises:
        IndexError: If ``patterns`` or ``responses`` is an empty list.
    """
    patterns = intent["patterns"]
    responses = intent["responses"]

    # Only the first question/answer pair is embedded.
    embedding_text_template = ("""Question: {patterns}\nAnswer:{responses}""")
    embedding_text = embedding_text_template.format(
        patterns=patterns[0],
        responses=responses[0]
    )

    embedding = get_text_embedding(emb_model, embedding_text)

    # Join outside the f-string: backslashes inside f-string expressions are a
    # syntax error before Python 3.12, and a plain (non-f) string would keep
    # the placeholder text literally instead of the joined values.
    question_lines = '\n- '.join(patterns)
    answer_lines = '\n- '.join(responses)
    full_text = f"Questions:\n- {question_lines}\nAnswers:\n- {answer_lines}"

    return full_text, embedding_text, embedding

def get_questions_embedding(emb_model, intent):
    """Return the question texts and embedding for one intent.

    The result is ``(full_text, embedding_text, embedding)``: all patterns
    joined with ``'\\n- '``, the first pattern on its own, and the vector
    obtained for that first pattern via ``get_text_embedding``.
    """
    patterns = intent["patterns"]
    full_text = '\n- '.join(patterns)
    first_question = patterns[0]
    return full_text, first_question, get_text_embedding(emb_model, first_question)

def get_answers_embedding(emb_model, intent):
    """Return the answer texts and embedding for one intent.

    The result is ``(full_text, embedding_text, embedding)``: all responses
    joined with ``'\\n- '``, the first response on its own, and the vector
    obtained for that first response via ``get_text_embedding``.
    """
    responses = intent["responses"]
    full_text = '\n- '.join(responses)
    first_answer = responses[0]
    return full_text, first_answer, get_text_embedding(emb_model, first_answer)

In [None]:
from src.client_modules.embeddings.azure_openai import AzureOpenAIEmbeddingModel


# Switch to the Azure OpenAI embedding model for the documents built below.
emb_model = AzureOpenAIEmbeddingModel()

# One Elasticsearch document per processed intent.
es_docs = []


# Only the first ten intents are processed here.
for intent in tqdm(intents[:10]):
    # Each helper returns (full text, text that was embedded, embedding).
    q_full, q_text, q_vector = get_questions_embedding(emb_model, intent)
    a_full, a_text, a_vector = get_answers_embedding(emb_model, intent)
    qa_full, qa_text, qa_vector = get_combined_qa_embedding(emb_model, intent)

    es_doc = {
        "questions": q_full,
        "questions_vector_text": q_text,
        "questions_vector": q_vector,
        "answers": a_full,
        "answers_vector_text": a_text,
        "answers_vector": a_vector,
        "combined_qa": qa_full,
        "combined_qa_vector_text": qa_text,
        "combined_qa_vector": qa_vector,
        "id": intent["id"],
        "document": intent["document"],
    }
    es_docs.append(es_doc)

In [135]:
from src.client_modules.elastic_search.elastic_search_client import ElasticSearchClient


# Project Elasticsearch client pointed at port 9200; host and any other
# connection settings come from the client's own defaults (not shown here).
es_client = ElasticSearchClient(port=9200)

# Name of the index that the following cells create, fill and query.
index_name = "cs-theory"

## Mappings and Index

Elasticsearch has no dedicated array type: any field can hold zero or more values of its mapped type, so the mapping does not need a different type for fields that will hold arrays.
https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html#types-array-handling

In [None]:
# Vector width shared by the three dense_vector fields. Hard-coded; the
# data-derived alternative is len(es_docs[0]["combined_qa_vector"]).
dimension = 1536

# Field names in the order they appear in the mapping.
text_fields = (
    "id",
    "document",
    "questions",
    "answers",
    "combined_qa",
    "questions_vector_text",
    "answers_vector_text",
    "combined_qa_vector_text",
)
vector_fields = ("combined_qa_vector", "questions_vector", "answers_vector")

# Text fields first, then the vector fields, each with its own mapping dict.
properties = {field: {"type": "text"} for field in text_fields}
properties.update(
    (field, {"type": "dense_vector", "dims": dimension}) for field in vector_fields
)

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": properties},
}

es_client.create_index(index_name=index_name, index_settings=index_settings)

## Add documents into index

In [None]:
# Index only the first prepared document (slice [0:1]).
# NOTE(review): es_docs is built from up to ten intents — confirm whether
# indexing a single document is a deliberate smoke test.
es_client.index_documents(index_name=index_name, documents=es_docs[0:1])

# ElasticSearch Query

In [17]:
from src.client_modules.embeddings.transformer import TransformerEmbeddingModel 

### Elastic search connection

In [214]:
from src.client_modules.elastic_search.elastic_search_client import ElasticSearchClient


# Same construction as in the indexing section above; rebinds `es_client`.
es_client = ElasticSearchClient(port=9200)

## Embeddings

In [216]:
# Imported here as well so this query section does not depend on the indexing
# cells having been run first.
from src.client_modules.embeddings.azure_openai import AzureOpenAIEmbeddingModel

# Natural-language query whose embedding drives the vector searches below.
question = "Tell me what is a syntax error"

# Only the Azure OpenAI model is instantiated: the indexed vectors were built
# with it (see the document-building cell), so the query must be embedded with
# the same model. Building a second model that is immediately overwritten
# would only waste load time.
emb_model = AzureOpenAIEmbeddingModel()
vector_search_term = emb_model.get_embeddings([question])[0]

# The vector is sent to Elasticsearch as a script parameter, so normalise
# array-like results (e.g. NumPy arrays) to a plain list.
if hasattr(vector_search_term, "tolist"):
    vector_search_term = vector_search_term.tolist()

## Query

In [227]:
def _script_score_query(source, query_vector):
    """Return a ``script_score`` query that scores every document.

    Args:
        source: Painless expression that computes the score; it can read the
            query vector as ``params.query_vector``.
        query_vector: Embedding of the search text, passed as a script param.

    Returns:
        A search body dict with a ``match_all`` inner query and the script.
    """
    return {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": source,
                    "params": {"query_vector": query_vector},
                },
            }
        }
    }


# Every script adds 1.0 because script_score must not return a negative score.
# NOTE(review): the offset is only sufficient if the stored and query vectors
# are unit-length (dot product in [-1, 1]) — confirm for the embedding model.

# Variant 1: similarity to the combined question/answer vector only.
search_query1 = _script_score_query(
    "dotProduct(params.query_vector, 'combined_qa_vector') + 1.0",
    vector_search_term,
)

# Variant 2: weighted blend favouring the combined vector.
search_query2 = _script_score_query(
    """
      0.5 * dotProduct(params.query_vector, 'combined_qa_vector') +
      0.25 * dotProduct(params.query_vector, 'answers_vector') +
      0.25 * dotProduct(params.query_vector, 'questions_vector') + 1.0
    """,
    vector_search_term,
)

# Variant 3: plain average of the three similarities.
search_query3 = _script_score_query(
    """
      (dotProduct(params.query_vector, 'combined_qa_vector') +
       dotProduct(params.query_vector, 'answers_vector') +
       dotProduct(params.query_vector, 'questions_vector')) / 3 + 1.0
    """,
    vector_search_term,
)

# Variant 4: best of the three similarities. Math.max only accepts two
# arguments in Painless, so the calls are nested.
search_query4 = _script_score_query(
    """
      Math.max(
        dotProduct(params.query_vector, 'combined_qa_vector'),
        Math.max(
          dotProduct(params.query_vector, 'answers_vector'),
          dotProduct(params.query_vector, 'questions_vector')
        )
      ) + 1.0
    """,
    vector_search_term,
)


In [None]:
# Run the first query variant against the index; pass search_query2,
# search_query3 or search_query4 instead to compare the other scoring scripts.
res = es_client.search(index_name=index_name, search_query=search_query1)
# Show the raw hit list as the cell output.
res["hits"]["hits"]