In [1]:
import json

with open('documents.json','rt') as f_in:
    docs_raw = json.load(f_in)

In [2]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [3]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Create embeddings using pretrained models

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
model= SentenceTransformer('all-mpnet-base-v2')

In [7]:
len(model.encode('my name is slim shady'))

768

In [8]:
#creating embeddings for dataset

In [9]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [10]:
operations = []
for doc in documents:
    doc['text_vector'] = model.encode(doc['text']).tolist()
    operations.append(doc)

In [11]:
from elasticsearch import Elasticsearch

In [12]:
es_client = Elasticsearch('http://localhost:9200')

In [13]:
es_client.info()

ObjectApiResponse({'name': 'fdf7bc2d2ac8', 'cluster_name': 'docker-cluster', 'cluster_uuid': '63trK7glTfisMOouvLyMxA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [14]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}, 
            "text_vector" : {"type": "dense_vector", "dims": 768, "index":True, "similarity": "cosine"
        },
        }
    }
}

In [15]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [16]:
#Add document to index

In [17]:
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [18]:
#create end user query

In [19]:
search_term = "which OS do i need?"
vector_search_term = model.encode(search_term)

In [20]:
query = {
    "field" : "text_vector",
    "query_vector" : vector_search_term,
    "k" : 5,
    "num_candidates" : 10000,
}

In [21]:
res = es_client.search(index=index_name, knn=query, source=["text","section","question","course"])
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': '4fOYL5EBZymKRyeq6xsS',
  '_score': 0.7768269,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': 'DvOZL5EBZymKRyeqMB4_',
  '_score': 0.6482829,
  '_source': {'question': 'How to install WSL on Windows 10 and 11 ?',
   'course': 'machine-learning-zoomcamp',
   'section': '5. Deploying Machine Learning Models',
   'text': 'It is quite simple, and you can follow these instructions here:\nhttps://www.youtube.com/watch?v=qYlgUDKKK5A&ab_channel=NeuralNine\nMake sure that you have “Virtual Machine Platform” feature activated in your Windows “Features”. To do that, search “features” in the research bar and see if the checkbox is selected. You also need to make su

In [22]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [27]:
response = es_client.search(
    index=index_name,
    query={
        "match": {'course': 'data-engineering-zoomcamp'},
    },
    knn=knn_query,
    size=5,
    explain=True
)

In [29]:
response['hits']['hits']

[{'_shard': '[course-questions][0]',
  '_node': 'da3j9RLvSDq8gSN-ehVB0w',
  '_index': 'course-questions',
  '_id': '4fOYL5EBZymKRyeq6xsS',
  '_score': 1.5557408,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.026965461671352386,
    -0.000626126304268837,
    -0.01662949100136757,
    0.05285150930285454,
    0.05476527288556099,
    -0.03133990615606308,
    0.029942581430077553,
    -0.04808562621474266,
    0.04467551037669182,
    0.005839474033564329,
    0.016233040019869804,
    0.012001154012978077,
    -0.031222281977534294,
    0.016600528731942177,
    -0.04886901378631592,
    -0.06496307998895645,
    0.046434223651885986,
    -0.009297756478190422,
    -0.0642528235912323,
    -0.01373267825692892,
