In [1]:
import json
# Load the raw FAQ export. Judging by how it is consumed further down, this
# is a list of course entries, each carrying a "course" name and a
# "documents" list.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open("../documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

In [7]:
# Flatten the per-course structure into one list of FAQ documents, tagging
# each document with the name of the course it belongs to.
# Every document is shallow-copied instead of being modified in place, so
# `docs_raw` stays exactly as it was loaded from disk.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

# Peek at one flattened record to confirm the "course" key was attached.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
from sentence_transformers import SentenceTransformer

In [None]:
 model = SentenceTransformer("all-mpnet-base-v2")

In [11]:
# Sanity check: embed one sample sentence and look at the vector length.
# The recorded value (768) is the number used for "dims" in the index mapping.
sample_embedding = model.encode("How are you doing?")
len(sample_embedding)

768

In [13]:
# Embed every document's text with one batched call instead of invoking the
# model once per document; the encoder accepts a list of sentences and
# returns one vector per input, in input order.
text_vectors = model.encode([doc["text"] for doc in documents])

operations = []

for doc, text_vector in zip(documents, text_vectors):
    # Store the embedding as a plain list of floats alongside the other
    # fields. The dict is annotated in place, so `operations` holds the very
    # same objects as `documents`.
    doc["text_vector"] = text_vector.tolist()
    operations.append(doc)

# Inspect one enriched record (note: this displays the full vector).
operations[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.041030403226614,
  0.025834161788225174,
  -0.036801841109991074,
  -0.020898321643471718,
  -0.020596304908394814,
  0.009353742003440857,
  -0.003331671468913555,
  -0.009491903707385063,
  0.030117977410554886,
  0.01908210851252079,
  0.012690035626292229,
  -0.017078785225749016,
  -0.0016324761090800166,
  0.12997251749038696,
  0.030969230458140373,
  -0.025823738425970078,
  0.0278230682015419,
  0.025159770622849464,
  -0.0808122381567955,
  -0.0036173474509269,
  -0.008902025409042835,
  0.003404824063181877,
  -0.0230092890560627,
  -0.03404529020190239,
  0.024598615244030952,
  0.013545555993914604,
  -0.025439025834202766,
  0.011951087042689323,
  -0.020540112629532814,
  -0.010077380575239658,
  0.020575348287820816,
  0.0

In [14]:
from elasticsearch import Elasticsearch
# Address of the Elasticsearch node this notebook talks to.
es_url = "http://localhost:9200"
es_client = Elasticsearch(es_url)

In [15]:
# Connectivity check: ask the node for its basic cluster/version details
# before creating the index.
cluster_info = es_client.info()
cluster_info

{'name': '0650c37af5e0',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'vevVz-Y1RTG3PH_zLSdw2Q',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [16]:
# Index definition for the FAQ documents.
# - settings: one primary shard, no replicas.
# - mappings: three analysed text fields, "course" as an exact-match keyword,
#   and "text_vector" as an indexed 768-dimensional dense vector compared with
#   cosine similarity (768 is the embedding length reported by the model
#   check earlier in the notebook).
shard_layout = {"number_of_shards": 1, "number_of_replicas": 0}

field_mappings = {name: {"type": "text"} for name in ("text", "section", "question")}
field_mappings["course"] = {"type": "keyword"}
field_mappings["text_vector"] = {
    "type": "dense_vector",
    "dims": 768,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": shard_layout,
    "mappings": {"properties": field_mappings},
}

In [17]:
index_name = "course_questions"

# Recreate the index from scratch so the cell can be re-run safely:
# drop any existing index of that name (a missing index is not an error),
# then create it with the settings and mappings defined above.
indices_api = es_client.indices
indices_api.delete(index=index_name, ignore_unavailable=True)
indices_api.create(index=index_name, body=index_settings)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'course_questions'}

In [19]:
# Send the enriched documents to Elasticsearch one at a time.
# Indexing stays best-effort (a failing document does not abort the loop),
# but each failure is recorded together with the document's position and a
# summary is printed at the end, so a partially populated index is easy to
# spot.
index_failures = []

for position, doc in enumerate(operations):
    try:
        es_client.index(index=index_name, body=doc)
    except Exception as e:
        index_failures.append((position, e))
        print(f"Failed to index document {position}: {e}")

print(f"Indexed {len(operations) - len(index_failures)} of {len(operations)} documents")

In [20]:
# Free-text user query, embedded with the same model as the documents so the
# query vector lives in the same space as the indexed "text_vector" values.
search_term = "windows or mac?"
vector_search_term = model.encode(sentences=search_term)

In [28]:
# Pure vector search: return the 5 nearest neighbours of the query embedding
# on the "text_vector" field, considering up to 10000 candidates.
knn_clause = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

# NOTE(review): the recorded output of the search below still shows the full
# "text_vector" inside each hit's `_source`, so listing "fields" alone does
# not trim the response -- consider `_source` filtering if that was the goal.
returned_fields = ["text", "section", "question", "course"]

query = {
    "knn": knn_clause,
    "fields": returned_fields,
}

In [29]:
# Run the kNN query and show only the list of matching hits.
results = es_client.search(index=index_name, body=query)
hits = results["hits"]["hits"]
hits

[{'_index': 'course_questions',
  '_id': 'I1svoZABvhdNY38yvj3H',
  '_score': 0.7147919,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.026965461671352386,
    -0.000626126304268837,
    -0.01662949100136757,
    0.05285150930285454,
    0.05476527288556099,
    -0.03133990615606308,
    0.029942581430077553,
    -0.04808562621474266,
    0.04467551037669182,
    0.005839474033564329,
    0.016233040019869804,
    0.012001154012978077,
    -0.031222281977534294,
    0.016600528731942177,
    -0.04886901378631592,
    -0.06496307998895645,
    0.046434223651885986,
    -0.009297756478190422,
    -0.0642528235912323,
    -0.01373267825692892,
    -0.015976183116436005,
    0.008629541844129562,
    -0.02447899058461

In [30]:
# Vector search combined with a keyword clause on the course name.
# NOTE(review): for the same top hit the recorded `_score` rises from
# 0.7147919 (kNN only) to 1.4937059 with this query, which suggests the
# "match" clause adds to the score rather than acting as a pure filter --
# confirm that is intended (a `filter` inside "knn" may be what is wanted
# if the goal is only to restrict results to one course).
course_clause = {
    "match": {
        "course": 'data-engineering-zoomcamp'
    }
}

knn_clause = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

query = {
    "query": course_clause,
    "knn": knn_clause,
    "fields": ["text", "section", "question", "course"],
}

In [32]:
# Run the combined query with `explain=True` (the recorded hits additionally
# carry `_shard` and `_node`) and show only the list of matching hits.
results = es_client.search(index=index_name, body=query, explain=True)
hits = results["hits"]["hits"]
hits

[{'_shard': '[course_questions][0]',
  '_node': 'SLBI8OkJRAGpF0d6QgRJbw',
  '_index': 'course_questions',
  '_id': 'I1svoZABvhdNY38yvj3H',
  '_score': 1.4937059,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.026965461671352386,
    -0.000626126304268837,
    -0.01662949100136757,
    0.05285150930285454,
    0.05476527288556099,
    -0.03133990615606308,
    0.029942581430077553,
    -0.04808562621474266,
    0.04467551037669182,
    0.005839474033564329,
    0.016233040019869804,
    0.012001154012978077,
    -0.031222281977534294,
    0.016600528731942177,
    -0.04886901378631592,
    -0.06496307998895645,
    0.046434223651885986,
    -0.009297756478190422,
    -0.0642528235912323,
    -0.01373267825692892,
