In [3]:
import json
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings('ignore')

## Run Elasticsearch in Docker

sudo docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3


In [8]:
# Load the pretrained sentence-embedding model used for both the documents and
# the search queries.
model = SentenceTransformer("all-mpnet-base-v2")

# Encode a throwaway sentence and show the embedding length; the `dims` value
# of the `text_vector` mapping has to agree with this number.
sample_embedding = model.encode("This is a simple sentence")
len(sample_embedding)

768

In [5]:
# Load the parsed book. The encoding is pinned to UTF-8 because the text
# contains non-ASCII characters (curly quotes, special hyphens); relying on the
# platform default would mis-decode or fail on systems where it is not UTF-8.
with open('data/parsed_book.json', 'rt', encoding='utf-8') as f_in:
    book_raw = json.load(f_in)

In [None]:
# Preview only the first entry; dumping the whole parsed book would flood the
# notebook output.
book_raw[:1]

In [7]:
# Flatten the parsed book into one record per section, ready for indexing.
# Every record is a shallow copy of its section dict, so `book_raw` is left
# untouched and this cell can be re-run without side effects on the raw data.
documents = []

for doc in book_raw:
    for section in doc['content']:
        record = dict(section)
        if 'text' in record:
            # Store the embedding as a plain list of floats so the record is
            # JSON-serializable.
            record["text_vector"] = model.encode(record["text"]).tolist()
        # Sections without text are still kept; they just carry no vector.
        record['chapter'] = doc['chapter']
        record['title'] = doc['title']
        documents.append(record)


In [10]:
# Peek at the first three flattened records (text, vector, chapter, title).
documents[:3]

[{'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies the process. The interconnecting pieces of interviews are complex, with many types of combina‐ tions d

## Create embeddings using Pretrained Models

## Setup ElasticSearch connection

In [9]:
from elasticsearch import Elasticsearch
# Connect to the Elasticsearch node listening on the local HTTP port 9200.
es_url = 'http://localhost:9200'
es_client = Elasticsearch(es_url)

# Fetch the cluster's name/version info; this also confirms the node is reachable.
es_client.info()

{'name': '810a7a347b3b',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'b66KeqxXTB2vOEKVbJx8WQ',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [10]:
# Index definition: a single shard with no replicas, plus the field mappings.
# `text_vector` is an indexed dense vector compared with cosine similarity; its
# `dims` must equal the embedding length produced by the sentence model (768).
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "chapter": {"type": "text"},
            # Keyword field; a null title is indexed as "Untitled".
            "title": {"type": "keyword", "null_value": "Untitled"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}


In [11]:
index_name = "ml-interview-questions"

# Recreate the index from scratch: drop any previous copy (a missing index is
# not an error thanks to `ignore_unavailable`), then create it with the
# settings and mappings defined in `index_settings`.
indices_api = es_client.indices
indices_api.delete(index=index_name, ignore_unavailable=True)
indices_api.create(index=index_name, body=index_settings)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ml-interview-questions'}

In [12]:
def _index_document(record):
    """Send one record to the index; report a failure instead of raising."""
    try:
        es_client.index(index=index_name, body=record)
    except Exception as err:
        print(f"Error when indexing the document: {err}")


# Index the records one request at a time; a failing record is reported and
# the loop carries on with the next one.
for doc in documents:
    _index_document(doc)


## Create end user query

In [13]:
# The end-user question and its embedding. The vector is converted to a plain
# list of floats, matching how the document vectors are stored, so the query
# body is JSON-serializable without relying on NumPy support in the client.
search_term = "what are the steps of ML interviews?"
vector_search_term = model.encode(search_term).tolist()

In [14]:
# Pure kNN search: return the 5 sections whose vectors are closest to the
# query embedding. `index_name` is reused instead of repeating the literal so
# the search always targets the index that was just created.
res = es_client.search(
    index=index_name,
    body={
        "size": 5,
        "knn": {
            "field": "text_vector",
            "query_vector": vector_search_term,
            "k": 5,
            "num_candidates": 1000
        },
        # Return only the readable fields; the stored vector is left out.
        "_source": ["text", "section", "title", "chapter"]
    }
)
res['hits']['hits']

[{'_index': 'ml-interview-questions',
  '_id': 'Ibez-ZEBwj93mx26wpoG',
  '_score': 0.8769342,
  '_source': {'chapter': 'CHAPTER 8',
   'text': 'Follow this checklist to create a plan for your interview process. Refer back to rele‐ vant content or past exercises in this book to help you complete the checklist: • Write down the parts of the ML lifecycle that you’re interested in doing at work. See Figure 1-5 in Chapter 1 for a reminder of the ML lifecycle. • Write down the skills that are required for that role and run the self-assessment of them in Chapter 2 . • Determine what types of interviews could be relevant for that role. Review the overview of the interview process in Chapter 1 . • Make sure your resume is tidied up, with bullet points relevant to the role you picked. Refer to Chapter 2 for more resume tips. • Write down a time frame during which you’re aiming to prepare for interviews and start applying. For example: I aim to prepare for interviews for three months and then sta

## Semantic search

In [18]:
# kNN clause for the searches below: 5 nearest neighbours of the query
# embedding on `text_vector`, drawn from up to 10000 candidates.
knn_query = dict(
    field="text_vector",
    query_vector=vector_search_term,
    k=5,
    num_candidates=10000,
)

In [26]:
# Hybrid search: the kNN clause (the `knn_query` dict defined earlier, reused
# here instead of being pasted inline) is combined with a lexical `match` on
# the chapter field. Elasticsearch adds the kNN score and the query score for
# a hit matched by both, so `_score` can be greater than 1.
# NOTE(review): `chapter` is mapped as `text` and `match` ORs the analyzed
# tokens by default, so "CHAPTER 3" can also match other chapters — confirm
# whether an exact chapter filter was intended.
response = es_client.search(
    index=index_name,
    body={
        "size": 5,
        "knn": knn_query,
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {"chapter": "CHAPTER 3"}
                    }
                ]
            }
        }
    }
)

response['hits']['hits']


[{'_index': 'ml-interview-questions',
  '_id': '3Lez-ZEBwj93mx26upki',
  '_score': 2.817597,
  '_source': {'text': 'In Chapter 1 , you learned about the various steps you will go through as part of your ML interviews. In Chapter 2 , you looked at how to tie your experiences to roles of interest as well as how to craft a relevant resume. The goal of the previous chapters was to get you invited to interviews. In this chapter, I’ll focus on ML algorithms. As you recall, the interview process is illustrated in Figure 1-9 , and the ML algorithms interview is only one portion of the technical interviews; the rest, such as ML training and evaluation, coding, and so on, will be covered in subsequent chapters.',
   'text_vector': [0.0555436834692955,
    -0.057862937450408936,
    -0.06579995900392532,
    -0.018207866698503494,
    0.008833812549710274,
    0.02567276917397976,
    -0.01176581159234047,
    -0.03683503344655037,
    0.007987787015736103,
    -0.03394889086484909,
    0.0660235

#### Score == 2.81. When a `knn` search is combined with a `query`, Elasticsearch adds the two scores together, so the final `_score` can be greater than 1.

In [32]:
# Same hybrid search as before, with `explain` switched on so every hit
# carries a breakdown of how its `_score` was computed. The kNN clause reuses
# the `knn_query` dict defined earlier instead of repeating it inline.
# NOTE(review): `chapter` is mapped as `text` and `match` ORs the analyzed
# tokens by default, so "CHAPTER 3" can also match other chapters — confirm
# whether an exact chapter filter was intended.
response = es_client.search(
    index=index_name,
    body={
        "size": 5,
        "knn": knn_query,
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {"chapter": "CHAPTER 3"}
                    }
                ]
            }
        },
        "explain": True
    }
)

response['hits']['hits']


[{'_shard': '[ml-interview-questions][0]',
  '_node': 'EScPTzvSSEeYjH0xRE7sig',
  '_index': 'ml-interview-questions',
  '_id': '3Lez-ZEBwj93mx26upki',
  '_score': 2.817597,
  '_source': {'text': 'In Chapter 1 , you learned about the various steps you will go through as part of your ML interviews. In Chapter 2 , you looked at how to tie your experiences to roles of interest as well as how to craft a relevant resume. The goal of the previous chapters was to get you invited to interviews. In this chapter, I’ll focus on ML algorithms. As you recall, the interview process is illustrated in Figure 1-9 , and the ML algorithms interview is only one portion of the technical interviews; the rest, such as ML training and evaluation, coding, and so on, will be covered in subsequent chapters.',
   'text_vector': [0.0555436834692955,
    -0.057862937450408936,
    -0.06579995900392532,
    -0.018207866698503494,
    0.008833812549710274,
    0.02567276917397976,
    -0.01176581159234047,
    -0.0368