In [1]:
import json
from sentence_transformers import SentenceTransformer

import warnings
# Install a blanket 'ignore' filter: no Python warning of any category is shown
# after this line. NOTE(review): this also hides deprecation notices from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm, trange


## Run Elasticsearch in Docker

sudo docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3


In [2]:
# Load the pretrained sentence-embedding model by its checkpoint name.
model = SentenceTransformer("all-mpnet-base-v2")

# Embed a probe sentence; the cell displays the length of the resulting vector
# (768 in the recorded run).
probe_vector = model.encode("This is a simple sentence")
len(probe_vector)

768

In [4]:
# Read the parsed book from disk. The encoding is pinned to UTF-8 because the
# text contains non-ASCII characters (e.g. typographic apostrophes) and open()
# would otherwise fall back to the platform's locale encoding.
with open('../../data/parsed_book.json', 'rt', encoding='utf-8') as f_in:
    book_raw = json.load(f_in)

In [5]:
# Preview only the first chapter; displaying the whole parsed book floods the
# cell output without adding information.
book_raw[:1]

[{'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'content': [{'section': 'Overview of This Book',
    'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyon

In [6]:
# Flatten the nested chapter -> sections structure into one record per section.
# Every record is a fresh dict, so `book_raw` is left untouched and re-running
# this cell always starts from the same input.
documents = []

for chapter in book_raw:
    for section in chapter['content']:
        record = dict(section)
        # Only sections that carry text can be embedded; the others are kept
        # as-is, without a "text_vector" field.
        if 'text' in record:
            # .tolist() turns the encoder output into a plain Python list.
            record["text_vector"] = model.encode(record["text"]).tolist()
        # Copy the chapter-level metadata onto each section record.
        record['chapter'] = chapter['chapter']
        record['title'] = chapter['title']
        documents.append(record)


In [7]:
# Peek at the first three flattened section records.
documents[:3]

[{'section': 'Overview of This Book',
  'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies the process. The interconnecting pieces of interviews are comple

## Create embeddings using Pretrained Models

## Setup ElasticSearch connection

In [8]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node reachable on the local HTTP port.
es_client = Elasticsearch("http://localhost:9200")

# Ask the server for its metadata; the response is shown as the cell output.
es_client.info()

ObjectApiResponse({'name': '0f03fdd5ed59', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'zftd0js-QliyxlXDMZe_Ag', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
# Index definition: a single shard with no replicas, plus the field mappings
# for the section records.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Full-text fields.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "chapter": {"type": "text"},
            # Exact-value field; a null title is indexed as "Untitled".
            "title": {"type": "keyword", "null_value": "Untitled"},
            # Embedding field: 768-dimensional vectors, indexed, compared
            # with cosine similarity.
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}


In [10]:
index_name = "ml-interview-questions"

# Start from a clean slate: drop the index (ignore_unavailable is set, so the
# call is made whether or not the index already exists), then create it again
# from `index_settings`. The create response is the cell output.
es_client.indices.delete(
    index=index_name,
    ignore_unavailable=True,
)
es_client.indices.create(
    index=index_name,
    body=index_settings,
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ml-interview-questions'})

In [11]:
# Index the documents one at a time. A failure on one document is reported and
# counted, but does not stop the remaining documents from being indexed.
failed_count = 0
for doc in documents:
    try:
        es_client.index(index=index_name, body=doc)
    except Exception as e:
        failed_count += 1
        print(f"Error when indexing the document: {e}")

# Newly indexed documents only become searchable after a refresh. Force one so
# that the search cells further down see every document even when the notebook
# is executed top to bottom without pauses.
es_client.indices.refresh(index=index_name)

# Summarise failures so they are not lost among the per-document messages.
if failed_count:
    print(f"{failed_count} of {len(documents)} documents could not be indexed")


## Create end user query

In [12]:
# The natural-language question used as the end-user query.
search_term = "what are the steps of ML interviews?"
# Embed the question with the same `model` object that encoded the section texts.
vector_search_term = model.encode(search_term)

In [18]:
# kNN-only search over the "text_vector" field using the embedded query.
res = es_client.search(
    # Use the shared `index_name` variable instead of repeating the literal, so
    # this cell keeps pointing at the index that was actually created.
    index=index_name,
    body={
        "size": 5,
        "knn": {
            "field": "text_vector",
            "query_vector": vector_search_term,
            "k": 5,
            "num_candidates": 1000
        },
        # Return only these fields for each hit (the stored vector is left out).
        "_source": ["text", "section", "title", "chapter"]
    }
)
# Display the list of hits.
res['hits']['hits']

[{'_index': 'ml-interview-questions',
  '_id': '0G_GvpIBz3e0e2tICUjP',
  '_score': 0.89045584,
  '_source': {'chapter': 'CHAPTER 8',
   'section': 'Interview Preparation Checklist',
   'text': 'Now that you’ve gone through the entire ML interview process, it’s time to create a plan. In Chapters 1 and 2 , you learned about the many types of ML jobs and did a self-assessment of which one(s) might be more suitable for you. Based on that, you also learned about the skills you are expected to be stronger in. In the subsequent chapters, you learned about what types of questions are commonly asked in inter‐ views. Are there any types that you need to prepare more for? The goal of this book is for you to start bridging the gap, not just read about bridging the gap. To succeed in interviews and land the job, taking action will help you—not just thinking about taking action. Follow this checklist to create a plan for your interview process. Refer back to rele‐ vant content or past exercises in t

## Semantic search

In [14]:
# kNN clause for a search request: compare the embedded query against the
# "text_vector" field, asking for 5 neighbours out of 10000 candidates.
knn_query = dict(
    field="text_vector",
    query_vector=vector_search_term,
    k=5,
    num_candidates=10000,
)

In [19]:
# Search that sends the kNN clause together with a full-text match on "chapter".
# NOTE(review): per the Elasticsearch kNN documentation, a top-level "query"
# next to "knn" is combined with the kNN hits rather than filtering them —
# confirm this is intended (a kNN "filter" would restrict results to CHAPTER 3).
response = es_client.search(
    index=index_name,
    body={
        "size": 5,
        # Reuse the kNN clause defined in `knn_query` instead of duplicating it.
        "knn": knn_query,
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {"chapter": "CHAPTER 3"}
                    }
                ]
            }
        }
    }
)

# Display the list of hits.
response['hits']['hits']


[{'_index': 'ml-interview-questions',
  '_id': 'tm_GvpIBz3e0e2tICUhP',
  '_score': 2.7221491,
  '_source': {'section': 'Overview of the Machine Learning Algorithms Technical Interview',
   'text': 'In Chapter 1 , you learned about the various steps you will go through as part of your ML interviews. In Chapter 2 , you looked at how to tie your experiences to roles of interest as well as how to craft a relevant resume. The goal of the previous chapters was to get you invited to interviews. In this chapter, I’ll focus on ML algorithms. As you recall, the interview process is illustrated in Figure 1-9 , and the ML algorithms interview is only one portion of the technical interviews; the rest, such as ML training and evaluation, coding, and so on, will be covered in subsequent chapters. You’re likely to be asked ML algorithm technical questions in an interview if you’re applying for any of the following jobs: • Data scientist who builds ML models • Machine learning engineer • Applied scient

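#### Score ≈ 2.72. When a kNN search is combined with a `query` clause, the Elasticsearch score can be greater than 1, because the full-text match score is added to the vector-similarity score. The score has no fixed maximum; it is only meaningful for comparing results within the same search.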

In [20]:
# Same combined kNN + "chapter" match request as before, with "explain" turned
# on so each hit is returned with additional per-hit detail (e.g. shard, node).
response = es_client.search(
    index=index_name,
    body={
        "size": 5,
        # Reuse the kNN clause defined in `knn_query` instead of duplicating it.
        "knn": knn_query,
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {"chapter": "CHAPTER 3"}
                    }
                ]
            }
        },
        "explain": True
    }
)

# Display the list of hits.
response['hits']['hits']


[{'_shard': '[ml-interview-questions][0]',
  '_node': 'ee94bDjxSb-3HOok47wjkA',
  '_index': 'ml-interview-questions',
  '_id': 'tm_GvpIBz3e0e2tICUhP',
  '_score': 2.7221491,
  '_source': {'section': 'Overview of the Machine Learning Algorithms Technical Interview',
   'text': 'In Chapter 1 , you learned about the various steps you will go through as part of your ML interviews. In Chapter 2 , you looked at how to tie your experiences to roles of interest as well as how to craft a relevant resume. The goal of the previous chapters was to get you invited to interviews. In this chapter, I’ll focus on ML algorithms. As you recall, the interview process is illustrated in Figure 1-9 , and the ML algorithms interview is only one portion of the technical interviews; the rest, such as ML training and evaluation, coding, and so on, will be covered in subsequent chapters. You’re likely to be asked ML algorithm technical questions in an interview if you’re applying for any of the following jobs: • 