# Retrieval approach with Elasticsearch

### Imports

In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm')

import json
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

### Pretrained Model used for creation of embeddings

The model used to create the embeddings can be found in this website
https://www.sbert.net/docs/sentence_transformer/pretrained_models.html#semantic-search-models

In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [3]:
print(f'The output of the model has {len(model.encode("How many features or dimensions the model uses to represent the input text?"))} dimensional embeddings')

The output of the model has 768 dimensional embeddings


### Flattening the book

In [4]:
with open('../../data/parsed_book.json', 'rt') as f_in:
    book_raw = json.load(f_in)

In [5]:
book_raw[0]

{'chapter': 'CHAPTER 1',
 'title': 'Machine Learning Roles and the Interview Process',
 'content': [{'section': 'Overview of This Book',
   'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on

In [6]:
documents = []

for chapter in book_raw:
    chapter_name = chapter['chapter']
    title = chapter['title']

    for doc in chapter['content']:
        new_doc = {
            'chapter': chapter_name,
            'title': title,
            'section': doc['section'],
            'text': doc['text']
        }
        documents.append(new_doc) 

In [7]:
documents[0]

{'chapter': 'CHAPTER 1',
 'title': 'Machine Learning Roles and the Interview Process',
 'section': 'Overview of This Book',
 'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page 

# Setup Elasticsearch connection

### run on the console

sudo docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

In [8]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': 'c61a76b457df', 'cluster_name': 'docker-cluster', 'cluster_uuid': '7a89_kRqR-GfIP4u79eGKQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Create mappings and Index

Imagine that you need to create a schema. what do you need? I would say the column names, the table name, the type of data you are going to introduce...

The mapping will set this metadata.

In [9]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "analyzer": {
                "standard_analyzer": {
                "type": "standard"
                }
            }
        }
    },
    "mappings": {
    "properties": {
        "text": {
            "type": "text",
            "analyzer": "standard_analyzer"  
        },
        "section": {
            "type": "keyword",
        },
        "chapter": {
            "type": "keyword",
        },
        "title": {
            "type": "keyword",
        },
        "text_vector": {
            "type": "dense_vector",
            "dims": 768, # got them above
            "index": True,
            "similarity": "cosine"
        }
    }
}

}

In [10]:
index_name = "ds-interview-questions"

# it is better to delete the index every time when experimenting
es_client.indices.delete(index=index_name, ignore_unavailable=True) 
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ds-interview-questions'})

### Add documents to the index

In [11]:
for doc in documents:
    try:
        es_client.index(index=index_name, body=doc)
    except Exception as e:
        print(f"Error when indexing the document: {e}")

### Create user query

In [12]:
search_term = "which are the steps of the data science interview process?"
vector_search_term = model.encode(search_term)

### Create search function

In [13]:
def execute_search(query, index=index_name):
    """
    Execute a search query on the specified index.

    Parameters:
        query (dict): The search query to execute.
        index (str): The name of the index to search.

    Returns:
        None: Prints the search results.
    """
    try:
        response = es_client.search(index=index, body=query)
        return response
    except Exception as e:
        print(f"Error during search: {e}")

# Full-text search

In [15]:
full_text_query = {
    "size": 5,  
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "text": {
                            "query": search_term,
                            "boost": 1.0  
                        }
                    }
                },
                {
                    "multi_match": {
                        "query": search_term,
                        "fields": ["text^2", "section", "title"],  
                        "type": "best_fields"
                    }
                }
            ],
            "should": [
                {
                    "terms": {
                        "title": ["technical", "behavioral"],
                          
                    }
                }
            ]
        }
    }
}

full_text_results = execute_search(full_text_query)
full_text_results['hits']['hits'][0]

{'_index': 'ds-interview-questions',
 '_id': 'Alnyn5IBsRH_JjIM5yfP',
 '_score': 15.321497,
 '_source': {'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'section': 'The Three Pillars of Machine Learning Roles',
  'text': 'To set the stage for the rest of the book, I’ll go over what I call the three pillars of ML and data science roles: • Machine learning algorithms and data intuition • Programming and software engineering skills • Execution and communication skills These are the broad categories of skills that you will be evaluated on during ML job interviews. This book focuses a lot on helping you understand these skills and bridge any gaps between your current experiences and skills and those under these three pillars (see Figure 1-6 ). All these skills will be expanded on in the following chapters. Figure 1-6. Three pillars of machine learning jobs. You’re able to understand the underlying workings of ML algorithms and statistics theory and th

# Semantic search

### Create the dense vector using the pre-trained model

A dense vector typically represents a word, sentence, or document as a fixed-length array of numbers, also known as an embedding. Dense vectors are crucial for Elasticsearch, when we want to perform tasks where understanding the meaning behind the words is more important than just matching exact terms.

In [16]:
operations = []
for doc in documents:
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

In [17]:
operations[1]

{'chapter': 'CHAPTER 1',
 'title': 'Machine Learning Roles and the Interview Process',
 'section': 'A Brief History of Machine Learning and Data Science Job Titles',
 'text': 'First, let’s walk through a brief history of job titles. I decided to start with this section to dispel some myths about the “data scientist” job title and shed some light on why there are so many ML-related job titles. After understanding this history, you should be more aware of what job titles to aim for yourself. If you’ve ever been confused about the litany of titles such as machine learning engineer (MLE), product data sci‐ entist, MLOps engineer, and more, this section is for you. ML techniques aren’t a new thing; in 1985, David Ackley, Geoffrey E. Hinton, and Terrence J. Sejnowski popularized the Boltzmann Machine algorithm. 3 Even before that, regression techniques 4 had early developments in the 1800s. There have long been jobs and roles that use modeling techniques to forecast and predict. Econome‐ tri

### Add documents to the index

In [18]:
for doc in operations:
    try:
        es_client.index(index=index_name, body=doc)
    except Exception as e:
        print(f"Error when indexing the document: {e}")

In [20]:
semantic_query = {
    "size": 5,
    "knn": {
        "field": "text_vector",
        "query_vector": vector_search_term,
        "k": 3,  
        "num_candidates": 1000  
    },
    "_source": ["text", "section", "title"] #fields I want to retrieve
}

semantic_results = execute_search(semantic_query)
semantic_results['hits']['hits'][0]

{'_index': 'ds-interview-questions',
 '_id': 'SVnzn5IBsRH_JjIMNycO',
 '_score': 0.80450153,
 '_source': {'section': 'Python Coding Interview: Data- and ML-Related Questions',
  'text': 'Now, let’s dive into the first type of programming/coding interview questions: data Now, let’s dive into the first type of programming/coding interview questions: dataand ML-related questions. These questions focus on using Python, such as using the NumPy and pandas libraries or ML libraries like XGBoost, to code up solutions to interview questions. The main difference between this type of question and the brain‐ teaser/LeetCode questions covered in the next subsection is that this type of question will relate more to what you’d be doing in your day-to-day role in an ML job. Depending on the type of company you’re interviewing for, these questions may be themed around the company’s product. For example, a social media company could ask a series of questions on how you’d pull information about new user s

# Hybrid Search

Combination of both full-text and vector search

In [21]:
text_vector_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "text": {
                            "query": search_term,
                            "boost": 1.0
                        }
                    }
                },
                {
                    "multi_match": {
                        "query": search_term,
                        "fields": ["text^2", "section", "title"],
                        "type": "best_fields"
                    }
                }
            ],
            "should": [
                {
                    "terms": {
                        "title": ["technical", "behavioral"]
                    }
                }
            ]
        }
    },
    "knn": {
        "field": "text_vector",
        "query_vector": vector_search_term,
        "k": 3,
        "num_candidates": 1000
    }
}

text_vector_results = execute_search(text_vector_query)
text_vector_results['hits']['hits'][0]


{'_index': 'ds-interview-questions',
 '_id': 'Alnyn5IBsRH_JjIM5yfP',
 '_score': 15.1664,
 '_source': {'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'section': 'The Three Pillars of Machine Learning Roles',
  'text': 'To set the stage for the rest of the book, I’ll go over what I call the three pillars of ML and data science roles: • Machine learning algorithms and data intuition • Programming and software engineering skills • Execution and communication skills These are the broad categories of skills that you will be evaluated on during ML job interviews. This book focuses a lot on helping you understand these skills and bridge any gaps between your current experiences and skills and those under these three pillars (see Figure 1-6 ). All these skills will be expanded on in the following chapters. Figure 1-6. Three pillars of machine learning jobs. You’re able to understand the underlying workings of ML algorithms and statistics theory and thei

# Evaluation

In [22]:
gt_df = pd.read_csv('../../data/ground_truth_data.csv')
gt_df

Unnamed: 0,question,text_id,chapter,title,section
0,Can you describe the differences between a dat...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
1,How do you approach technical interviews for d...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
2,Can you walk me through your process for evalu...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
3,In what ways can a data scientist collaborate ...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
4,How do you stay current with the latest develo...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
...,...,...,...,...,...
235,What is the most effective way to network and ...,1026686599,CHAPTER 9,Post-Interview and Follow-up,What to Do Between Interviews
236,How do you balance the need for a strong resum...,1026686599,CHAPTER 9,Post-Interview and Follow-up,What to Do Between Interviews
237,"How do RSUs and stock options differ, and what...",dee0126444,CHAPTER 9,Post-Interview and Follow-up,Steps of the Offer Stage
238,Can you provide examples of non-base pay optio...,dee0126444,CHAPTER 9,Post-Interview and Follow-up,Steps of the Offer Stage


In [None]:
results = {}

for index, row in gt_df.iterrows():
    search_term = row['question']
    vector_search_term = model.encode(search_term).tolist()  

    # Update the queries with the current search term
    full_text_query['query']['bool']['must'][0]['match']['text']['query'] = search_term
    semantic_query['knn']['query_vector'] = vector_search_term
    text_vector_query['knn']['query_vector'] = vector_search_term

    # Execute each search type
    full_text_results = execute_search(full_text_query)  # Execute full-text search
    semantic_results = execute_search(semantic_query)    # Execute semantic search
    text_vector_results = execute_search(text_vector_query)  # Execute text-vector search

    # Collect results
    results[search_term] = {
        'full_text': full_text_results['hits']['hits'],
        'semantic': semantic_results['hits']['hits'],
        'text_vector': text_vector_results['hits']['hits'],
        'expected_answers': row['question']  # Adjust this if your column name is different
    }


results

In [None]:
results

In [28]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_search(results, gt_df):
    evaluations = {}
    
    for index, row in gt_df.iterrows():
        search_term = row['question']  # Access the 'question' field correctly
        expected_answers = set(row['expected_answers']) if 'expected_answers' in row else set()  # Make sure this column exists

        # Get the actual results for the current search term
        actual_full_text_set = set(doc['_source']['title'] for doc in results[search_term]['full_text'])
        actual_semantic_set = set(doc['_source']['title'] for doc in results[search_term]['semantic'])
        actual_text_vector_set = set(doc['_source']['title'] for doc in results[search_term]['text_vector'])

        # Calculate precision, recall, and F1 score for full text search
        precision_full_text = precision_score(list(expected_answers), list(actual_full_text_set), average='binary', zero_division=0)
        recall_full_text = recall_score(list(expected_answers), list(actual_full_text_set), average='binary', zero_division=0)
        f1_full_text = f1_score(list(expected_answers), list(actual_full_text_set), average='binary', zero_division=0)
        
        # Repeat for semantic search
        precision_semantic = precision_score(list(expected_answers), list(actual_semantic_set), average='binary', zero_division=0)
        recall_semantic = recall_score(list(expected_answers), list(actual_semantic_set), average='binary', zero_division=0)
        f1_semantic = f1_score(list(expected_answers), list(actual_semantic_set), average='binary', zero_division=0)

        # Repeat for text vector search
        precision_vector = precision_score(list(expected_answers), list(actual_text_vector_set), average='binary', zero_division=0)
        recall_vector = recall_score(list(expected_answers), list(actual_text_vector_set), average='binary', zero_division=0)
        f1_vector = f1_score(list(expected_answers), list(actual_text_vector_set), average='binary', zero_division=0)

        # Store evaluations
        evaluations[search_term] = {
            'full_text': {
                'precision': precision_full_text,
                'recall': recall_full_text,
                'f1': f1_full_text
            },
            'semantic': {
                'precision': precision_semantic,
                'recall': recall_semantic,
                'f1': f1_semantic
            },
            'vector': {
                'precision': precision_vector,
                'recall': recall_vector,
                'f1': f1_vector
            }
        }

    return evaluations

# Run the evaluation
evaluation_results = evaluate_search(results, gt_df)


ValueError: Found input variables with inconsistent numbers of samples: [0, 3]