In [1]:
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../../data/data.csv')
documents = df.to_dict(orient='records')

In [3]:
documents[0]

{'id': 0,
 'section': 'Overview of This Book',
 'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies the process. The interconnecting pieces of interviews ar

## Split into chunks/sentence

In [4]:
import spacy

nlp = spacy.load("es_core_news_sm") 

In [5]:
def split_into_sentence_chunks(text, base_id):
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents] 

    chunked_texts = []
    current_chunk = []

    for sentence in sentences:
        current_chunk.append(sentence)

        if len(current_chunk) >= 3:  
            chunked_texts.append(' '.join(current_chunk))
            current_chunk = []  

    if current_chunk:
        chunked_texts.append(' '.join(current_chunk))

    chunk_ids = [f"{base_id}_{i + 1}" for i in range(len(chunked_texts))]

    return [{'chunk_id': chunk_id, 'chunk_text': chunk_text, 'text_id': base_id} 
            for chunk_id, chunk_text in zip(chunk_ids, chunked_texts)]

chunked_docs = []
for doc in documents:
    if 'text' in doc and 'text_id' in doc:  
        chunks = split_into_sentence_chunks(doc['text'], doc['text_id'])
        chunked_docs.extend(chunks)  



In [6]:
chunked_docs[0:2]

[{'chunk_id': '86fd49a66d_1',
  'chunk_text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers.',
  'text_id': '86fd49a66d'},
 {'chunk_id': '86fd49a66d_2',
  'chunk_text': 'These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer.',
  'text_id': '86fd49a66d'}]

## minsearch

In [7]:
from minsearch import Index


In [8]:
index = Index(
    text_fields=[ "chunk_text"],
    keyword_fields=["text_id", "chunk_id"]
)


index.fit(chunked_docs)

<minsearch.minsearch.Index at 0x7b4432b9a3b0>

In [9]:
query = 'what is the scope of a data scientist?'

def search(query, boost= None):
    if boost is None:
        boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5
    )

    return results

# search_results = search(query)
# search_results

## rag

In [10]:
import ollama
from tqdm.auto import tqdm

# to initiate ollama on console for the first time
# ollama serve
# ollama pull llama2

In [11]:
client = ollama.Client()



## Prompt


In [12]:
prompt_template = """
You are an assistant preparing a candidate for a data science job interview. 
Based on the provided context, please provide a concise and accurate answer to the following question in plain text format without any additional formatting.

QUESTION: {question}

CONTEXT:
{context}

The answer has to be plain text
""".strip()


In [13]:
query

'what is the scope of a data scientist?'

In [14]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        text = doc['chunk_text']
            
        context += f"Text: {text}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

# prompt = build_prompt(query, search_results)
# print(prompt)

In [15]:
def generate_answer(query, search_results):
    # Generar el contenido del mensaje utilizando la función build_prompt
    message_content = build_prompt(query, search_results)
    
    # Hacer la llamada al modelo Llama2
    response = client.chat(model="llama2", messages=[{"role": "user", "content": message_content}])
    
    # Verificar la respuesta y extraer el contenido
    if 'message' in response and 'content' in response['message']:
        content = response['message']['content']
        
        return content.strip()  
    return ""  


In [16]:
# response = generate_answer(query, search_results)
# print(response)

In [17]:
def rag(query):
    search_results = search(query)
    response = generate_answer(query, search_results)
    return response

    

In [18]:
query = 'Which skills are important for a data scietist?'
print(rag(query))

Based on the provided context, the important skills for a data scientist are:

1. Machine learning algorithms and data intuition
2. Programming and software engineering skills
3. Execution and communication skills
4. Domain knowledge
5. Real and relevant experience that can be applied to the job, including transferable skills
6. Soft skills, such as working well with people in the team and communicating with broader groups of people
7. Technical skills, such as making individual technical contributions.


## Retrieval evaluation

In [19]:
import pandas as pd

df = pd.read_csv('../../data/ground_truth_data.csv')
df

Unnamed: 0,id,question,text_id,chapter,title,section
0,0,How do you evaluate the performance of a machi...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
1,1,Can you explain the difference between supervi...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
2,2,How do you handle missing data in a machine le...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
3,3,What is your approach to debugging a machine l...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
4,4,Can you describe a time when you had to commun...,86fd49a66d,CHAPTER 1,Machine Learning Roles and the Interview Process,Overview of This Book
...,...,...,...,...,...,...
235,235,How do benefits like health and dental impact ...,dee0126444,CHAPTER 9,Post-Interview and Follow-up,Steps of the Offer Stage
236,236,Can you provide examples of non-base pay optio...,dee0126444,CHAPTER 9,Post-Interview and Follow-up,Steps of the Offer Stage
237,237,How can data scientists ensure they are contri...,2ca59d8bf2,CHAPTER 9,Post-Interview and Follow-up,First 30/60/90 Days of Your New ML Job
238,238,Can you share an experience where reaching out...,2ca59d8bf2,CHAPTER 9,Post-Interview and Follow-up,First 30/60/90 Days of Your New ML Job


In [20]:
df_questions = df[['question', 'text_id']]

In [21]:
ground_truth = df_questions.to_dict(orient = 'records')
ground_truth[0]

{'question': 'How do you evaluate the performance of a machine learning model?',
 'text_id': '86fd49a66d'}

In [22]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if isinstance(line, (list, tuple)) and True in line:
            cnt += 1

    return cnt / len(relevance_total)


In [23]:
def mrr(relevance_total):
    total_score = 0.0
    num_queries = len(relevance_total)

    for line in relevance_total:
        query_score = 0.0
        for rank in range(len(line)):
            if line[rank] == True:
                query_score = 1 / (rank + 1)
                break  

        total_score += query_score

    return total_score / num_queries if num_queries > 0 else 0.0


In [24]:
def minsearch_search(query):
    boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5)

    return results



In [25]:
from tqdm.auto import tqdm


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['text_id']
        results = search_function(q)
        relevance = [d['text_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [26]:
from tqdm.auto import tqdm

In [27]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/240 [00:00<?, ?it/s]

{'hit_rate': 0.5666666666666667, 'mrr': 0.4615972222222222}

## Hyperparams Optimization


In [28]:
def minsearch_search(query, boost_value):
    boost = {'text': boost_value}
    
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )
    return results


In [29]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [30]:
df_val = df_questions[:100]
df_test = df_questions[100:]

In [31]:
space = {
    'temperature': hp.uniform('temperature', 0.6, 1.2),   
    'top_p': hp.uniform('top_p', 0.7, 1.0),               
    'max_length': hp.quniform('max_length', 512, 1024, 1), 
    'boost': hp.uniform('boost', 0, 3),
    
}

In [32]:

def objective(params):
    print(f"Evaluating with params: {params}")
    
    # temperature = params['temperature']
    # top_p = params['top_p']
    # max_length = int(params['max_length'])
    # boost_value = params['boost']
    
    response = rag(query)

    hit_rate_value = hit_rate(response)  
    mrr_value = mrr(response)
    
    loss = - (hit_rate_value + mrr_value)
    
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,  
    trials=trials
)

print(f"Best hyperparameters: {best}")


Evaluating with params: {'boost': 1.4083920142144328, 'max_length': 898.0, 'temperature': 1.0363851605596968, 'top_p': 0.9234162834045857}
Evaluating with params: {'boost': 2.4824357701591673, 'max_length': 753.0, 'temperature': 1.028863965623605, 'top_p': 0.9998309712816318}
Evaluating with params: {'boost': 0.7441887026830823, 'max_length': 1004.0, 'temperature': 1.1967223109758782, 'top_p': 0.72618723387892}
Evaluating with params: {'boost': 0.7266393322158512, 'max_length': 773.0, 'temperature': 0.822395293004408, 'top_p': 0.810337415056985}
Evaluating with params: {'boost': 1.7332862446524215, 'max_length': 604.0, 'temperature': 1.019658894990053, 'top_p': 0.8274010070681707}
Evaluating with params: {'boost': 2.0265367651186135, 'max_length': 827.0, 'temperature': 1.0664202759601067, 'top_p': 0.9211059882327661}
Evaluating with params: {'boost': 1.109586415780938, 'max_length': 996.0, 'temperature': 1.0775772733800535, 'top_p': 0.7130915839663637}
Evaluating with params: {'boost':

In [33]:
best_temperature = best['temperature']
best_top_p = best['top_p']
best_max_length = int(best['max_length'])
boost = best['boost']
# response = rag(query)
# print(response)


In [34]:
import json

with open('../../data/best_hyperparams.json', 'w') as f:
    json.dump(best, f)


In [35]:
gt_val = df_val.to_dict(orient='records')

In [36]:
def minsearch_search_optimized(query, boost):
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5)

    return results


In [37]:
boost = {'text': best['boost']}
         
evaluate(gt_val, lambda q: minsearch_search_optimized(q['question'], boost))
# para mirar cuanto da con los mejores hyperparam 

  0%|          | 0/100 [00:00<?, ?it/s]

{'hit_rate': 0.53, 'mrr': 0.4268333333333334}

A little bit better :)

## RAG Evaluation

In [38]:
prompt1_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()


In [39]:
len(ground_truth) 

240

In [40]:
ground_truth[0]

{'question': 'How do you evaluate the performance of a machine learning model?',
 'text_id': '86fd49a66d'}

In [41]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [42]:
print(answer_llm)

To evaluate the performance of a machine learning model, there are several metrics that can be used depending on the type of problem being solved and the data available. Some common metrics include:

1. Accuracy: This is the proportion of correctly classified instances in the test dataset. It provides a general overview of the model's performance.
2. Precision: This is the proportion of true positives (correctly predicted instances) among all positive predictions made by the model. It measures the accuracy of the model's predictions for positive instances.
3. Recall: This is the proportion of true positives that were correctly predicted by the model among all actual positive instances. It measures the model's ability to detect all positive instances.
4. F1 Score: This is the harmonic mean of precision and recall, and provides a balanced measure of both.
5. Mean Squared Error (MSE): This is the average squared difference between the predicted values and the actual values in the test dat

In [43]:
prompt = prompt2_template.format(question = question , answer_llm = answer_llm)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: How do you evaluate the performance of a machine learning model?
Generated Answer: To evaluate the performance of a machine learning model, there are several metrics that can be used depending on the type of problem being solved and the data available. Some common metrics include:

1. Accuracy: This is the proportion of correctly classified instances in the test dataset. It provides a general overview of the model's performance.
2. Precision: This is the proportion of true positives (correctly predicted instances) among all positive predictions made by the model. It measures the accuracy of the model's predictions for positive instances.
3. Recall: This is the 

In [44]:
search_results = minsearch_search_optimized(query, boost)
relevance = generate_answer(prompt, search_results)

print(relevance)

{
"Relevance": "RELEVANT",
"Explanation": "The generated answer provides a comprehensive overview of the different metrics that can be used to evaluate the performance of a machine learning model, including accuracy, precision, recall, F1 score, MSE, RMSE, AUC-ROC, and MAE. The answer also highlights the importance of choosing appropriate metrics based on the problem being solved and the data available, as well as using a holdout set or validation set to monitor the model's performance during training. Additionally, the answer mentions the three pillars of ML and data science roles: machine learning algorithms and data intuition, programming and software engineering skills, and execution and communication skills. Overall, the generated answer is relevant to the question and provides useful information for evaluating the performance of a machine learning model."
}


In [45]:
for record in tqdm(ground_truth):
    print(record)

  0%|          | 0/240 [00:00<?, ?it/s]

{'question': 'How do you evaluate the performance of a machine learning model?', 'text_id': '86fd49a66d'}
{'question': 'Can you explain the difference between supervised and unsupervised learning?', 'text_id': '86fd49a66d'}
{'question': 'How do you handle missing data in a machine learning project?', 'text_id': '86fd49a66d'}
{'question': 'What is your approach to debugging a machine learning issue?', 'text_id': '86fd49a66d'}
{'question': 'Can you describe a time when you had to communicate complex machine learning concepts to a non-technical audience?', 'text_id': '86fd49a66d'}
{'question': 'What are some common challenges that data scientists face when working with large datasets?', 'text_id': '9a2356679c'}
{'question': 'How have advances in distributed and parallel computing impacted the machine learning field?', 'text_id': '9a2356679c'}
{'question': 'Can you explain the difference between a machine learning engineer and a product data scientist?', 'text_id': '9a2356679c'}
{'question

In [46]:
evaluations = []

for record in tqdm(ground_truth):
    question = record['question']
    answer_llm = rag(question)
    
    prompt = prompt2_template.format(question = question , answer_llm = answer_llm)
    search_results = minsearch_search_optimized(query, boost)
    relevance = generate_answer(prompt, search_results)
    evaluations.append((record['question'], answer_llm, relevance))

  0%|          | 0/240 [00:00<?, ?it/s]

In [47]:
evaluations[0]

('How do you evaluate the performance of a machine learning model?',
 "To evaluate the performance of a machine learning model, there are several metrics that can be used depending on the type of problem and data available. Some common metrics include:\n\n1. Accuracy: The proportion of correctly classified instances in the test dataset.\n2. Precision: The proportion of true positives (correctly predicted instances) among all positive predictions made by the model.\n3. Recall: The proportion of true positives among all actual positive instances in the test dataset.\n4. F1 score: A measure of balance between precision and recall, calculated as the harmonic mean of precision and recall.\n5. Mean squared error (MSE): A measure of the average squared difference between predicted and actual values for continuous output variables.\n6. Root mean squared error (RMSE): A measure of the average squared difference between predicted and actual values, calculated as the square root of the MSE.\n7. A

In [48]:
df_eval = pd.DataFrame(evaluations, columns=['Question', 'Response', 'Evaluation'])

In [49]:
df_eval

Unnamed: 0,Question,Response,Evaluation
0,How do you evaluate the performance of a machi...,To evaluate the performance of a machine learn...,"{\n""Relevance"": ""RELEVANT"",\n""Explanation"": ""T..."
1,Can you explain the difference between supervi...,"Sure! Here's the answer to the question ""Can y...","{\n ""Relevance"": ""RELEVANT"",\n ""Explanation""..."
2,How do you handle missing data in a machine le...,To handle missing data in a machine learning p...,"{\n""Relevance"": ""RELEVANT"",\n""Explanation"": ""T..."
3,What is your approach to debugging a machine l...,My approach to debugging a machine learning is...,Based on the provided context and the generate...
4,Can you describe a time when you had to commun...,"Certainly! Here's my response:\n\n""I recall a ...","{\n""Relevance"": ""RELEVANT"",\n""Explanation"": ""T..."
...,...,...,...
235,How do benefits like health and dental impact ...,Benefits like health and dental can significan...,Based on the provided context and the generate...
236,Can you provide examples of non-base pay optio...,Sure! Here are some examples of non-base pay o...,Based on the provided context and the generate...
237,How can data scientists ensure they are contri...,To ensure they are contributing to the growth ...,"Based on the generated answer, I would evaluat..."
238,Can you share an experience where reaching out...,Certainly! Here's my answer to the question:\n...,"{\n""Relevance"": ""RELEVANT"",\n""Explanation"": ""T..."


In [50]:
import re 

def categorize_evaluation(text):
    if re.search(r'"NON_RELEVANT"', text):
        return "NON_RELEVANT"
    elif re.search(r'"PARTLY_RELEVANT"', text):
        return "PARTLY_RELEVANT"
    elif re.search(r'"RELEVANT"', text):
        return "RELEVANT"
    else:
        return "UNKNOWN"

df_eval['Category'] = df_eval['Evaluation'].apply(categorize_evaluation)

category_counts = df_eval['Category'].value_counts()

In [51]:
category_counts

Category
RELEVANT           231
PARTLY_RELEVANT      9
Name: count, dtype: int64

In [52]:
normalized_counts = df_eval['Category'].value_counts(normalize= True)
normalized_counts

Category
RELEVANT           0.9625
PARTLY_RELEVANT    0.0375
Name: proportion, dtype: float64