In [1]:
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../data/gold/data.csv')
documents = df.to_dict(orient='records')

In [3]:
documents[0]

{'id': 0,
 'section': 'Overview of This Book',
 'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies the process. The interconnecting pieces of interviews ar

## Split into chunks/sentence

In [4]:
import spacy

nlp = spacy.load("es_core_news_sm") 

In [5]:
def split_into_sentence_chunks(text, base_id):
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents] 

    chunked_texts = []
    current_chunk = []

    for sentence in sentences:
        current_chunk.append(sentence)

        if len(current_chunk) >= 3:  
            chunked_texts.append(' '.join(current_chunk))
            current_chunk = []  

    if current_chunk:
        chunked_texts.append(' '.join(current_chunk))

    chunk_ids = [f"{base_id}_{i + 1}" for i in range(len(chunked_texts))]

    return [{'chunk_id': chunk_id, 'chunk_text': chunk_text, 'text_id': base_id} 
            for chunk_id, chunk_text in zip(chunk_ids, chunked_texts)]

chunked_docs = []
for doc in documents:
    if 'text' in doc and 'text_id' in doc:  
        chunks = split_into_sentence_chunks(doc['text'], doc['text_id'])
        chunked_docs.extend(chunks)  



In [6]:
chunked_docs[0:2]

[{'chunk_id': '86fd49a66d_1',
  'chunk_text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers.',
  'text_id': '86fd49a66d'},
 {'chunk_id': '86fd49a66d_2',
  'chunk_text': 'These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer.',
  'text_id': '86fd49a66d'}]

## minsearch

In [7]:
from minsearch import Index


In [8]:
index = Index(
    text_fields=[ "chunk_text"],
    keyword_fields=["text_id", "chunk_id"]
)


index.fit(chunked_docs)

<minsearch.minsearch.Index at 0x7e22b458e1a0>

In [9]:
query = 'what is the scope of a data scientist?'

def search(query, boost= None):
    if boost is None:
        boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5
    )

    return results

# search_results = search(query)
# search_results

## rag

In [10]:
import ollama
from tqdm.auto import tqdm

# to initiate ollama on console for the first time
# ollama serve
# ollama pull llama2

In [11]:
client = ollama.Client()



## Prompt


In [12]:
prompt_template = """
You are an assistant preparing a candidate for a data science job interview. 
Based on the provided context, please provide a concise and accurate answer to the following question in plain text format without any additional formatting.

QUESTION: {question}

CONTEXT:
{context}

The answer has to be plain text
""".strip()


In [13]:
query

'what is the scope of a data scientist?'

In [14]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        text = doc['chunk_text']
            
        context += f"Text: {text}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

# prompt = build_prompt(query, search_results)
# print(prompt)

In [15]:
def generate_answer(query, search_results):
    # Generar el contenido del mensaje utilizando la función build_prompt
    message_content = build_prompt(query, search_results)
    
    # Hacer la llamada al modelo Llama2
    response = client.chat(model="llama2", messages=[{"role": "user", "content": message_content}])
    
    # Verificar la respuesta y extraer el contenido
    if 'message' in response and 'content' in response['message']:
        content = response['message']['content']
        
        return content.strip()  
    return ""  


In [16]:
# response = generate_answer(query, search_results)
# print(response)

In [17]:
def rag(query):
    search_results = search(query)
    response = generate_answer(query, search_results)
    return response

    

In [18]:
query = 'Which skills are important for a data scietist?'
print(rag(query))

The following skills are important for a data scientist:

1. Machine learning algorithms and data intuition
2. Programming and software engineering skills
3. Execution and communication skills
4. Domain knowledge
5. Real and relevant experience that can be applied to the job, including transferable skills
6. Soft skills, such as working well with people in a team and communicating with broader groups of people
7. Technical skills, such as making individual technical contributions.


## Retrieval evaluation

In [19]:
import pandas as pd

df = pd.read_csv('../data/ground_truth_data.csv')
df

Unnamed: 0,question,section,chapter,text_id
0,Can you describe the different job titles and ...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
1,How does the author of the chapter explain the...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
2,"According to the text, what is the main focus ...",Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
3,What is the purpose of the figure provided in ...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
4,How does the author encourage readers to appro...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
...,...,...,...,...
224,Can you tell us about a time when you gained d...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
225,How do you approach meeting relevant people du...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
226,Can you describe a situation where you had to ...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
227,How do you keep track of your achievements dur...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2


In [20]:
df_questions = df[['question', 'text_id']]

In [21]:
ground_truth = df_questions.to_dict(orient = 'records')
ground_truth[0]

{'question': 'Can you describe the different job titles and roles that use machine learning skills in industry?',
 'text_id': '86fd49a66d'}

In [22]:
ground_truth[0]

{'question': 'Can you describe the different job titles and roles that use machine learning skills in industry?',
 'text_id': '86fd49a66d'}

In [23]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if isinstance(line, (list, tuple)) and True in line:
            cnt += 1
        # elif line is True:
        #     cnt += 1

    return cnt / len(relevance_total)


In [24]:
def mrr(relevance_total):
    total_score = 0.0
    num_queries = len(relevance_total)

    for line in relevance_total:
        query_score = 0.0
        for rank in range(len(line)):
            if line[rank] == True:
                query_score = 1 / (rank + 1)
                break  # Solo necesitamos el primero que sea True

        total_score += query_score

    # Evitar división por cero si no hay consultas
    return total_score / num_queries if num_queries > 0 else 0.0


In [25]:
def minsearch_search(query):
    boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5)

    return results



In [26]:
from tqdm.auto import tqdm


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['text_id']
        results = search_function(q)
        relevance = [d['text_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [27]:
from tqdm.auto import tqdm

In [28]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/229 [00:00<?, ?it/s]

{'hit_rate': 0.777292576419214, 'mrr': 0.6283842794759823}

## Hyperparams Optimization


In [29]:
def minsearch_search(query, boost_value):
    boost = {'text': boost_value}
    
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )
    return results


In [30]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [31]:
df_val = df_questions[:100]
df_test = df_questions[100:]

In [32]:
space = {
    'temperature': hp.uniform('temperature', 0.6, 1.2),   
    'top_p': hp.uniform('top_p', 0.7, 1.0),               
    'max_length': hp.quniform('max_length', 512, 1024, 1), 
    'boost': hp.uniform('boost', 0, 3),
    
}

In [33]:

def objective(params):
    print(f"Evaluating with params: {params}")
    
    # temperature = params['temperature']
    # top_p = params['top_p']
    # max_length = int(params['max_length'])
    # boost_value = params['boost']
    
    response = rag(query)

    hit_rate_value = hit_rate(response)  
    mrr_value = mrr(response)
    
    loss = - (hit_rate_value + mrr_value)
    
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,  
    trials=trials
)

print(f"Best hyperparameters: {best}")


Evaluating with params: {'boost': 0.19169097176227612, 'max_length': 639.0, 'temperature': 0.6073720101599442, 'top_p': 0.9275859294389914}
Evaluating with params: {'boost': 1.7026505167577843, 'max_length': 719.0, 'temperature': 0.8147334649501844, 'top_p': 0.7398525222990507}
Evaluating with params: {'boost': 1.6703663822178891, 'max_length': 1000.0, 'temperature': 0.857998343051559, 'top_p': 0.8407265269806637}
Evaluating with params: {'boost': 0.6973063534893288, 'max_length': 809.0, 'temperature': 1.0343489605907668, 'top_p': 0.8794319043356899}
Evaluating with params: {'boost': 2.740640172730731, 'max_length': 592.0, 'temperature': 1.193254984078656, 'top_p': 0.781041600566971}
Evaluating with params: {'boost': 0.05884893918022149, 'max_length': 882.0, 'temperature': 0.610642616050742, 'top_p': 0.9506859694251677}
Evaluating with params: {'boost': 0.6589882838732926, 'max_length': 567.0, 'temperature': 0.768214284742639, 'top_p': 0.8277815405287754}
Evaluating with params: {'boos

In [34]:
best_temperature = best['temperature']
best_top_p = best['top_p']
best_max_length = int(best['max_length'])
boost = best['boost']
# response = rag(query)
# print(response)


In [35]:
import json

with open('../data/best_hyperparams.json', 'w') as f:
    json.dump(best, f)


In [36]:
gt_val = df_val.to_dict(orient='records')

In [37]:
def minsearch_search_optimized(query, boost):
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5)

    return results


In [38]:
boost = {'text': best['boost']}
         
evaluate(gt_val, lambda q: minsearch_search_optimized(q['question'], boost))
# para mirar cuanto da con los mejores hyperparam 

  0%|          | 0/100 [00:00<?, ?it/s]

{'hit_rate': 0.83, 'mrr': 0.6703333333333333}

A little bit better :)

## RAG Evaluation

In [39]:
prompt1_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()


In [40]:
len(ground_truth) 

229

In [49]:
ground_truth[0]

{'question': 'Can you describe the different job titles and roles that use machine learning skills in industry?',
 'text_id': '86fd49a66d'}

In [41]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [42]:
print(answer_llm)

There are several job titles and roles that use machine learning skills in industry, including:

1. Data Scientist: responsible for collecting, analyzing, and interpreting large datasets to gain insights and inform business decisions.
2. Machine Learning Engineer: focuses on developing and deploying machine learning models and algorithms to solve complex problems.
3. Applied Scientist: applies scientific knowledge and techniques to real-world problems in a practical setting.
4. Software Engineer, Machine Learning: designs and develops software applications that involve machine learning techniques.
5. MLOps Engineer: responsible for the operationalization of machine learning models, including deployment, monitoring, and maintenance.
6. Product Data Scientist: works closely with product teams to design and implement data-driven products and features.
7. Data Analyst: analyzes and interprets data to help organizations make informed decisions.
8. Decision Scientist: uses statistical models

In [43]:
prompt = prompt2_template.format(question = question , answer_llm = answer_llm)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: Can you describe the different job titles and roles that use machine learning skills in industry?
Generated Answer: There are several job titles and roles that use machine learning skills in industry, including:

1. Data Scientist: responsible for collecting, analyzing, and interpreting large datasets to gain insights and inform business decisions.
2. Machine Learning Engineer: focuses on developing and deploying machine learning models and algorithms to solve complex problems.
3. Applied Scientist: applies scientific knowledge and techniques to real-world problems in a practical setting.
4. Software Engineer, Machine Learning: designs and develops software app

In [46]:
search_results = minsearch_search_optimized(query, boost)
relevance = generate_answer(prompt, search_results)

print(relevance)

Based on the provided context and the generated answer, I would evaluate it as "RELEVANT". The answer provides a comprehensive list of job titles and roles that use machine learning skills in industry, including data scientist, machine learning engineer, applied scientist, software engineer, MLOps engineer, product data scientist, data analyst, decision scientist, and research scientist. These are indeed relevant roles in the field of data science and machine learning, as noted in the context. The answer also provides some explanation for each role and their overlap in skills, which is helpful for the candidate's understanding. Therefore, I would give a relevance evaluation of "RELEVANT" for this answer.


In [51]:
for record in tqdm(ground_truth):
    print(record)

  0%|          | 0/229 [00:00<?, ?it/s]

{'question': 'Can you describe the different job titles and roles that use machine learning skills in industry?', 'text_id': '86fd49a66d'}
{'question': 'How does the author of the chapter explain the confusion among job seekers regarding the differences between various machine learning roles?', 'text_id': '86fd49a66d'}
{'question': 'According to the text, what is the main focus of Chapter 2 of the book?', 'text_id': '86fd49a66d'}
{'question': 'What is the purpose of the figure provided in the chapter (Figure 1-1)?', 'text_id': '86fd49a66d'}
{'question': "How does the author encourage readers to approach the book's content?", 'text_id': '86fd49a66d'}
{'question': 'Can you explain the evolution of job titles in the field of machine learning and data science over the past decade?', 'text_id': '9a2356679c'}
{'question': 'According to the text, what are some of the recent advancements in distributed and parallel computing that have enabled large-scale machine learning projects?', 'text_id':

In [53]:
evaluations = []

for record in tqdm(ground_truth):
    question = record['question']
    answer_llm = rag(question)
    
    prompt = prompt2_template.format(question = question , answer_llm = answer_llm)
    search_results = minsearch_search_optimized(query, boost)
    relevance = generate_answer(prompt, search_results)
    evaluations.append((record['question'], answer_llm, relevance))

  0%|          | 0/229 [00:00<?, ?it/s]

In [55]:
evaluations[0]

('Can you describe the different job titles and roles that use machine learning skills in industry?',
 "Of course! Here are some common job titles and roles that use machine learning skills in industry:\n\n1. Data Scientist: responsible for analyzing and interpreting complex data sets, developing predictive models, and communicating insights to stakeholders.\n2. Machine Learning Engineer: focuses on building and deploying machine learning models within a production environment, optimizing model performance, and ensuring the scalability and reliability of the model.\n3. Applied Scientist: applies scientific techniques and methods to solve real-world problems, often working in collaboration with domain experts.\n4. Software Engineer, Machine Learning: responsible for developing software that incorporates machine learning algorithms and techniques.\n5. MLOps Engineer: focuses on the operationalization of machine learning models, including model deployment, monitoring, and maintenance.\n6.

In [60]:
df_eval = pd.DataFrame(evaluations, columns=['Question', 'Response', 'Evaluation'])

In [61]:
df_eval

Unnamed: 0,Question,Response,Evaluation
0,Can you describe the different job titles and ...,Of course! Here are some common job titles and...,"Based on the provided context and question, I ..."
1,How does the author of the chapter explain the...,"According to the author of the chapter, job se...",Based on the provided context and the generate...
2,"According to the text, what is the main focus ...","According to the text, the main focus of Chapt...","{\n""Relevance"": ""RELEVANT"",\n""Explanation"": ""T..."
3,What is the purpose of the figure provided in ...,The purpose of the figure provided in Chapter ...,"{\n""Relevance"": ""RELEVANT"",\n""Explanation"": ""T..."
4,How does the author encourage readers to appro...,The author encourages readers to approach the ...,Based on the provided context and generated an...
...,...,...,...
224,Can you tell us about a time when you gained d...,Sure! Here's how I would answer the question a...,"Based on the provided answer, I would evaluate..."
225,How do you approach meeting relevant people du...,"During the onboarding process, I approach meet...","{\n""Relevance"": ""RELEVANT"",\n""Explanation"": ""T..."
226,Can you describe a situation where you had to ...,Sure! Here's how I would answer the question:\...,"Based on the generated answer, I would evaluat..."
227,How do you keep track of your achievements dur...,To keep track of my achievements during my fir...,"Based on the content and context provided, I w..."


In [68]:
import re 

def categorize_evaluation(text):
    if re.search(r'"NON_RELEVANT"', text):
        return "NON_RELEVANT"
    elif re.search(r'"PARTLY_RELEVANT"', text):
        return "PARTLY_RELEVANT"
    elif re.search(r'"RELEVANT"', text):
        return "RELEVANT"
    else:
        return "UNKNOWN"

df_eval['Category'] = df_eval['Evaluation'].apply(categorize_evaluation)

category_counts = df_eval['Category'].value_counts()

In [69]:
category_counts

Category
RELEVANT           213
PARTLY_RELEVANT     16
Name: count, dtype: int64

In [71]:
normalized_counts = df_eval['Category'].value_counts(normalize= True)
normalized_counts

Category
RELEVANT           0.930131
PARTLY_RELEVANT    0.069869
Name: proportion, dtype: float64