In [1]:
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../data/gold/data.csv')
documents = df.to_dict(orient='records')

In [3]:
documents[0]

{'id': 0,
 'section': 'Overview of This Book',
 'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies the process. The interconnecting pieces of interviews ar

## Split into chunks/sentence

In [4]:
import spacy

nlp = spacy.load("es_core_news_sm") 

In [5]:
def split_into_sentence_chunks(text, base_id):
    doc = nlp(text)  
    sentences = [sent.text for sent in doc.sents] 

    chunked_texts = []
    current_chunk = []

    for sentence in sentences:
        current_chunk.append(sentence)

        if len(current_chunk) >= 3:  
            chunked_texts.append(' '.join(current_chunk))
            current_chunk = []  

    if current_chunk:
        chunked_texts.append(' '.join(current_chunk))

    chunk_ids = [f"{base_id}_{i + 1}" for i in range(len(chunked_texts))]

    return [{'chunk_id': chunk_id, 'chunk_text': chunk_text, 'text_id': base_id} 
            for chunk_id, chunk_text in zip(chunk_ids, chunked_texts)]

chunked_docs = []
for doc in documents:
    if 'text' in doc and 'text_id' in doc:  
        chunks = split_into_sentence_chunks(doc['text'], doc['text_id'])
        chunked_docs.extend(chunks)  



In [6]:
chunked_docs[0:2]

[{'chunk_id': '86fd49a66d_1',
  'chunk_text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers.',
  'text_id': '86fd49a66d'},
 {'chunk_id': '86fd49a66d_2',
  'chunk_text': 'These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer.',
  'text_id': '86fd49a66d'}]

## minsearch

In [7]:
from minsearch import Index


In [8]:
index = Index(
    text_fields=[ "chunk_text"],
    keyword_fields=["text_id", "chunk_id"]
)


index.fit(chunked_docs)

<minsearch.minsearch.Index at 0x785522502350>

In [9]:
query = 'what is the scope of a data scientist?'

def search(query, boost= None):
    if boost is None:
        boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5
    )

    return results

search_results = search(query)
search_results

[{'chunk_id': '6d22154063_1',
  'chunk_text': 'Here is a nonexhaustive list of job titles for ML (or closely related) roles: • Data scientist • Machine learning engineer • Applied scientist • Software engineer, machine learning • MLOps engineer • Product data scientist • Data analyst • Decision scientist • Research scientist As I discussed “A Brief History of Machine Learning and Data Science Job Titles” on page 3 , each role is responsible for a different part of the ML lifecycle. A job title alone does not convey what the job entails. As a job seeker, be warned: in different companies, completely different titles might end up doing similar jobs!',
  'text_id': '6d22154063'},
 {'chunk_id': '6d22154063_2',
  'chunk_text': 'As illustrated in Figure 1-3 , your ML job title will depend on the company, the team, and which part(s) of the ML lifecycle your role is responsible for. To give specific examples of how job titles can depend on the company or organiza‐ tion that is hiring for the j

## rag

In [10]:
import ollama
from tqdm.auto import tqdm

# to initiate ollama on console for the first time
# ollama serve
# ollama pull llama2

In [11]:
client = ollama.Client()



## Prompt


In [12]:
prompt_template = """
You are an assistant preparing a candidate for a data science job interview. 
Based on the provided context, please provide a concise and accurate answer to the following question in plain text format without any additional formatting.

QUESTION: {question}

CONTEXT:
{context}

The answer has to be plain text
""".strip()


In [13]:
query

'what is the scope of a data scientist?'

In [14]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        text = doc['chunk_text']
            
        context += f"Text: {text}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

# prompt = build_prompt(query, search_results)
# print(prompt)

In [15]:
def generate_answer(query, search_results):
    # Generar el contenido del mensaje utilizando la función build_prompt
    message_content = build_prompt(query, search_results)
    
    # Hacer la llamada al modelo Llama2
    response = client.chat(model="llama2", messages=[{"role": "user", "content": message_content}])
    
    # Verificar la respuesta y extraer el contenido
    if 'message' in response and 'content' in response['message']:
        content = response['message']['content']
        
        return content.strip()  
    return ""  


In [16]:
response = generate_answer(query, search_results)
print(response)

A data scientist's scope can vary depending on the company, team, and specific job title. However, some common responsibilities of a data scientist include:

* Working with stakeholders to gather requirements and understand their needs
* Collecting, cleaning, and analyzing large datasets using tools such as BigQuery or SQL
* Performing statistical modeling and machine learning tasks such as linear regression or logistic regression
* Building and deploying machine learning models
* Communicating findings and insights to stakeholders through reports or presentations
* Collaborating with other teams, such as engineering or product development, to integrate data science findings into products or services
* Keeping up-to-date with the latest developments in the field of data science and machine learning.


In [22]:
def rag(query):
    search_results = search(query)
    response = generate_answer(query, search_results)
    return response

    

In [23]:
query = 'Which skills are important for a data scietist?'
print(rag(query))

According to the provided context, the following skills are important for a data scientist:

1. Machine learning algorithms and data intuition
2. Programming and software engineering skills
3. Execution and communication skills
4. Domain knowledge
5. Real and relevant experience that can be applied to the job, including transferable skills
6. Soft skills, such as working well with people in the team and communicating with broader groups of people
7. Technical skills, such as individual technical contributions.


## Retrieval evaluation

In [24]:
import pandas as pd

df = pd.read_csv('../data/ground_truth_data.csv')
df

Unnamed: 0,question,section,chapter,text_id
0,Can you describe the different job titles and ...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
1,How does the author of the chapter explain the...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
2,"According to the text, what is the main focus ...",Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
3,What is the purpose of the figure provided in ...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
4,How does the author encourage readers to appro...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
...,...,...,...,...
224,Can you tell us about a time when you gained d...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
225,How do you approach meeting relevant people du...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
226,Can you describe a situation where you had to ...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
227,How do you keep track of your achievements dur...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2


In [25]:
df_questions = df[['question', 'text_id']]

In [26]:
ground_truth = df_questions.to_dict(orient = 'records')
ground_truth[0]

{'question': 'Can you describe the different job titles and roles that use machine learning skills in industry?',
 'text_id': '86fd49a66d'}

In [27]:
ground_truth[0]

{'question': 'Can you describe the different job titles and roles that use machine learning skills in industry?',
 'text_id': '86fd49a66d'}

In [28]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if isinstance(line, (list, tuple)) and True in line:
            cnt += 1
        # elif line is True:
        #     cnt += 1

    return cnt / len(relevance_total)


In [29]:
def mrr(relevance_total):
    total_score = 0.0
    num_queries = len(relevance_total)

    for line in relevance_total:
        query_score = 0.0
        for rank in range(len(line)):
            if line[rank] == True:
                query_score = 1 / (rank + 1)
                break  # Solo necesitamos el primero que sea True

        total_score += query_score

    # Evitar división por cero si no hay consultas
    return total_score / num_queries if num_queries > 0 else 0.0


In [30]:
# from minsearch import Index

# index = Index(
#     text_fields=[ "text"],
#     keyword_fields=["text_id"]
# )


def minsearch_search(query):
    boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5)

    return results



In [31]:
from tqdm.auto import tqdm


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['text_id']
        results = search_function(q)
        relevance = [d['text_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [32]:
from tqdm.auto import tqdm

In [33]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/229 [00:00<?, ?it/s]

{'hit_rate': 0.777292576419214, 'mrr': 0.6283842794759823}

## Hyperparams Optimization


In [34]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [35]:
df_val = df_questions[:100]
df_test = df_questions[100:]

In [38]:
space = {
    'temperature': hp.uniform('temperature', 0.6, 1.2),   
    'top_p': hp.uniform('top_p', 0.7, 1.0),               
    'max_length': hp.quniform('max_length', 512, 1024, 1), 
    'boost': hp.uniform('boost', 0, 3),
    
}

In [40]:

def objective(params):
    print(f"Evaluating with params: {params}")
    
    # Aquí puedes usar Llama2 con los parámetros y retornar la métrica
    temperature = params['temperature']
    top_p = params['top_p']
    max_length = int(params['max_length'])
    boost_value = params['boost']
    
    # Simula la ejecución de Llama2 con los hiperparámetros
    response = rag(query['query'])

    # Calcula las métricas que deseas (hit rate y MRR)
    hit_rate_value = hit_rate(response)  # Asegúrate de que la función hit_rate esté bien definida
    mrr_value = mrr(response)
    
    # Invertir las métricas si estamos minimizando
    loss = - (hit_rate_value + mrr_value)
    
    return {'loss': loss, 'status': STATUS_OK}

# Inicializar Trials para almacenar resultados
trials = Trials()

# Ejecución de la optimización con TPE (Tree-structured Parzen Estimator)
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,  # Número de iteraciones a ejecutar
    trials=trials
)

# Guardar los mejores hiperparámetros
print(f"Best hyperparameters: {best}")


Evaluating with params: {'boost': 0.8687004153456102, 'max_length': 539.0, 'temperature': 0.9895199132279571, 'top_p': 0.8612587350205487}
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

job exception: string indices must be integers



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


TypeError: string indices must be integers

In [None]:


best_temperature = best['temperature']
best_top_p = best['top_p']
best_max_length = int(best['max_length'])

response = rag(query)
print(response)


In [97]:
import json

with open('../data/best_hyperparams.json', 'w') as f:
    json.dump(best, f)


In [98]:
gt_val = df_val.to_dict(orient='records')

In [None]:
def minsearch_search(query, boost = None):
    if boost is None:
        boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=5)

    return results


In [None]:
documents[0]

In [None]:
evaluate(gt_val, lambda q: minsearch_search(q['question']))
# para mirar cuanto da con los mejores hyperparam 

In [None]:
import numpy as np

# Definir los rangos de boost
text_range = np.arange(0.1, 3.1, 0.1)  # Rango para 'text' de 0.1 a 3.0
section_range = np.arange(0.1, 1.1, 0.1)  # Rango para 'section' de 0.1 a 1.0

best_metrics = None
best_boost = None

# Iterar sobre todos los valores en los rangos
for text_boost in text_range:
    for section_boost in section_range:
        boost = {'text': text_boost, 'section': section_boost}
        
        # Realizar la búsqueda optimizada
        results = minsearch_search(query)
        print("Resultados de la búsqueda:", results)

        # Evalúa los resultados
        hit_rate_value = hit_rate(results)  # Implementa esta función para calcular el hit rate
        mrr_value = mrr(results)            # Usa tu función mrr

        # Guardar los mejores resultados
        if best_metrics is None or (hit_rate_value + mrr_value) > (best_metrics['hit_rate'] + best_metrics['mrr']):
            best_metrics = {'hit_rate': hit_rate_value, 'mrr': mrr_value}
            best_boost = boost

print(f"Mejores hiperparámetros de boost: {best_boost}, con métricas: {best_metrics}")


A little bit better :)

In [100]:
def minsearch_search_optimized(query):
    # boost = {'text': 3.0, 'section': 0.5} AQUI VAN LOS HYPERPARAMS 
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=2
    )

    return results



In [None]:
evaluate(ground_truth, lambda q: minsearch_search_optimized(q['question']))
#copiar y pegar en el README
# tb los best boosting params


## RAG Evaluation

In [None]:
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()


In [None]:
len(ground_truth) 