## minsearch

In [1]:
import pandas as pd
from minsearch import Index

df = pd.read_csv('../data/gold/data.csv')
documents = df.to_dict(orient='records')

In [2]:
index = Index(
    text_fields=["chapter", "title", "section", "text"],
    keyword_fields=["id"]
)


index.fit(documents)

<minsearch.minsearch.Index at 0x7e8c16bf7310>

In [3]:
query = 'what is the scope of a data scientist?'

index.search(query, num_results = 2)

[{'id': 1,
  'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'section': 'A Brief History of Machine Learning and Data Science Job Titles',
  'text': 'First, let’s walk through a brief history of job titles. I decided to start with this section to dispel some myths about the “data scientist” job title and shed some light on why there are so many ML-related job titles. After understanding this history, you should be more aware of what job titles to aim for yourself. If you’ve ever been confused about the litany of titles such as machine learning engineer (MLE), product data sci‐ entist, MLOps engineer, and more, this section is for you. ML techniques aren’t a new thing; in 1985, David Ackley, Geoffrey E. Hinton, and Terrence J. Sejnowski popularized the Boltzmann Machine algorithm. 3 Even before that, regression techniques 4 had early developments in the 1800s. There have long been jobs and roles that use modeling techniques to forecast and predic

## rag

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 

torch.random.manual_seed(0)




<torch._C.Generator at 0x7e8c14df5e10>

In [5]:
print("Usando dispositivo:", "GPU" if torch.cuda.is_available() else "CPU")

Usando dispositivo: CPU


In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.eval()





GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
def search(query):
    boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=2
    )

    return results

search_results = search(query)
search_results

[{'id': 1,
  'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'section': 'A Brief History of Machine Learning and Data Science Job Titles',
  'text': 'First, let’s walk through a brief history of job titles. I decided to start with this section to dispel some myths about the “data scientist” job title and shed some light on why there are so many ML-related job titles. After understanding this history, you should be more aware of what job titles to aim for yourself. If you’ve ever been confused about the litany of titles such as machine learning engineer (MLE), product data sci‐ entist, MLOps engineer, and more, this section is for you. ML techniques aren’t a new thing; in 1985, David Ackley, Geoffrey E. Hinton, and Terrence J. Sejnowski popularized the Boltzmann Machine algorithm. 3 Even before that, regression techniques 4 had early developments in the 1800s. There have long been jobs and roles that use modeling techniques to forecast and predic

## Prompt


In [8]:
prompt_template = """
You are an assistant preparing a candidate for a data science interview. 
Based on the provided context, please provide a concise and accurate answer to the following question. 

QUESTION: {question}

CONTEXT:
{context}
""".strip()


In [9]:
def build_context(search_results):
    context = []
    
    for result in search_results:
        chapter = result.get("chapter", "")
        title = result.get("title", "")
        section = result.get("section", "")
        text = result.get("text", "")
        
        context.append(f"Chapter: {chapter}\nTitle: {title}\nSection: {section}\nText: {text}")
    
    return "\n\n".join(context)

context = build_context(search_results)
print(context)


Chapter: CHAPTER 1
Title: Machine Learning Roles and the Interview Process
Section: A Brief History of Machine Learning and Data Science Job Titles
Text: First, let’s walk through a brief history of job titles. I decided to start with this section to dispel some myths about the “data scientist” job title and shed some light on why there are so many ML-related job titles. After understanding this history, you should be more aware of what job titles to aim for yourself. If you’ve ever been confused about the litany of titles such as machine learning engineer (MLE), product data sci‐ entist, MLOps engineer, and more, this section is for you. ML techniques aren’t a new thing; in 1985, David Ackley, Geoffrey E. Hinton, and Terrence J. Sejnowski popularized the Boltzmann Machine algorithm. 3 Even before that, regression techniques 4 had early developments in the 1800s. There have long been jobs and roles that use modeling techniques to forecast and predict. Econome‐ tricians, statisticians, 

In [10]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        chapter = doc['chapter']
        title = doc['title']
    
        section = doc['section']
        text = doc['text']
            
        context += f"Chapter: {chapter}\nTitle: {title}\nSection: {section}\nText: {text}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

prompt = build_prompt(query, search_results)
print(prompt)

You are an assistant preparing a candidate for a data science interview. 
Based on the provided context, please provide a concise and accurate answer to the following question. 

QUESTION: what is the scope of a data scientist?

CONTEXT:
Chapter: CHAPTER 1
Title: Machine Learning Roles and the Interview Process
Section: A Brief History of Machine Learning and Data Science Job Titles
Text: First, let’s walk through a brief history of job titles. I decided to start with this section to dispel some myths about the “data scientist” job title and shed some light on why there are so many ML-related job titles. After understanding this history, you should be more aware of what job titles to aim for yourself. If you’ve ever been confused about the litany of titles such as machine learning engineer (MLE), product data sci‐ entist, MLOps engineer, and more, this section is for you. ML techniques aren’t a new thing; in 1985, David Ackley, Geoffrey E. Hinton, and Terrence J. Sejnowski popularized 

## llm

In [11]:
def llm(prompt):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True,  max_length=1023)
    with torch.no_grad():
        outputs = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1,max_new_tokens=50) 
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [12]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)    
    answer = llm(prompt)

    return answer

In [13]:
print(rag(query))

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


IndexError: index out of range in self

## Retrieval evaluation

In [14]:
df_questions = pd.read_csv('../data/ground_truth_data.csv')

In [15]:
df_questions

Unnamed: 0,question,section,chapter,document
0,Can you describe the different job titles and ...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
1,How does the author of the chapter explain the...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
2,"According to the text, what is the main focus ...",Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
3,What is the purpose of the figure provided in ...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
4,How does the author encourage readers to appro...,Machine Learning Roles and the Interview Process,CHAPTER 1,86fd49a66d
...,...,...,...,...
224,Can you tell us about a time when you gained d...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
225,How do you approach meeting relevant people du...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
226,Can you describe a situation where you had to ...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2
227,How do you keep track of your achievements dur...,Post-Interview and Follow-up,CHAPTER 9,2ca59d8bf2


In [16]:
ground_truth = df_questions.to_dict(orient = 'records')

In [17]:
ground_truth[0]

{'question': 'Can you describe the different job titles and roles that use machine learning skills in industry?',
 'section': 'Machine Learning Roles and the Interview Process',
 'chapter': 'CHAPTER 1',
 'document': '86fd49a66d'}

In [None]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [None]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [None]:
def minsearch_search(query):
    boost = {}
    # boost = {'text': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=2
    )

    return results



In [None]:
from tqdm.auto import tqdm


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [None]:
from tqdm.auto import tqdm

In [None]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

In [None]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

## Hyperparams Optimization


In [18]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
df_val = df_questions[:100]
df_test = df_questions[100:]

### Search space

In [None]:
from hyperopt import hp

# Espacio de búsqueda para Hyperopt
space = {
    'temperature': hp.uniform('temperature', 0.5, 1.5),  # Ajuste de temperatura
    'max_new_tokens': hp.choice('max_new_tokens', [20, 50, 100]),  # Límite de tokens generados
}

### Loss function

In [None]:
def objective(params):
    temperature = params['temperature']
    max_new_tokens = params['max_new_tokens']

    # Configurar el modelo con los parametros de arriba que faltan eh?
    output_tokens = model.generate(
        **inputs, 
        pad_token_id=tokenizer.eos_token_id, num_return_sequences=num_return_sequences,max_new_tokens=max_new_tokens) 
    )
    
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True).strip()

    # Evaluar la calidad del texto generado con las métricas definidas
    # Asumamos que tienes una función calculate_mrr que recibe las predicciones y el ground truth
    mrr_score = mrr(generated_text)
    
    # Devuelve la métrica negativa para que Hyperopt lo minimice
    return {'loss': -mrr_score, 'status': 'ok'}


In [None]:
from hyperopt import fmin, tpe, Trials

# Inicializar el historial de pruebas
trials = Trials()

# Ejecutar la búsqueda
best = fmin(
    fn=objective,  # Función objetivo
    space=space,  # Espacio de búsqueda
    algo=tpe.suggest,  # Algoritmo de optimización
    max_evals=10,  # Número de evaluaciones
    trials=trials  # Historial de pruebas
)

print("Mejores hiperparámetros encontrados:", best)



In [None]:
gt_val = df_val.dict(orient='records')

In [None]:
evaluate(gt_val, lambda q: minsearch_search(q['question']))
# para mirar cuanto da con los mejores hyperparam 

In [None]:
def minsearch_search_optimized(query):
    # boost = {'text': 3.0, 'section': 0.5} AQUI VAN LOS HYPERPARAMS 
    
    results = index.search(
        query=query,
        filter_dict = {},
        boost_dict=boost,
        num_results=2
    )

    return results



In [None]:
evaluate(ground_truth, lambda q: minsearch_search_optimized(q['question']))
#copiar y pegar en el README
# tb los best boosting params


## RAG Evaluation

In [None]:
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()


In [None]:
len(ground_truth) 