In [1]:
import json
import hashlib
from collections import defaultdict
import ollama
import re


# to initiate ollama on console
# ollama serve
# ollama pull llama2

In [2]:
with open('data/parsed_book.json', 'r') as f_in:
    book_raw = json.load(f_in)

type(book_raw) is a list of dicts

In [3]:
book_raw

[{'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'content': [{'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies 

Flatten the json and redistribute chapter and title

In [4]:
documents = []

for chapter in book_raw:
    chapter_name = chapter['chapter']
    title = chapter['title']

    for doc in chapter['content']:
        if 'text' in doc: 
            new_doc = {
                'chapter': chapter_name,
                'title': title,
                'text': doc['text']
            }
            documents.append(new_doc) 


In [5]:
type(documents)

list

In [6]:
documents[0:5]

[{'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies the process. 

In [7]:
documents = [doc for doc in documents if doc['chapter'] == 'CHAPTER 1'][:2]

In [8]:
len(documents)

2

### Generate ids


In [9]:
def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['chapter']}-{doc['title']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:10]
    return document_id

for doc in documents:
    doc['id'] = generate_document_id(doc)


In [10]:
documents

[{'chapter': 'CHAPTER 1',
  'title': 'Machine Learning Roles and the Interview Process',
  'text': 'In the first part of this chapter, I’ll walk through the structure of this book. Then, I’ll discuss the various job titles and roles that use ML skills in industry. 1 I’ll also clarify the responsibilities of various job titles, such as data scientist, machine learning engineer, and so on, as this is a common point of confusion for job seekers. These will be illustrated with an ML skills matrix and ML lifecycle that will be referenced throughout the book. The second part of this chapter walks through the interview process, from beginning to end. I’ve mentored candidates who appreciated this overview since online resources often focus on specific pieces of the interview but not how they all connect together and result in an offer. Especially for new graduates 2 and readers coming from different industries, this chapter helps get everyone on the same page as well as clarifies the process. 

In [11]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [12]:
len(hashes), len(documents)

(2, 2)

In [13]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [14]:
hashes['005a577345']

[]

In [15]:
with open('data/documents_with_ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

### Ollama

In [16]:
client = ollama.Client()


In [71]:
prompt = """
You are an interviewer preparing for technical interviews for a data scientist position.
Your task is to generate exactly 5 interview questions based on the following text.
The output must be only the questions, don't write an introduction or any other extra text.

The record:

chapter: {chapter}
title: {title}
text: {text}

Respond ONLY with valid JSON:
["question1", "question2", "question3", "question4", "question5"]
"""


In [72]:
# try:
#     response = client.chat(model="llama2", messages=[{"role": "user", "content": "¿Qué es la inteligencia artificial?"}])
#     print("Respuesta del modelo:", response)
# except Exception as e:
#     print(f"Error al interactuar con el modelo: {e}")

In [73]:
# response = client.chat(model="llama2", messages=[{"role": "user", "content": f"{prompt} {doc['text']}"}])
# print(response)

In [74]:
from tqdm.auto import tqdm

In [75]:
# def clean_model_response(content):
#     # Limpia el contenido eliminando cualquier prefijo innecesario
#     cleaned_content = re.sub(r'Here are \d+ interview questions based on the provided text:', '', content)
#     cleaned_content = re.sub(r'Sure! Here are \d+ interview questions based on the provided text in JSON format:', '', cleaned_content)

#     # Extrae solo el contenido dentro de los corchetes []
#     match = re.search(r'\[(.*?)\]', cleaned_content, re.DOTALL)
#     if match:
#         json_content = match.group(1)  # Captura solo el contenido de la lista
#         # Reemplaza las comillas y ajusta el formato
#         questions = re.findall(r'"([^"]+)"', json_content)  # Captura solo los strings
#         return questions  # Devuelve la lista de preguntas
#     else:
#         print(f"Error: no se encontraron preguntas válidas en el contenido limpio.")
#         return []


In [76]:
def generate_questions(doc):
    message_content = prompt.format(chapter=doc['chapter'], title=doc['title'], text=doc['text'])
    
    # Realiza la llamada al cliente para obtener la respuesta
    response = client.chat(model="llama2", messages=[{"role": "user", "content": message_content}])
    
    if 'message' in response and 'content' in response['message']:
        content = response['message']['content']
        print(f"Response content: {content}")  # Para depuración
        
        # Devuelve el contenido tal cual
        return content.strip()  # Eliminar espacios en blanco al inicio y al final
    return ""

In [77]:
results = {}

for doc in tqdm(documents): 
    doc_id = doc['id']
    questions = generate_questions(doc)  # Llama a la función para generar preguntas
    # Almacena las preguntas en el formato correcto
    results[doc_id] = questions 


  0%|          | 0/2 [00:00<?, ?it/s]

Response content: Here are 5 interview questions based on the provided text:

 question1: Can you explain the difference between a data scientist, machine learning engineer, and other related job titles?

 question2: How do you illustrate the responsibilities of various job titles using an ML skills matrix and ML lifecycle?

 question3: What is the purpose of the interview process in selecting candidates for machine learning roles, and how does it vary depending on the job title?

 question4: Can you provide examples of different types of combinations of interviews that may be used to assess a candidate's suitability for a machine learning role?

 question5: How can understanding the structure of this book help candidates prepare for their interview process, and what specific skills or knowledge should they focus on as they progress through the book?
Response content: Sure! Here are 5 interview questions based on the provided text:

question1: Can you describe the different ML roles an

In [70]:
print(f'results = {results}')

results = {'86fd49a66d': "Sure! Here are 5 interview questions for a data scientist position based on the provided text:\n\nquestion1: Can you explain the differences between a data scientist, machine learning engineer, and other related job titles? How do these roles utilize ML skills in industry?\n\nquestion2: How does the ML lifecycle matrix and job title responsibilities help candidates prepare for interviews? What specific skills or knowledge should they focus on to stand out as a candidate?\n\nquestion3: As an interviewer, how do you ensure that the interview process is comprehensive and covers all aspects of the ML role? What types of questions or activities would you use to assess a candidate's suitability for the position?\n\nquestion4: How do you address common confusion points for job seekers in the ML field, such as the differences between data science and machine learning engineering? What advice would you give to candidates on how to best present their skills and experien

In [25]:
with open('data/responses.json', 'r') as f_out:
    json.dump(results, f_out, indent=2)

UnsupportedOperation: not writable

In [24]:
with open('data/responses.json', 'r') as f_in:
    responses_ollama = json.load(f_in)

In [25]:
responses_ollama

{'86fd49a66d': [{'response': 'Here are five potential interview questions for a data scientist position, based on the provided record:\n\n1. Can you explain the difference between a data scientist and a machine learning engineer? How do these roles intersect in industry?\n2. How do you approach a problem of limited data availability or quality when working with a model? What strategies do you use to handle these challenges?\n3. How do you stay up-to-date with the latest developments and advancements in the field of machine learning, particularly in areas outside of your immediate expertise? Can you provide examples?\n4. Can you walk me through your experience with feature engineering and selection? How do you determine which features are most relevant to a given problem or model?\n5. How do you evaluate the performance of a machine learning model, particularly in situations where labeled data is scarce or difficult to obtain? What metrics do you use and why?'},
  {'response': "Here are

### Parse responses

In [27]:
parsed_results = {}

for doc_id, responses in responses_ollama.items():
    parsed_results[doc_id] = responses

In [28]:
doc_index = {d['id']: d for d in documents}

In [30]:
final_results = []

for doc_id, questions in parsed_results.items():
    title = doc_index[doc_id]['title']
    for q in questions:
        final_results.append((q, title, doc_id))

In [None]:
import pandas as pd

In [31]:
# check which columns I want 
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [32]:
df

Unnamed: 0,question,course,document
0,{'response': 'Here are five potential intervie...,Machine Learning Roles and the Interview Process,86fd49a66d
1,{'response': 'Here are five interview question...,Machine Learning Roles and the Interview Process,86fd49a66d
2,{'response': 'Here are five interview question...,Machine Learning Roles and the Interview Process,86fd49a66d
3,{'response': 'Here are five interview question...,Machine Learning Roles and the Interview Process,86fd49a66d
4,{'response': 'Here are five interview question...,Machine Learning Roles and the Interview Process,86fd49a66d
...,...,...,...
95,{'response': 'Here are five interview question...,Machine Learning Roles and the Interview Process,bf941d3293
96,{'response': 'Here are five interview question...,Machine Learning Roles and the Interview Process,bf941d3293
97,{'response': 'Here are five interview question...,Machine Learning Roles and the Interview Process,bf941d3293
98,{'response': 'After passing the first round of...,Machine Learning Roles and the Interview Process,bf941d3293


In [37]:
def clean_response(response_dict):
    response_text = response_dict.get('response', '')  # Extraer el texto de 'response'
    # Eliminar la parte inicial del texto
    return response_text.replace('Here are five potential interview questions for a data scientist position, based on the provided record:\n\n', '')

# Aplicar la función a la columna 'question'
df['question'] = df['question'].apply(clean_response)


In [38]:
df

Unnamed: 0,question,course,document,cleaned_response
0,1. Can you explain the difference between a da...,Machine Learning Roles and the Interview Process,86fd49a66d,1. Can you explain the difference between a da...
1,Here are five interview questions for a data s...,Machine Learning Roles and the Interview Process,86fd49a66d,Here are five interview questions for a data s...
2,Here are five interview questions based on the...,Machine Learning Roles and the Interview Process,86fd49a66d,Here are five interview questions based on the...
3,Here are five interview questions based on the...,Machine Learning Roles and the Interview Process,86fd49a66d,Here are five interview questions based on the...
4,Here are five interview questions based on the...,Machine Learning Roles and the Interview Process,86fd49a66d,Here are five interview questions based on the...
...,...,...,...,...
95,Here are five interview questions that an inte...,Machine Learning Roles and the Interview Process,bf941d3293,Here are five interview questions that an inte...
96,Here are five interview questions based on the...,Machine Learning Roles and the Interview Process,bf941d3293,Here are five interview questions based on the...
97,Here are five interview questions that an inte...,Machine Learning Roles and the Interview Process,bf941d3293,Here are five interview questions that an inte...
98,After passing the first round of technical int...,Machine Learning Roles and the Interview Process,bf941d3293,After passing the first round of technical int...


In [None]:
df.to_csv('ground_truth_data.csv', index=False)