In [None]:
import json
import hashlib
from collections import defaultdict
import subprocess
import ollama

In [2]:
# Load the parsed book. The file is decoded explicitly as UTF-8 so the result
# does not depend on the platform's default text encoding.
with open('data/parsed_book.json', 'rt', encoding='utf-8') as f_in:
    book_raw = json.load(f_in)

In [None]:
# Echo the loaded structure to inspect it.
# NOTE(review): this renders the entire parsed book in the cell output;
# a slice would keep the saved notebook smaller.
book_raw

In [4]:
# Flatten the book into one record per content entry that carries text.
# Each record keeps the chapter name and title of the chapter it came from.
documents = []

for chapter in book_raw:
    chapter_name, title = chapter['chapter'], chapter['title']
    documents.extend(
        {'chapter': chapter_name, 'title': title, 'text': entry['text']}
        for entry in chapter['content']
        if 'text' in entry  # entries without a 'text' key are skipped
    )


In [None]:
# Preview the first five flattened records.
documents[0:5]

### Generate ids


In [6]:
def generate_document_id(doc):
    """Return a short, deterministic identifier for a document.

    The identifier is the first 8 hexadecimal characters of the MD5 digest of
    "<chapter>-<title>-<first 10 characters of text>".

    Documents that share chapter, title and the first 10 characters of text
    therefore receive the same identifier.

    Args:
        doc: mapping with 'chapter', 'title' and 'text' keys.

    Returns:
        An 8-character lowercase hex string.
    """
    fingerprint = "{}-{}-{}".format(doc['chapter'], doc['title'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

# Attach the generated identifier to every record, in place.
for document in documents:
    document.update(id=generate_document_id(document))


In [None]:
# Spot-check a single record; after the id assignment it carries an 'id' key.
documents[1]

In [8]:
# Bucket the documents by id so that ids shared by several documents
# (collisions) show up as lists with more than one element.
hashes = defaultdict(list)

for document in documents:
    hashes[document['id']].append(document)

In [None]:
# (number of distinct ids, number of documents): a smaller first value means
# at least two documents share an id.
len(hashes), len(documents)

In [None]:
# Report every id that is shared by more than one document, with its count.
for doc_id, group in hashes.items():
    count = len(group)
    if count <= 1:
        continue
    print(doc_id, count)

In [None]:
# Inspect the documents stored under one specific id.
# `hashes` is a defaultdict, so looking up an id that is not present returns
# (and inserts) an empty list instead of raising KeyError.
hashes['005a5773']

In [12]:
# Persist the id-annotated records as pretty-printed JSON (2-space indent).
with open('data/documents_with_ids.json', 'wt') as sink:
    json.dump(documents, sink, indent=2)

### Ollama

In [19]:
# Create an Ollama client; no arguments are passed, so the library's default
# connection settings apply.
client = ollama.Client()

In [20]:
# Prompt template for generating five interview-style questions per record.
# It contains the placeholders {section}, {question} and {text}, which must be
# substituted (e.g. with str.format) before the prompt is sent to a model.
# The model is asked to reply with a bare JSON array of question strings.
# .strip() removes the leading and trailing newline of the triple-quoted text.
prompt = """
You emulate a data scientist who's studying technical questions to prepare technical interviews.
Formulate 5 questions this data scientist might ask based on a machine learning interviews book record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
# Smoke-test the client with one free-form prompt (Spanish for
# "What is machine learning?").
# NOTE(review): the model tag here is "llama" while query_documents uses
# "llama2" — confirm which model is actually available.
response = client.generate(model="llama", prompt="¿Qué es el aprendizaje automático?")

In [None]:

def _render_prompt(prompt, doc):
    """Build the user message for one document.

    If `prompt` contains a `{text}` placeholder it is treated as a template and
    filled with the document's chapter (`{section}`), title (`{question}`) and
    text (`{text}`). Otherwise the document text is appended after the prompt,
    separated by a single space.
    """
    if "{text}" in prompt:
        # NOTE: str.format treats every brace pair as a field, so a template
        # with other literal braces must escape them as '{{' and '}}'.
        return prompt.format(
            section=doc.get("chapter", ""),
            question=doc["title"],
            text=doc["text"],
        )
    return f"{prompt} {doc['text']}"


def query_documents(documents, prompt):
    """Send one chat request per document and collect the model's replies.

    Args:
        documents: iterable of dicts with at least 'title' and 'text' keys;
            'chapter' is used for the `{section}` placeholder when present.
        prompt: either a template containing `{section}`, `{question}` and
            `{text}` placeholders, or a plain instruction to which the
            document text is appended.

    Returns:
        A list of {"title": ..., "response": ...} dicts, one per document and
        in the same order, where "response" is the text of the model's reply.
    """
    responses = []
    for doc in documents:
        response = client.chat(
            model="llama2",
            messages=[{"role": "user", "content": _render_prompt(prompt, doc)}],
        )
        responses.append({
            "title": doc["title"],
            # A chat reply carries its text under message.content; the
            # top-level 'response' field only exists on generate() replies.
            "response": response["message"]["content"],
        })
    return responses

# Run the prompt over every document; query_documents issues one chat request
# per document, so this cell's cost grows with len(documents).
results = query_documents(documents, prompt)

# Print each document's title followed by the model's reply.
for result in results:
    print(f"Title: {result['title']}\nResponse: {result['response']}\n")

In [None]:
import ollama

# Generate a response from a prompt using the module-level `generate` function
# (the prompt is Spanish for "What is machine learning?").
response = ollama.generate(model="llama", prompt="¿Qué es el aprendizaje automático?")
print(response)


In [None]:
import ollama
# List the names exposed by the ollama module (API exploration).
print(dir(ollama))
