# Baseline for an Open-Source Deep Research
This workshop uses a reasoning model and some tool usage, but the agent does not control any conditional execution.

# Import Libraries
We'll be using Transformers and Unsloth for handling large language models,
Milvus as the vector database, LangChain and regex for text processing, Wikipedia for external data, and json, tqdm, and typing as general Python utilities.

In [None]:
#!pip install pymilvus unsloth wikipediaapi langchain
#!pip uninstall pymilvus
!pip install pymilvus
import pymilvus
import transformers
!pip uninstall unsloth
!pip install unsloth
from unsloth import FastLanguageModel
from transformers import TextStreamer
import regex as re
!pip3 install wikipedia-api
import wikipediaapi
import json
from tqdm import tqdm
!pip install langchain_huggingface langchain_core langchain_text_splitters langchain_community langchain_milvus
from langchain_huggingface import HuggingFacePipeline as Pipeline
from langchain_huggingface import ChatHuggingFace as Chat
from langchain_core.messages import HumanMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_milvus import Milvus, Zilliz
from typing import Any, Set
import json
!pip install json-repair
from json_repair import repair_json
from typing import Any, Set
!pip uninstall -U bitsandbytes
!pip install -i https://pypi.org/simple/ bitsandbytes
import bitsandbytes

Collecting pymilvus
  Downloading pymilvus-2.5.6-py3-none-any.whl.metadata (5.7 kB)
Collecting grpcio<=1.67.1,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting python-dotenv<2.0.0,>=1.0.1 (from pymilvus)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.4.12-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Downloading pymilvus-2.5.6-py3-none-any.whl (223 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.4/223.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading grpcio-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m16.2 M


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

## Load reasoning and embedding models

In [None]:
# Load the reasoning model and its tokenizer through Unsloth, then the
# sentence-embedding model used later to build the vector store.

# We are using the 8B Llama-distilled version of DeepSeek-R1 (pre-quantized 4-bit checkpoint).
model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit"
# NOTE(review): 1028 looks like it may be a typo for 1024 — confirm the intended context length.
max_seq_length = 1028
# NOTE(review): None presumably lets the loader pick the dtype — confirm against the Unsloth docs.
dtype = None
# Load quantized 4-bit weights to reduce memory use.
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Put the model into Unsloth's inference mode (this notebook only generates, it never trains).
FastLanguageModel.for_inference(model)

# Sentence-transformer model that converts text into embedding vectors.
embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

## Helper methods
Some useful methods for chatting with the reasoning model, extracting JSON from the output, and converting the JSON into a list of strings.

In [None]:
default_system_prompt = "As a helpful assistant, your goal is to provide accurate and truthful answers to the best of your ability. You should strive to be informative and helpful in your responses."

def ask_model(prompt, system_prompt=default_system_prompt):
    """Send a single-turn chat to the reasoning model and return its decoded output.

    Generation is streamed to stdout through a ``TextStreamer`` while it runs
    (prompt and special tokens are skipped in the stream only).

    Args:
        prompt: User message; it is formatted into the ``user`` turn.
        system_prompt: Content of the ``system`` turn. Defaults to
            ``default_system_prompt``.

    Returns:
        The decoded text of the first sequence returned by ``model.generate``.
        Special tokens are not stripped from this return value.
    """
    chat = [
        # Use the caller-supplied system prompt; the parameter used to be
        # ignored in favour of the module-level default.
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{prompt}"},
    ]

    # tokenize=False yields the templated prompt as a string, so no
    # return_tensors argument is needed here; tokenization happens below.
    formatted_prompt = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    # Place the inputs on whatever device the model lives on instead of
    # hard-coding "cuda".
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    streamer = TextStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    results = model.generate(**inputs, streamer=streamer, max_new_tokens=1028)
    return tokenizer.decode(results[0].tolist())

# Matches one fenced ```json block and captures its payload. The quantifier is
# lazy so the match ends at the FIRST closing fence; a greedy match would run
# on to the last fence in the text and swallow everything in between.
json_re = re.compile(r"```json\n((?s:.)*?)\n```")


def extract_json(response):
    """Extract and parse the first fenced ```json block found in *response*.

    Args:
        response: Model output text that may contain a fenced ```json block.

    Returns:
        The parsed JSON value of the first fenced block (after passing it
        through ``repair_json``). An empty dict is returned when *response*
        is not searchable text, contains no fenced block, or the block still
        cannot be parsed after repair.
    """
    try:
        match = json_re.search(response)
    except TypeError:
        # Non-text input (e.g. None): treat as "no JSON found".
        return {}
    if match is None:
        return {}
    try:
        return json.loads(repair_json(match.group(1)))
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; unparseable payloads
        # are reported the same way as a missing block.
        return {}

def leaves(struct: Any) -> Set[Any]:
    """Collect every non-None leaf value from arbitrarily nested dicts and lists.

    Dict keys are ignored; only dict values and list items are descended into.
    Anything that is neither a dict, a list, nor None counts as a leaf.
    """
    # Ref: https://stackoverflow.com/a/59832594/
    found: Set[Any] = set()
    pending = [struct]  # explicit work stack instead of a recursive helper

    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
        elif node is not None:
            found.add(node)

    return found

## Define / refine question
Break down question into sub-questions, sub-sub-questions, and so on. Convert the main question into a report title.

In [None]:
# Example research question (kept commented out for reference):
#query = "Teach me about all the hocruxes in harry potter series, and history and details of each one."
# Workshop placeholders: replace the underscores with string values before
# running. As written they are undefined names and raise NameError.
# `query` is the research question inserted into the prompts below.
query = ____
# `page_title` is the Wikipedia page title fetched when building the vector database.
page_title = ______

In [None]:
# Ask the model to name the topic of `query`. The reply is only kept in
# `response` (and streamed to the cell output); a later cell sets `topic` by hand.
prompt = f"""What is the topic of the following question? Respond in JSON format.

Question: {query}"""

response = ask_model(prompt)

In [None]:
# Ask the model to split `query` into intermediate sub-questions, returned as JSON.
# `response` is parsed into `sub_questions` by the following cell.
prompt = f"""Break down the following question into intermediate sub-questions to approach answering it. Provide a list of intermediate sub-questions and respond with JSON format. If you cannot produce sub-question then say so. Do not directly answer the following question and only return the sub-questions in JSON format. Your answer must contain JSON.

Question: {query}"""

response = ask_model(prompt)

In [None]:
# Flatten the JSON reply into a list of its unique leaf values (the sub-questions).
# leaves() returns a set, so the resulting order is not guaranteed.
sub_questions = list(leaves(extract_json(response)))

In [None]:
# Maps each sub-question to the list of sub-sub-questions extracted for it.
# An empty list means nothing was extracted from the model's reply.
breakdown = {}

# Hardcode topic for now from output above
#topic = "The evolution of hocruxes over time and how they influence the story of harry potter, covering changes in content, character development, animation, and its role in the story development."
# Workshop placeholder: replace the underscores with the topic string before
# running; as written this is an undefined name and raises NameError.
topic = _____

# Break sub-questions into sub-sub-questions
for q in sub_questions:
    # The indentation inside the f-string is part of the prompt text sent to the model.
    prompt = f"""You are researching the follow topic. Break down the following question into intermediate sub-questions to approach answering it. Provide a list of intermediate sub-questions and respond with JSON format. If you cannot produce sub-questions then say so. Do not directly answer the following question and only return the sub-questions in JSON format. Your answer must contain JSON.

    Topic: {topic}

    Question: {q}"""

    response = ask_model(prompt)
    # Unique leaf values of the parsed JSON; order is not guaranteed (set -> list).
    sub_sub_questions = list(leaves(extract_json(response)))
    breakdown[q] = sub_sub_questions


In [None]:
# NOTE(review): at this point `response` holds the reply from the LAST iteration
# of the breakdown loop, so this overwrites `sub_questions` with that question's
# sub-sub-questions. This looks like a leftover duplicate cell — confirm and remove.
sub_questions = list(leaves(extract_json(response)))

In [None]:
# Show the sub-question -> sub-sub-questions mapping as the cell output.
breakdown

## Vector Search with RAG

### Build vector database on Wikipedia article

In [None]:
# English-language Wikipedia client; the user agent string identifies this notebook.
wiki_wiki = wikipediaapi.Wikipedia(user_agent='MilvusDeepResearchBot', language='en')
# Page object for the article named by `page_title`; its `.text` is chunked below.
page_py = wiki_wiki.page(page_title)

In [None]:
# Split the article text into chunks of up to 2000 characters, with a
# 200-character overlap between chunks for continuity.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
# One document per chunk of the Wikipedia page text.
docs = text_splitter.create_documents([page_py.text])

In [None]:
# Embed every chunk with `embeddings` and store the vectors in a local Milvus database file.
vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embeddings,
    connection_args={
        "uri": "./milvus_demo.db", # the temporary db created to store data for vector search
    },
    drop_old=True,  # Drop the old Milvus collection if it exists
    index_params={
        "metric_type": "COSINE", # Using cosine similarity for search
        "index_type": "FLAT",  # <= NOTE: Currently a bug where langchain_milvus defaults to "HNSW" index, which doesn't work with Milvus Lite
        "params": {},
    },
)

### Break down question into sub-questions, and so on.

In [None]:
# Example question (kept commented out for reference):
#q = 'How has the cast changed over time?'
# Workshop placeholder: replace the underscores with a question string before
# running; as written this is an undefined name and raises NameError.
q = ___

def question_to_header(question, topic=None):
    """Ask the model to rewrite *question* as a concise header title.

    Args:
        question: The question to turn into a header.
        topic: Optional context string; when given it is included in the
            prompt as ``Context``.

    Returns:
        One leaf value from the JSON the model returned. If no JSON value can
        be extracted from the reply, *question* itself is returned so callers
        always get a usable header.
    """
    # The prompt is built from the `question` argument; it previously
    # interpolated the module-level `q`, silently ignoring the parameter.
    if topic is None:
        prompt = (
            "Rewrite the following question as a header title. Be concise. "
            "Respond in JSON format with an escaped code block.\n\n"
            f"Question: {question}"
        )
    else:
        prompt = (
            "Rewrite the following question with given context as a header title. "
            "Be concise. Respond in JSON format with an escaped code block.\n\n"
            f"Context: {topic}\n"
            f"Question: {question}"
        )

    response = ask_model(prompt)
    candidates = list(leaves(extract_json(response)))
    # extract_json() yields {} when the reply has no usable JSON; fall back to
    # the question rather than raising IndexError on an empty list.
    return candidates[0] if candidates else question

In [None]:
# Generate a header title for `q`, passing `topic` as context.
header = question_to_header(q, topic)

# Building a Pipeline to answer our sub-questions

In [None]:
# These imports repeat ones from the first cell, so this cell can be run on its own.
from langchain_core.messages import HumanMessage
import transformers
from langchain_huggingface import HuggingFacePipeline as Pipeline
from langchain_huggingface import ChatHuggingFace as Chat

FastLanguageModel.for_inference(model) # Set up the model for inference of final report

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# return_full_text=False is set so that only newly generated text is returned.
# NOTE(review): max_new_tokens=2048 exceeds the max_seq_length=1028 the model
# was loaded with — confirm this is intended.
hf_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # device="cuda",
    # repetition_penalty=1.15,
    return_full_text=False,
    max_new_tokens=2048,
    output_scores=True,
    # use_cache=False,
    # truncation=True
)

# LangChain wrappers around the pipeline: `llm` is the plain text-completion
# interface, `chat` is the chat-model interface built on top of it.
llm = Pipeline(pipeline=hf_pipeline)
chat = Chat(llm=llm)

## Analyze
Answer (sub-)sub-questions

In [None]:
# DEBUG: Without providing context
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Define the prompt template for generating AI responses.
# It has two placeholders: {context} (retrieved text) and {question}.
PROMPT_TEMPLATE = """
You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in $question$ tags.
If you don't know the answer, just say that you don't know, don't try to make up ideas or suggestions for an answer. Answer in a single short paragraph.
$context$
{context}
$/context$

$question$
{question}
$/question$
"""

# Create a PromptTemplate instance with the defined template.
# Both placeholders used by the template are declared; "context" was
# previously missing from input_variables.
prompt = PromptTemplate(
    template=PROMPT_TEMPLATE, input_variables=["context", "question"]
)
# Convert the vector store to a retriever
retriever = vectorstore.as_retriever()

# Merge the retrieved documents into one context string
def format_docs(docs):
    """Return the ``page_content`` of each item in *docs*, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

# Define the RAG (Retrieval-Augmented Generation) chain for AI response generation.
# The chain input is the question string. It is sent to the retriever, whose
# results are joined by format_docs into "context", and is also passed through
# unchanged as "question". Both fill the prompt, which goes to the LLM; the
# output is parsed into a plain string.
# NOTE(review): the chain pipes into `llm`, not the `chat` wrapper built in the
# pipeline cell — confirm that bypassing the chat wrapper is intended.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
# Optional: print an ASCII diagram of the chain.
#!pip install grandalf
#rag_chain.get_graph().print_ascii()

In [None]:
# Prompt the RAG chain for each question.
# A sub-question with no sub-sub-questions is answered directly; otherwise each
# of its sub-sub-questions is answered.
answers = {}
questions = [q for k, v in breakdown.items() for q in ([k] if v == [] else v)]

# The progress total is the exact number of chain invocations (it used to be an
# estimate with a magic "+ 4"), and the bar is closed when the loop ends or fails.
total = len(questions)
with tqdm(total=total) as pbar:
    for q in questions:
        print(q)
        # Keep only the text after the model's reasoning trace, if one is present.
        answers[q] = rag_chain.invoke(q).split('</think>')[-1].strip()
        pbar.update(1)

In [None]:
# Show the question -> answer mapping as the cell output.
answers

In [None]:
import pickle
# Save the answers to disk so the synthesis step can be rerun without
# invoking the model again.
with open('answers.pkl', 'wb') as f:
    pickle.dump(answers, f)

## Synthesize

In [None]:
# Assemble the Markdown report as a list of fragments: the topic is the title,
# each sub-question is a level-2 heading, and each sub-sub-question a level-3
# heading followed by its answer.
report = [f'# {topic}\n\n']
for k, v in breakdown.items():
    report.append(f'## {k}\n')
    if v != []:
        for q in v:
            report.append(f'### {q}\n')
            report.append(answers[q] + '\n\n')
        continue
    # No sub-sub-questions: the sub-question itself was answered directly.
    report.append(answers[k] + '\n\n')

In [None]:
# Join the report fragments into one Markdown string and write it to disk.
md = ''.join(report)
# Write as UTF-8 explicitly: the report is built from Wikipedia-derived text
# that may contain non-ASCII characters, and the platform default encoding is
# not always UTF-8.
with open('report.md', 'w', encoding='utf-8') as f:
    # print() adds a trailing newline after the report text.
    print(md, file=f)

In [None]:
# Show the final Markdown report string as the cell output.
md