# Testing Hypothesis Researcher

In [1]:
# logging imports
import logging
from logging import StreamHandler

# essential kruppe imports
from kruppe.llm import OpenAILLM

# toolkit import
from kruppe.llm import OpenAIEmbeddingModel
from kruppe.functional.docstore.mongo_store import MongoDBStore
from kruppe.functional.rag.vectorstore.chroma import ChromaVectorStore
from kruppe.functional.rag.index.vectorstore_index import VectorStoreIndex
from kruppe.functional.rag.retriever.simple_retriever import SimpleRetriever
from kruppe.functional.ragquery import RagQuery
from kruppe.functional.llmquery import LLMQuery
from kruppe.functional.newshub import NewsHub
from kruppe.functional.finhub import FinHub
from kruppe.data_source.news.nyt import NewYorkTimesData
from kruppe.data_source.news.ft import FinancialTimesData
from kruppe.data_source.news.newsapi import NewsAPIData
from kruppe.data_source.finance.yfin import YFinanceData

# hypothesis researcher import
from kruppe.algorithm.hypothesis import HypothesisResearcher

In [2]:
# Set up logging for the notebook.
#
# Layout:
#   kruppe                     -> console (INFO and above)
#   kruppe.llm                 -> llm.log only (does not propagate to console)
#   kruppe.data_source.scraper -> scraper.log only (does not propagate to console)

# Make the cell idempotent: re-running it used to attach a second set of
# handlers to the same loggers, which duplicated every console line and
# leaked the previously opened log files. Detach and close whatever is
# currently attached to the loggers this cell configures before rebuilding.
for _logger_name in ("kruppe", "kruppe.llm", "kruppe.data_source.scraper"):
    _configured = logging.getLogger(_logger_name)
    for _old_handler in list(_configured.handlers):
        _configured.removeHandler(_old_handler)
        _old_handler.close()

# One formatter shared by every handler.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Console handler for the notebook output.
ch = StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

# File handler for llm logs. mode='w' truncates the file when the handler
# opens it, so each run of this cell starts with an empty log.
log_file_path = "/Users/danielliu/Workspace/fin-rag/logs/llm.log"
fh_llm = logging.FileHandler(log_file_path, mode='w')
fh_llm.setFormatter(formatter)
fh_llm.setLevel(logging.DEBUG)

# File handler for scraper logs, truncated the same way.
log_file_path = "/Users/danielliu/Workspace/fin-rag/logs/scraper.log"
fh_scraper = logging.FileHandler(log_file_path, mode='w')
fh_scraper.setFormatter(formatter)
fh_scraper.setLevel(logging.DEBUG)

# Package-level logger: everything under "kruppe" goes to the console
# unless a child logger below turns propagation off.
root_logger = logging.getLogger("kruppe")
root_logger.setLevel(logging.INFO)
root_logger.addHandler(ch) # log to console

# NOTE: the logger level is INFO, so DEBUG records are dropped before they
# reach fh_llm even though the handler itself accepts DEBUG.
logger = logging.getLogger("kruppe.llm")
logger.setLevel(logging.INFO)
logger.addHandler(fh_llm) # log llm output to a file instead of console.
logger.propagate = False # prevent logging from propagating to the root logger

logger_scraper = logging.getLogger("kruppe.data_source.scraper")
logger_scraper.setLevel(logging.DEBUG)
logger_scraper.addHandler(fh_scraper) # log scraper output to a file instead of console.
logger_scraper.propagate = False

## Initializing Toolkit

Set up document store and vector store

In [3]:
reset_db=False

db_name = "kruppe_librarian"
collection_name = "general_news_04_20_2025"

# Create doc store
unique_indices = [['title', 'datasource']] # NOTE: this is important to avoid duplicates
docstore = await MongoDBStore.acreate_db(
    db_name=db_name,
    collection_name=collection_name,
    unique_indices=unique_indices,
    reset_db=reset_db
)

# Create vectorstore index
embedding_model = OpenAIEmbeddingModel()
vectorstore = ChromaVectorStore(
    embedding_model=embedding_model,
    collection_name=collection_name,
    persist_path='/Volumes/Lexar/Daniel Liu/vectorstores/kruppe_librarian'
)

if reset_db:
    vectorstore.clear()
    
index = VectorStoreIndex(vectorstore=vectorstore)

retriever = SimpleRetriever(index=index)

In [4]:
# Report the current size of each store.
print(f"Number of documents: {docstore.size()}")
print(f"Number of chunks: {vectorstore.size()}")

Number of documents: 1078
Number of chunks: 8820


In [5]:
# Query engines: the RAG engine receives the retriever plus its own LLM
# client; the plain LLM engine receives only an LLM client.
rag_query_engine = RagQuery(retriever=retriever, llm=OpenAILLM())
llm_query_engine = LLMQuery(llm=OpenAILLM())

# News hub over three news data sources. The NYT and FT sources read
# request headers from JSON files given by relative path.
news_sources = [
    NewYorkTimesData(headers_path="../../.nyt-headers.json"),
    FinancialTimesData(headers_path="../../.ft-headers.json"),
    NewsAPIData(),
]
news_hub = NewsHub(news_sources=news_sources)

# Finance hub backed by the yfinance data source and its own LLM client.
fin_hub = FinHub(fin_source=YFinanceData(), llm=OpenAILLM())

In [6]:
# Tools exposed to the researcher, grouped by where they come from.
query_tools = [
    rag_query_engine.rag_query,
    llm_query_engine.llm_query,
]

# news_hub.news_recent and news_hub.news_archive are currently disabled.
news_tools = [
    news_hub.news_search,
]

finance_tools = [
    fin_hub.get_company_background,
    fin_hub.get_company_income_stmt,
    fin_hub.get_company_balance_sheet,
    fin_hub.analyze_company_financial_stmts,
]

# Flattened in the same order as the groups above.
toolkit = [*query_tools, *news_tools, *finance_tools]

## Hypothesis Researcher

Also referred to informally as the `Research Tree`.

### Initialization

Inputs

In [7]:
# Research question passed to the hypothesis researcher.
query = (
    "How has Tesla's performance changed over the last year? (2025)"
)

Values that should be generated by other agentic components (hard-coded here for testing)

In [8]:
# Placeholder inputs: in the full pipeline these values would be produced
# by other agentic components; they are hard-coded here.

# Role the researcher is asked to play.
expert_title = "Regulatory Affairs Specialist in Automotive and Energy Sectors"

# Description of that role (adjacent string literals are concatenated
# into one string).
expert_desc = (
    "The Regulatory Affairs Specialist understands the complex regulatory "
    "frameworks impacting electric vehicle manufacturers and clean energy "
    "companies worldwide. This expert assesses how changing laws and policies "
    "influence Tesla’s operations and strategy."
)

# Background report given to the researcher alongside the query.
background = """preliminary background report on tesla, inc.

1. company overview
tesla, inc. (tsla) designs, develops, manufactures, leases, and sells electric vehicles, as well as energy generation and storage systems in the united states, china, and internationally. founded in 2003 and headquartered in austin, texas, tesla operates through two main segments: automotive (electric vehicles and related services) and energy generation and storage (solar energy and battery products). as of the latest data, tesla employs over 125,000 people globally.

2. recent news
a targeted search for recent news about tesla over the past 30 days did not return any relevant articles in the available news databases. this may be due to limited coverage or the scope of the datasets searched.

3. financial overview & analysis (past 3 years)
- tesla saw modest revenue growth in 2024, but profitability margins contracted amid rising operating expenses and a significant tax charge.
- the company maintains a strong liquidity position, highlighted by increasing cash and short-term investments, and current and quick ratios above industry norms.
- active capital investments point to ongoing capacity and innovation expansion, underpinned by consistent r&d spending.
- although long-term debt rose, tesla's debt-to-equity ratio and earnings power suggest manageable financial risk.
- operationally, inventory turnover improved, while receivables turnover fell, highlighting a potential area for closer management.
- overall, tesla demonstrates balanced growth, innovation, and financial stability, though it faces short-term challenges in profitability.

this report provides a concise snapshot of tesla’s business, recent developments, and financial health as of 2024.
"""

Initialize the researcher

In [9]:
# LLM client used by the researcher.
hyp_llm = OpenAILLM(model="gpt-4.1-mini")

# Researcher settings collected in one place, then unpacked as keyword
# arguments into the constructor.
researcher_settings = dict(
    role=expert_title,
    role_description=expert_desc,
    max_degree=2,
    max_depth=10,
    llm=hyp_llm,
    toolkit=toolkit,
)
hyp_researcher = HypothesisResearcher(**researcher_settings)



### Init Starting Node

In [10]:
# Build the starting node of the research from the query and the
# background report (awaited: init_starting_node is a coroutine).
root_node = await hyp_researcher.init_starting_node(query, background)


In [11]:
# Show the starting node's fields as a dict (rendered as the cell output).
root_node.model_dump()

{'step': 0,
 'act_queued': False,
 'is_leaf': False,
 'd_time': None,
 'f_time': None,
 'parent': None,
 'children': [],
 'status': <Status.UNDISCOVERED: 0>}

### DFS Search

In [12]:
# Run the DFS research from the starting node for the given query, then
# print the text of the response. Progress and tool calls are emitted
# to the cell output while this runs.
response = await hyp_researcher.dfs_research(root_node, query)
print(response.text)

Discovering node: Node(step=0, is_leaf=False, d_time=1, f_time=None)
Discovering node: Node(step=1, is_leaf=False, d_time=2, f_time=None)
Tool call: rag_query ({"query":"Tesla 2025 quarterly financial reports revenue margin expenses taxes production sales volumes","end_date":"2025-04-20","start_date":"2025-01-01"})
Discovering node: Node(step=2, is_leaf=False, d_time=3, f_time=None)
Tool call: news_search ({"query":"Tesla 2025 financial performance revenue profit margin expenses tax incentives regulations","max_results":10,"sort":"date"})


2025-04-20 17:27:50,849 - INFO - Fetched 1 documents from NewsAPI API... Attempting to scrape.
2025-04-20 17:27:51,219 - INFO - NYT sleeping for 13 seconds to avoid hitting the rate limit
2025-04-20 17:27:52,176 - INFO - Fetched 17 links from Financial Times... Attempting to scrape.
2025-04-20 17:28:04,224 - INFO - Fetched 10 documents from New York Times API... Attempting to scrape.


Discovering node: Node(step=3, is_leaf=False, d_time=4, f_time=None)
Tool call: rag_query ({"query":"Tesla 2025 financial results management commentary margin pressure regulatory subsidies tax impact operational efficiency","end_date":"2025-04-20","start_date":"2025-01-01"})
Discovering node: Node(step=4, is_leaf=False, d_time=5, f_time=None)
Tool call: rag_query ({"query":"Tesla 2025 production capacity manufacturing volumes sales volumes","end_date":"2025-04-20","start_date":"2025-01-01"})
Discovering node: Node(step=5, is_leaf=False, d_time=6, f_time=None)
Tool call: rag_query ({"query":"2025 regulatory changes EV incentives subsidies tariffs taxes US China EU UK","end_date":"2025-04-20","start_date":"2025-01-01"})
Discovering node: Node(step=6, is_leaf=False, d_time=7, f_time=None)
Tool call: rag_query ({"query":"Tesla 2025 operational efficiency tax strategies regulatory adaptation management commentary","end_date":"2025-04-20","start_date":"2025-01-01"})
Discovering node: Node(st