# Populate RAG Index

Run this notebook to pre-populate the MongoDB document store and the Chroma vector store index with recent news articles, so that less online scraping is needed later.

In [1]:
# logging imports
import logging
from logging import StreamHandler

# toolkit import
from kruppe.llm import OpenAIEmbeddingModel
from kruppe.functional.docstore.mongo_store import MongoDBStore
from kruppe.functional.rag.vectorstore.chroma import ChromaVectorStore
from kruppe.functional.rag.index.vectorstore_index import VectorStoreIndex
from kruppe.functional.newshub import NewsHub
from kruppe.data_source.news.nyt import NewYorkTimesData
from kruppe.data_source.news.ft import FinancialTimesData
from kruppe.data_source.news.newsapi import NewsAPIData

# ---- Logging configuration ----
# The handlers attached below carry fixed names so that re-running this cell
# replaces them instead of stacking duplicates (each duplicate would repeat
# every log line and keep another handle on the log file open).
CONSOLE_HANDLER_NAME = "notebook-console"
SCRAPER_FILE_HANDLER_NAME = "notebook-scraper-file"


def _detach_named_handler(logger, handler_name):
    """Remove and close every handler on `logger` whose name is `handler_name`."""
    for stale in [h for h in logger.handlers if h.get_name() == handler_name]:
        logger.removeHandler(stale)
        stale.close()


formatter = logging.Formatter('%(asctime)s - %(filename)-10s - %(levelname)-8s: %(message)s')

# package-level logger: parent of every logger named "kruppe.*"
# (this is not the Python root logger, despite the variable name)
root_logger = logging.getLogger("kruppe")
root_logger.setLevel(logging.INFO)

# scraper logger
scraper_logger = logging.getLogger('kruppe.data_source.scraper')
scraper_logger.setLevel(logging.DEBUG)
# propagate is left at its default (True), so scraper records also reach the
# handlers of the "kruppe" logger; the console handler's INFO level filters
# the DEBUG records out of the console.

# drop handlers left over from an earlier run of this cell before making new ones
_detach_named_handler(root_logger, CONSOLE_HANDLER_NAME)
_detach_named_handler(scraper_logger, SCRAPER_FILE_HANDLER_NAME)

# console handler
ch = StreamHandler()
ch.set_name(CONSOLE_HANDLER_NAME)
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)
root_logger.addHandler(ch)

# file handler for scraper
# NOTE(review): absolute, machine-specific path; the parent directory must already exist.
scraper_log_file_path = '/Users/danielliu/Workspace/fin-rag/logs/scraper.log'
# mode='w' creates the file if it doesn't exist and clears it if it does
fh = logging.FileHandler(scraper_log_file_path, mode='w')
fh.set_name(SCRAPER_FILE_HANDLER_NAME)
fh.setFormatter(formatter)
fh.setLevel(logging.DEBUG)
scraper_logger.addHandler(fh)

In [2]:
reset_db=True

db_name = "kruppe_librarian"
collection_name = "general_news_04_20_2025"

# Create doc store
unique_indices = [['title', 'datasource']] # NOTE: this is important to avoid duplicates
docstore = await MongoDBStore.acreate_db(
    db_name=db_name,
    collection_name=collection_name,
    unique_indices=unique_indices,
    reset_db=reset_db
)

# Create vectorstore index
embedding_model = OpenAIEmbeddingModel()
vectorstore = ChromaVectorStore(
    embedding_model=embedding_model,
    collection_name=collection_name,
    persist_path='/Volumes/Lexar/Daniel Liu/vectorstores/kruppe_librarian'
)

if reset_db:
    vectorstore.clear()
    
index = VectorStoreIndex(vectorstore=vectorstore)



In [3]:
# News sources combined by the hub. The NYT and FT sources are given relative
# header-file paths (presumably resolved against the kernel's working
# directory); NewsAPIData is built with its defaults.
news_sources = [
    NewYorkTimesData(headers_path="../../.nyt-headers.json"),
    FinancialTimesData(headers_path="../../.ft-headers.json"),
    NewsAPIData(),
]
news_hub = NewsHub(news_sources=news_sources)

In [4]:
df_news, docs = await news_hub.news_recent(
    days=100,
    max_results=500,
)

2025-04-20 14:21:18,798 - ft.py      - INFO    : Fetching news feed from 2025-01-10 to 2025-04-20
2025-04-20 14:21:19,190 - newsapi.py - INFO    : Fetched 115 documents from NewsAPI API... Attempting to scrape.
2025-04-20 14:21:21,103 - nyt.py     - INFO    : NYT sleeping for 13 seconds to avoid hitting the rate limit
2025-04-20 14:21:27,883 - ft.py      - INFO    : Fetched 500 links from Financial Times... Attempting to scrape.
2025-04-20 14:21:37,398 - nyt.py     - INFO    : NYT sleeping for 13 seconds to avoid hitting the rate limit
2025-04-20 14:21:51,009 - nyt.py     - INFO    : NYT sleeping for 13 seconds to avoid hitting the rate limit
2025-04-20 14:22:05,746 - nyt.py     - INFO    : NYT sleeping for 13 seconds to avoid hitting the rate limit
2025-04-20 14:22:20,518 - nyt.py     - INFO    : NYT sleeping for 13 seconds to avoid hitting the rate limit
2025-04-20 14:22:35,171 - nyt.py     - INFO    : NYT sleeping for 13 seconds to avoid hitting the rate limit
2025-04-20 14:22:48,18

In [5]:
print(f"Found {len(docs)} documents")

saved_docs = await docstore.asave_documents(docs)
print(f"Saved {len(saved_docs)} documents to MongoDB")

await index.async_add_documents(saved_docs)
print("Finished adding documents to vectorstore index")


Found 1084 documents




Saved 1078 documents to MongoDB


2025-04-20 14:23:02,558 - chroma.py  - INFO    : Inserting documents into ChromaDB


InternalError: ValueError: Batch size of 8820 is greater than max batch size of 5461