In [41]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
import getpass

In [15]:
# Read OPENAI_API_KEY from a local .env file first. load_dotenv() is a no-op when
# no .env file is found and, by default, leaves already-set variables untouched.
load_dotenv()

# Fall back to an interactive prompt so the key is never hard-coded in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [18]:
# Download the Y Combinator Wikipedia article as LangChain documents.
# The SoupStrainer is constructed without any criteria, so no tag-level
# filtering is requested from BeautifulSoup here.
loader = WebBaseLoader(
    web_paths=("https://en.wikipedia.org/wiki/Y_Combinator",),
    bs_kwargs={"parse_only": bs4.SoupStrainer()},
)
docs = loader.load()

In [20]:
# Break the loaded page into overlapping chunks (size 1000, overlap 200) ...
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# ... then embed every chunk with OpenAI embeddings and store it in Chroma.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

In [25]:
from langchain_openai import ChatOpenAI

In [26]:
# Chat model used as the generation step of the RAG chain built further down.
llm = ChatOpenAI(model="gpt-4o-mini")

In [21]:
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [22]:
# Sample question for ad-hoc experiments. It is kept under its own name because
# `prompt` is bound to the PromptTemplate used by the RAG chain; storing a plain
# string under that name first was a dead assignment and made `prompt` refer to
# two different kinds of object depending on which cells had run.
example_question = "What is Y Combinator?"

In [30]:
# Wrap the vector store as a retriever so it can be piped into the chain below.
retriever = vectorstore.as_retriever()

In [62]:
from langchain.prompts import PromptTemplate

In [64]:
# Prompt for the RAG chain. It has two placeholders: {context} receives the
# formatted retrieved chunks and {question} receives the user's question.
# NOTE: the literal is used verbatim, so the four-space indentation of each line
# is part of the text sent to the model.
prompt = PromptTemplate.from_template(
    """
    Use the following pieces of context to answer the question at the end. If you 
    don't know the answer, just say that you don't know, don't try to make up an 
    answer.

    {context}

    Question: {question}
    Answer:
    """
)

In [65]:
# Assemble the chain: the incoming question is fanned out to (a) the retriever,
# whose hits are flattened to text by format_docs, and (b) a passthrough that
# forwards the question unchanged; both feed the prompt, then the LLM, and the
# model's reply is finally reduced to a plain string.
rag_chain = (
    RunnableParallel(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [68]:
# Run the chain end to end on a sample query and print the model's answer.
print(rag_chain.invoke("Key Topics, return in bullet points"))

- Pension funds
- Insurance companies
- Fund of funds
- Endowments
- Foundations
- Investment banks
- Merchant banks
- Commercial banks
- High-net-worth individuals
- Family offices
- Sovereign wealth funds
- Crowdfunding
- Related financial terms (AUM, Cap table, Capital call, Capital commitment, Capital structure, Distribution waterfall, EBITDA, Envy ratio, High-yield debt, IPO, IRR, Leverage, Liquidation preference, M&A, PME, Taxation of private equity and hedge funds, Undercapitalization, Vintage year)
- Startups and companies from Y Combinator (e.g., Pebble, Coinbase, DoorDash, GitLab, etc.)
- Timeline of Y Combinator companies by year (2012, 2013, 2014–2015, 2016–2018, 2019–2020, 2021–2022)


In [71]:
import os
import uuid

import bs4
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load environment variables (e.g. the OpenAI API key) from a local .env file, if one exists.
load_dotenv()

class RAGPipeline:
    """Retrieval-augmented question answering over a single web page.

    Constructing an instance performs the whole setup: it creates the chat
    model, downloads ``web_path``, splits the page into overlapping chunks,
    embeds the chunks into a Chroma vector store, builds the prompt, and wires
    retriever -> prompt -> LLM -> string parser into ``self.rag_chain``.
    Afterwards :meth:`run` answers questions against that page.

    Args:
        web_path: URL of the page to load and index.
        model_name: Model identifier passed to ``ChatOpenAI``.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to the same splitter.

    Attributes:
        collection_name: Name of the Chroma collection backing this instance's
            index (regenerated on every ``load_and_index_documents`` call).
        vectorstore, retriever, llm, prompt, rag_chain: The LangChain components
            created during setup.
    """

    def __init__(self, web_path, model_name="gpt-4o-mini", chunk_size=1000, chunk_overlap=200):
        self.web_path = web_path
        self.model_name = model_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Declared up front so the full attribute set is visible in one place;
        # each is filled in by the matching setup step below.
        self.collection_name = None
        self.vectorstore = None
        self.llm = None
        self.retriever = None
        self.prompt = None
        self.rag_chain = None

        # Order matters: build_rag_chain() reads llm, retriever and prompt.
        self.llm = self.initialize_llm()
        self.load_and_index_documents()
        self.setup_prompt_template()
        self.build_rag_chain()

    def initialize_llm(self):
        """Create and return the chat model named by ``self.model_name``."""
        return ChatOpenAI(model=self.model_name)

    def load_and_index_documents(self):
        """Download the page, chunk it, embed it, and expose it as a retriever.

        Sets ``self.collection_name``, ``self.vectorstore`` and
        ``self.retriever``. Performs network I/O (page download and embedding
        requests).
        """
        # The SoupStrainer has no criteria, so no tag-level filtering is requested.
        loader = WebBaseLoader(
            web_paths=(self.web_path,),
            bs_kwargs={"parse_only": bs4.SoupStrainer()},
        )
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        splits = text_splitter.split_documents(docs)

        # Give every build its own collection. Without an explicit name,
        # Chroma.from_documents uses one default collection, so indexes created
        # in the same kernel (another RAGPipeline, an earlier notebook cell, or a
        # second call to this method) would pile into the same store and
        # retrieval would return chunks from other pages or duplicated chunks.
        # "rag-" + 32 hex characters stays within Chroma's 3-63 character limit.
        self.collection_name = f"rag-{uuid.uuid4().hex}"
        self.vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=OpenAIEmbeddings(),
            collection_name=self.collection_name,
        )

        self.retriever = self.vectorstore.as_retriever()

    def setup_prompt_template(self):
        """Build the prompt with ``{context}`` and ``{question}`` placeholders.

        The template literal is used verbatim, so the indentation of each line
        is part of the text sent to the model.
        """
        self.prompt = PromptTemplate.from_template(
            """
            Use the following pieces of context to answer the question at the end. If you 
            don't know the answer, just say that you don't know, don't try to make up an 
            answer.

            {context}

            Question: {question}
            Answer:
            """
        )

    def format_docs(self, docs):
        """Join the ``page_content`` of ``docs`` with blank lines.

        Args:
            docs: Iterable of objects exposing a ``page_content`` string.

        Returns:
            One string with the documents' text separated by ``"\\n\\n"``;
            an empty string when ``docs`` is empty.
        """
        return "\n\n".join(doc.page_content for doc in docs)

    def build_rag_chain(self):
        """Compose retriever, prompt, LLM and output parser into ``self.rag_chain``.

        The incoming question is sent both to the retriever (whose results are
        flattened by :meth:`format_docs`) and, unchanged, to the prompt's
        ``question`` slot.
        """
        self.rag_chain = (
            RunnableParallel({"context": self.retriever | self.format_docs, "question": RunnablePassthrough()})
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

    def run(self, question):
        """Answer ``question`` using the indexed page.

        Args:
            question: The question text passed to the chain.

        Returns:
            Whatever ``self.rag_chain.invoke`` returns; the chain ends in a
            ``StrOutputParser``.
        """
        return self.rag_chain.invoke(question)

In [72]:
# Demo: index the Y Combinator Wikipedia article, then ask the pipeline for
# the page's key topics and print the answer.
pipeline = RAGPipeline(
    web_path="https://en.wikipedia.org/wiki/Y_Combinator",
)
result = pipeline.run("Key Topics, return in bullet points")
print(result)

- Pension funds
- Insurance companies
- Fund of funds
- Endowments
- Foundations
- Investment banks
- Merchant banks
- Commercial banks
- High-net-worth individuals
- Family offices
- Sovereign wealth funds
- Crowdfunding
- Related financial terms (AUM, cap table, capital call, capital commitment, capital structure, distribution waterfall, EBITDA, envy ratio, high-yield debt, IPO, IRR, leverage, liquidation preference, M&A, PME, taxation of private equity and hedge funds, undercapitalization, vintage year)
- Startups and tech companies (e.g., Pebble, Coinbase, DoorDash, GitLab, etc.)
- Timeline of company founding years (2012, 2013, 2014-2015, 2016-2018, 2019-2020, 2021-2022) 
- Notable platforms and services (e.g., Hacker News, MiraclePlus)
