In [None]:
pip install -U langchain langchain-community faiss-cpu langchain-ollama python-dotenv docling langchain-docling

In [2]:
# Environment setup: process-wide flags and .env loading.
# Keep this cell ahead of the library imports in the next cell so the
# environment variable below is already set when those libraries load.
from dotenv import load_dotenv
import os
import warnings
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" error
# that can occur when faiss is loaded next to another OpenMP-linked library — confirm.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Installs a blanket "ignore" filter, so all Python warnings (deprecations
# included) are hidden from here on.
warnings.filterwarnings("ignore")
# Load variables from a local .env file; as the last expression of the cell,
# its return value is what the cell displays.
load_dotenv()

True

In [3]:
import faiss

from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from langchain_text_splitters import MarkdownHeaderTextSplitter

from langchain_ollama import ChatOllama, OllamaEmbeddings

from docling.document_converter import DocumentConverter

### Document conversion

In [4]:
def load_and_convert_document(file_path):
    """Convert the document at ``file_path`` to Markdown text using Docling.

    Args:
        file_path: Location of the source document; handed unchanged to
            ``DocumentConverter.convert``.

    Returns:
        The converted document exported as a Markdown string.
    """
    conversion = DocumentConverter().convert(file_path)
    document = conversion.document
    return document.export_to_markdown()

# Path to the PDF to ingest, relative to the notebook's working directory.
source = "./rag-dataset/goog-10-q-q3-2024.pdf"
# Convert the PDF to markdown; later cells split and index this text.
markdown_content = load_and_convert_document(source)

In [5]:
# Preview the first 1000 characters of the converted markdown.
print(markdown_content[:1000])

## UNITED STATES

## SECURITIES AND EXCHANGE COMMISSION

Washington, D.C. 20549

\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_

## FORM 10-Q

\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_

(Mark One)

- ☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the quarterly period ended September 30, 2024

OR

- ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the transition period from \_\_\_\_\_\_\_ to \_\_\_\_\_\_\_

Commission file number: 001-37580

\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_

## Alphabet Inc.


### Splitting markdown content into chunks

In [6]:
# Splitting markdown content into chunks
def get_markdown_splits(markdown_content):
    """Split markdown text into chunks at level 1-3 headings.

    Each heading level is mapped to a metadata key ("Header 1" .. "Header 3").
    The splitter is created with ``strip_headers=False``, so the heading line
    stays inside the chunk text as well.

    Args:
        markdown_content: The markdown text to split.

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` produces for the
        input (one document per heading-delimited section).
    """
    heading_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(heading_levels, strip_headers=False)
    header_chunks = splitter.split_text(markdown_content)
    return header_chunks


# Header-delimited chunks of the converted document; these are what get embedded and indexed.
chunks = get_markdown_splits(markdown_content)

In [7]:
# Spot-check the third chunk; its heading line is part of the text because the
# splitter is created with strip_headers=False.
print(chunks[2].page_content)

## FORM 10-Q  
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_  
(Mark One)  
- ☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934  
For the quarterly period ended September 30, 2024  
OR  
- ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934  
For the transition period from \_\_\_\_\_\_\_ to \_\_\_\_\_\_\_  
Commission file number: 001-37580  
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_


### Embedding and vector store setup

In [12]:
# Embedding and vector store setup
def setup_vector_store(chunks, embedding_model='nomic-embed-text', base_url="http://localhost:11434"):
    """Embed ``chunks`` with an Ollama embedding model and index them in FAISS.

    Args:
        chunks: Documents to index (e.g. the output of ``get_markdown_splits``).
            May be empty, in which case an empty store is returned.
        embedding_model: Name of the Ollama embedding model to use.
        base_url: Base URL of the Ollama server that serves the model.

    Returns:
        A ``FAISS`` vector store backed by an ``InMemoryDocstore`` and a flat
        L2 index (``faiss.IndexFlatL2``) sized to the model's embedding width.
    """
    embeddings = OllamaEmbeddings(model=embedding_model, base_url=base_url)

    # The FAISS index must be created with the vector dimensionality, which
    # depends on the model; embed a throwaway query to discover it.
    dimension = len(embeddings.embed_query("What's the embedding size?"))

    vector_store = FAISS(
        embedding_function=embeddings,
        index=faiss.IndexFlatL2(dimension),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Nothing to embed or add for an empty input; skipping avoids sending an
    # empty batch to the embedding server and the index.
    if chunks:
        vector_store.add_documents(documents=chunks)
    return vector_store

In [13]:
# Build the FAISS index over all chunks (every chunk is embedded via the local Ollama server).
vector_store = setup_vector_store(chunks)
# Set up the retriever: plain similarity search returning the 10 closest chunks per query.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={'k': 10})

In [14]:
# Sanity check: the number of indexed vectors should equal the number of chunks.
vector_store.index.ntotal, len(chunks)

(159, 159)

In [15]:
# Smoke-test the retriever.
# NOTE(review): this query does not appear to relate to the 10-Q filing indexed
# above (it looks left over from a different dataset), so the hits are merely the
# nearest neighbours of an off-topic query — confirm or replace with a relevant one.
docs = retriever.invoke('Call for applications to the Second Cycle Degree Programmes:')
docs

[Document(id='5b5c455a-5c65-492f-a486-3b0d8e8dc5e7', metadata={'Header 2': 'PART II. OTHER INFORMATION'}, page_content='## PART II. OTHER INFORMATION'),
 Document(id='e1c4d6c7-9a5f-4c3c-92a9-ea3075fde3d4', metadata={'Header 2': 'Prior Period Reclassifications'}, page_content='## Prior Period Reclassifications  \nCertain amounts in prior periods have been reclassified to conform with current period presentation.'),
 Document(id='429d62bb-7b39-4e64-bc1c-2eccad7bd23e', metadata={'Header 2': "ITEM 2. MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS"}, page_content='## ITEM 2. MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS  \nPlease read the following discussion and analysis of our financial condition and results of operations together with  "Note  About  Forward-Looking  Statements"  and  our  consolidated  financial  statements  and  related  notes included under Item 1 of this Quarterly Report on Form 10-Q as we

### Formatting documents for RAG

In [16]:
# Formatting documents for RAG
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    texts = []
    for document in docs:
        texts.append(document.page_content)
    return separator.join(texts)

# Join the retrieved documents the same way the RAG chain does before filling its prompt.
content = format_docs(docs)

In [17]:
# Inspect the formatted context text.
print(content)

## PART II. OTHER INFORMATION

## Prior Period Reclassifications  
Certain amounts in prior periods have been reclassified to conform with current period presentation.

## ITEM 2. MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS  
Please read the following discussion and analysis of our financial condition and results of operations together with  "Note  About  Forward-Looking  Statements"  and  our  consolidated  financial  statements  and  related  notes included under Item 1 of this Quarterly Report on Form 10-Q as well as our Annual Report on Form 10-K for the fiscal year ended December 31, 2023, including Part I, Item 1A "Risk Factors," as updated in our Quarterly Report on Form 10-Q for the quarter ended June 30, 2024 and in this Quarterly Report on Form 10-Q.

## Financing  
We have a short-term debt financing program of up to $10.0 billion through the issuance of commercial paper. Net proceeds from this program are used for general corporate 

### Setting up the RAG chain

In [18]:
# Setting up the RAG chain
def create_rag_chain(retriever, model_name="deepseek-r1:1.5b", base_url="http://localhost:11434"):
    """Build a retrieval-augmented question-answering chain on an Ollama chat model.

    The chain accepts a question string, retrieves context with ``retriever``,
    joins the hits with ``format_docs``, fills the prompt template, calls the
    chat model, and parses the reply into a plain string.

    Args:
        retriever: Runnable mapping a question string to a list of documents.
        model_name: Name of the Ollama chat model that generates the answer.
        base_url: Base URL of the Ollama server that serves the model.

    Returns:
        A runnable chain; use ``.invoke(question)`` or ``.stream(question)``.
    """
    prompt = """
        You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
        ### Question: {question} 
        
        ### Context: {context} 
        
        ### Answer:
    """
    llm = ChatOllama(model=model_name, base_url=base_url)
    prompt_template = ChatPromptTemplate.from_template(prompt)

    # The incoming question feeds both prompt variables: it is passed through
    # unchanged as "question" and run through the retriever (then flattened to
    # text by format_docs) as "context".
    # NOTE(review): with a reasoning model such as deepseek-r1 the reply appears
    # to include a <think> section, which the string parser does not strip.
    chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm
        | StrOutputParser()
    )
    return chain

### Putting Everything Together

In [19]:
# One-time process: builds the whole pipeline end to end.
# NOTE(review): this repeats the conversion, splitting and indexing already done
# in the cells above and rebinds markdown_content, chunks, vector_store and
# retriever — confirm the duplicate work is intended.

# Load document
# `source` is reused from the document-conversion cell above.
markdown_content = load_and_convert_document(source)
chunks = get_markdown_splits(markdown_content)

# Create vector store
vector_store = setup_vector_store(chunks)

# Setup retriever
# NOTE(review): k=1 gives the model a single chunk as context, and MMR has
# nothing to diversify when only one result is returned — confirm this is
# intended (the exploratory retriever above used similarity search with k=10).
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 1})

# Create RAG chain
rag_chain = create_rag_chain(retriever)

In [20]:
# Questions for retrieval
# Alternative questions are kept commented out for experimentation; leave
# exactly one `question` assignment active.
# question = "How much revenue is there for Google in September 2024?"
# question = "What is the net income for this quarter, and what are the key drivers contributing to its increase or decrease?"
# question = "Tell me Debt Securities estimated fair value due after 10 years? As of September 30, 2024"
question = "What are Debt Securities Total value in $"
# question = "Tell me available second cycle degree programms for this intake?"

print(f"Question: {question}")
# Stream the answer so text is printed piece by piece as the chain yields it
# (end="" keeps the pieces on one line, flush=True pushes each out immediately).
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)
# Visual separator after the streamed answer.
print("\n" + "-" * 50 + "\n")

Question: What are Debt Securities Total value in $
<think>
Alright, so I need to figure out the total value of debt securities as of September 30, 2024, based on the context provided.

First, let me look at what's given. The context mentions that as of September 30, 2024, we had $93.2 billion in cash, cash equivalents, and short-term marketable securities. These include various types like time deposits, money market funds, government bonds, corporate debt securities, mortgage-backed securities, and marketable equity securities.

Wait, but the question is specifically about debt securities, not all marketable securities. So I need to focus on the parts that are debt-related.

Looking back at the context, it doesn't mention anything directly about debt securities except for marketable equity securities. So maybe the $93.2 billion includes cash equivalents and other marketable securities but excludes debt securities like corporate debt or government debt?

Hmm, that might be a problem be