In [19]:
import os
import umap
import chromadb
import numpy as np
from pypdf import PdfReader
import matplotlib.pyplot as plt
from langchain_ollama import ChatOllama
from helper_utils import project_embeddings, word_wrap

In [20]:
reader = PdfReader("data/microsoft-annual-report.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]

#Filter the empty strings
pdf_texts = [text for text in pdf_texts if text]

In [21]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)

character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1000, chunk_overlap=0
)
character_split_texts = character_splitter.split_text("\n\n".join(pdf_texts))

In [22]:
token_splitter = SentenceTransformersTokenTextSplitter(
chunk_overlap=0, tokens_per_chunk=256
)
token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

In [23]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction()

chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection(
    "microsoft-collection", embedding_function=embedding_function
)

In [24]:
#extract the embeddings of the token_split_texts
ids = [str(i) for i in range(len(token_split_texts))]
chroma_collection.add(ids=ids, documents=token_split_texts)
count = chroma_collection.count()
print(count)

359


In [25]:
query = "What has been the investment in research and development?"

results = chroma_collection.query(
    query_texts=query, n_results=10, include=["documents", "embeddings"]
)

retrieved_documents = results["documents"][0]

for document in results["documents"][0]:
    print(word_wrap(document))
    print("")

competitive in local markets and enables us to continue to attract top talent from acro
ss the world. we plan to continue to make significant investments in a broad range of p
roduct research and development activities, and as appropriate we will coordinate our r
esearch and development across operating segments and leverage the results across the c
ompany. in addition to our main research and development operations, we also operate mi
crosoft research. microsoft research is one of the world ’ s largest corporate research
 organizations, often working in close collaboration with top universities around the w
orld, and is focused on advancing the state - of - the - art in computer science and a 
broad range of other disciplines. our investment in fundamental research provides us a 
unique perspective on future trends and contributes to our innovation. distribution, sa
les, and marketing

our success is based on our ability to create new and compelling products, services, an
d experience

In [26]:
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


In [27]:
pairs = [[query, doc] for doc in retrieved_documents]
scores = cross_encoder.predict(pairs)

print("Scores:")
for score in scores:
    print(score)

Scores:
3.4474525
-0.26803175
-7.5625663
-5.184921
-10.452858
-9.774546
-9.665048
-0.110477045
-7.090519
-3.134781


In [28]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o + 1)

New Ordering:
1
8
2
10
4
9
3
7
6
5


- Rank 1 is the 1st item (score 3.44)

- Rank 2 is the 8th item (score -0.11)

- Rank 3 is the 2nd item (score -0.26)


In [29]:
original_query = (
    "What were the most important factors that contributed to increases in revenue?"
)

generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?",
]

In [44]:
# concatenate the original query with the generated queries
queries = [original_query] + generated_queries


results = chroma_collection.query(
    query_texts=queries, n_results=10, include=["documents", "embeddings"]
)
retrieved_documents = results["documents"]

# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents = list(unique_documents)

In [47]:
pairs = []
for doc in unique_documents:
    pairs.append([original_query, doc])

scores = cross_encoder.predict(pairs)

print("Scores:")
for score in scores:
    print(score)

Scores:
-4.8165164
-10.147166
-2.796056
-9.565954
-5.777502
-10.803806
-9.333397
-11.113978
-7.3096385
-8.258858
-10.245659
-11.02249
-8.883003
-9.409262
-10.8127
-4.823413
-6.042601
-9.144857
-4.7737303
-6.4308596
-5.3444414
-4.6324058
-5.1089973
-3.3177636


In [48]:
print ("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o)

New Ordering:
2
23
21
18
0
15
22
20
4
16
19
8
9
12
17
6
13
3
1
10
5
14
11
7


In [35]:
top_indices = np.argsort(scores)[::-1][:5]
top_documents = [unique_documents[i] for i in top_indices]

# Concatenate the top documents into a single context
context = "\n\n".join(top_documents)

In [37]:
client = ChatOllama(
    model = "llama3.1",
    temperatur="0.0",
)

In [38]:
def generate_multi_query(query, context):
    prompt = f"""
    You are a knowledgeable financial research assistant.
    Your users are inquiring an annual report.
    """
    messages = [
        {"role": "system", "content": prompt,},
        {"role": "user", "content": f"based on the following context:\n\n{context}\n\nAnswer the query: '{query}'",},
    ]
    response = client.invoke(messages)
    content = response.content
    content = content.split("\n")
    return content
    

In [39]:
res = generate_multi_query(query=original_query, context=context)
print("Final Answer:")
print(res)

Final Answer:
['Based on the provided information, the most important factors that contributed to increases in revenue for Fiscal Year 2023 compared to Fiscal Year 2022 are:', '', '1. **Intelligent Cloud Revenue Growth**: This segment saw a significant increase of $12.9 billion (17%) driven by growth in Azure and other cloud services.', '2. **Azure and Other Cloud Services Growth**: Within the Intelligent Cloud segment, Azure and other cloud services revenue grew by 29%, driven by expansion in consumption-based services.', '3. **Office 365 Commercial Growth**: Office 365 commercial saw a growth of 13%, contributing to an increase in Productivity and Business Processes revenue.', '4. **LinkedIn Revenue Growth**: LinkedIn revenue increased by 10%,', '5. **Dynamics Products and Cloud Services Growth**: Dynamics products and cloud services revenue grew by 16%, driven by Dynamics 365 growth of 24%.', '', 'These factors collectively contributed to a total revenue increase of $13.6 billion (7