In [1]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks.tracers import ConsoleCallbackHandler

from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from langchain_community.chat_models.ollama import ChatOllama

from langchain.chains.qa_with_sources.base import BaseQAWithSourcesChain
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain.chains.graph_qa.cypher import GraphCypherQAChain

from neo4j import GraphDatabase
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from yfiles_jupyter_graphs import GraphWidget
from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget
from typing import List

import pandas as pd
from tqdm import tqdm
import time 

# environment variables
import os
import textwrap
from dotenv import load_dotenv

load_dotenv()

True

#### Define global variables

In [2]:
graphDB_doc2graph = "doc2graph"
graphDB_form10k = "form10k"
index_name_doc2graph = "vector"
index_name_form10k = "form_10k_chunks"

single_query = "What's Nvidia's revenue growth"

In [3]:
embeddings_doc2graph = OllamaEmbeddings(model="znbang/bge:large-en-v1.5-f16")
embeddings_form10k = OllamaEmbeddings(model="mxbai-embed-large:latest")

llm_rag_groq = ChatGroq(model='mixtral-8x7b-32768', temperature=0)    # mixtral-8x7b-32768, llama3-70b-8192, llama3-8b-8192
# llm_rag_groq = ChatGroq(temperature=0)    # mixtral-8x7b-32768, llama3-70b-8192, llama3-8b-8192
llm_rag_ollama = ChatOllama(model='mixtral:latest', temperature=0)     # qwen2:7b-instruct-fp16, mixtral:latest, llama3:8b-instruct-fp16

llm = llm_rag_groq

## Vector retriever 
#### Define vectorIndex for doc2graph and form10k, and similarity search function

In [4]:
vectorIndex_doc2graph = Neo4jVector.from_existing_graph(
    url=os.getenv("NEO4J_URL"), 
    username=os.getenv("NEO4J_USERNAME"), 
    password=os.getenv("NEO4J_PASSWORD"), 
    database=graphDB_doc2graph,
    embedding=embeddings_doc2graph,
    index_name=index_name_doc2graph,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

vectorIndex_form10k = Neo4jVector.from_existing_graph(
    url=os.getenv("NEO4J_URL"), 
    username=os.getenv("NEO4J_USERNAME"), 
    password=os.getenv("NEO4J_PASSWORD"), 
    database=graphDB_form10k,
    embedding=embeddings_form10k,
    index_name=index_name_form10k,
    search_type="hybrid",
    node_label="Chunk",
    text_node_properties=["text"],
    embedding_node_property="textEmbedding"
)

# Interchangable with ```vectorIndex.similarity_search```
def neo4j_vector_search(question, kg_vector, index_name, embedding, k):
  """Search for similar nodes using the Neo4j vector index"""
  vector_search_query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) yield node, score
    RETURN score, node.chunkId as chunkId, node.text AS text
  """
  similar = kg_vector.query(vector_search_query, 
                     params={ 
                      'index_name':index_name, 
                      'top_k': k,
                      'question_embedding': embedding.embed_query(question)
                      })
  return similar

In [5]:
print(vectorIndex_doc2graph.similarity_search(single_query, k=1))
print(neo4j_vector_search(single_query, vectorIndex_doc2graph, index_name_doc2graph, embeddings_doc2graph, 1))

[Document(page_content='\ntext: Table of Contents\nNVIDIA  Corporation and Subsidiaries\nNotes to the Consolidated Financial Statements\nNote 1 - Organization and Summary of Significant Accounting Policies\nOur Company\nHeadquartered in Santa Clara, California, NVIDIA  was incorporated in California in April 1993 and reincorporated in Delaware in April 1998.\nAll references to “NVIDIA,” “we,” “us,” “our” or the “Company” mean NVIDIA  Corporation and its subsidiaries.\nFiscal Year\nWe operate on a 52- or 53-week year , ending on the last Sunday in January . Fiscal years 2024, 2023 and 2022 were all 52-week years.\nPrinciples of Consolidation\nOur consolidated financial statements include the accounts of NVIDIA  Corporation and our wholly-owned subsidiaries. All intercompany balances and\ntransactions have been eliminated in consolidation.\nUse of Estimates\nThe preparation of finan cial statements in conformity with U.S. GAAP  requires management to make estimates and assumptions that a

In [6]:
print(vectorIndex_form10k.similarity_search(single_query, k=1))
print(neo4j_vector_search(single_query, vectorIndex_form10k, index_name_form10k, embeddings_form10k, 1))

[Document(page_content='\ntext: See accompanying notes to the consolidated financial statements. \n\nNVIDIA Corporation and Subsidiaries \n\nConsolidated Statements of Comprehensive Income \n\n(In millions) ##TABLE_START Year Ended Jan 28, 2024 Jan 29, 2023 Jan 30, 2022 Net income $ 29,760 &#160; $ 4,368 &#160; $ 9,752 &#160; Other comprehensive income (loss), net of tax Available-for-sale securities: Net change in unrealized gain (loss) 80 &#160; ( 31 ) ( 16 ) Reclassification adjustments for net realized gain included in net income &#8212; &#160; 1 &#160; &#8212; &#160; Net change in unrealized gain (loss) 80 &#160; ( 30 ) ( 16 ) Cash flow hedges: Net change in unrealized gain (loss) 38 &#160; 47 &#160; ( 43 ) Reclassification adjustments for net realized gain (loss) included in net income ( 48 ) ( 49 ) 29 &#160; Net change in unrealized loss ( 10 ) ( 2 ) ( 14 ) Other comprehensive income (loss), net of tax 70 &#160; ( 32 ) ( 30 ) Total comprehensive income $ 29,830 &#160; $ 4,336 &#

#### RAG retriever based on vector Search

In [7]:
# only for RetrievalQAWithSourcesChain
retriever_doc2graph = vectorIndex_doc2graph.as_retriever(k=2)
retriever_form10k = vectorIndex_form10k.as_retriever(k=2)

# vecCombined_retriever
def vecCombined_retriever(question: str) -> str:
    vecRAG_doc2graph = vectorIndex_doc2graph.similarity_search(question, k=2)
    # print(vecRAG_doc2graph)
    vecRAG_form10k = vectorIndex_form10k.similarity_search(question, k=2)
    # print(vecRAG_doc2graph)
    final_data = f"""
    vecRAG_doc2graph: 
    {vecRAG_doc2graph}\n\n
    vecRAG_form10k:
    {vecRAG_form10k}
    """
    # print(final_data)
    return final_data   

## Graph retriever
#### Named Entity Recognition (NER)

In [8]:
# connect to Neo4j
graph_doc2graph = Neo4jGraph(
    url=os.getenv("NEO4J_URL"), 
    username=os.getenv("NEO4J_USERNAME"), 
    password=os.getenv("NEO4J_PASSWORD"), 
    database=graphDB_doc2graph,
)
graph_doc2graph

<langchain_community.graphs.neo4j_graph.Neo4jGraph at 0x13fc50400>

In [52]:
graph_doc2graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

# Extract entities from text
class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the persons, organizations, or companies, locations, "
         "and other named entities that appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting all named general entities and financial concepts that appear in the text.",
            # "You are extracting organization, person, company, financial entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {input}",
        ),
    ]
)

entity_extract_chain = prompt | llm.with_structured_output(Entities)
entity_extract_chain.invoke("Nvidia's major markets are Data Center, Gaming, Professional Visualization, and Automotive").names

['Data Center', 'Gaming', 'Professional Visualization', 'Automotive']

In [51]:
entity_extract_chain.invoke("Who are Nvidia's major collaboraters and competitors").names

['Nvidia', 'collaborators', 'competitors']

In [10]:
# function that will generate full-text queries 
def generate_full_text_query(input: str) -> str:
    """
    Generate a full text query for the given input string.

    This function constructs a query string suitable for a full-text search.
    It process the input string by splitting it into words and appending a 
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to knowledge graph nodes, and allow for some misspelling in the query.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f"{word}~2 AND "
    full_text_query += f"{words[-1]}~2"

    return full_text_query.strip()

In [11]:
# Full text index query
def kgNER_retriever(question: str) -> str:
    """
    Collects the neibourhood of entities mentioned in a question.
    """

    entities = entity_extract_chain.invoke({"input": question})
    print(entities)
    for entity in entities.names:
        response = graph_doc2graph.query(
            """
            CALL db.index.fulltext.queryNodes('entity', $query, {limit:5}) 
            YIELD node, score
            CALL {
                WITH node
                MATCH p1=(node)-[r *1..2]-()
                WHERE NONE(rel in r WHERE type(rel)="MENTIONS") 
                RETURN p1 AS output
                //UNION ALL
                //WITH node
                //MATCH p2=(node)<-[r *1..2]-()
                //WHERE NONE(rel in r WHERE type(rel)="MENTIONS") 
                //RETURN p2 AS output
            }
            RETURN output
            LIMIT 30
            """,
            {"query": generate_full_text_query(entity)},
        )
        # print(response)
        # result += "\n".join(el['output'] for el in response)
    
    return response

In [53]:
print(kgNER_retriever("Who are Nvidia's major collaboraters and competitors"))


names=['Nvidia', 'collaborators', 'competitors']
[{'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPIENT', {'id': 'Other Stock-Based Awards'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPIENT', {'id': 'Performance Cash Awards'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPIENT', {'id': 'Rsus'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPIENT', {'id': 'Stock Appreciation Rights'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPIENT', {'id': 'Performance Stock Awards'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPIENT', {'id': 'Stock Options'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPIENT', {'id': 'Non-Statutory Stock Options'}]}, {'output': [{'id': 'Competitors'}, 'WORK_FOR', {'id': 'Employees'}, 'RECIPI

In [12]:
print(kgNER_retriever("Nvidia's major markets are Data Center, Gaming, Professional Visualization, and Automotive"))

names=['Data Center', 'Gaming', 'Professional Visualization', 'Automotive']
[{'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2023'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2023'}, 'REVENUE', {'id': 'Gaming'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2023'}, 'REVENUE', {'id': 'Professional Visualization'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2023'}, 'REVENUE', {'id': 'Data Center'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2023'}, 'REVENUE', {'id': 'Oem And Other'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2023'}, 'FISCAL_YEAR', {'id': 'Taxation'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2022'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2022'}, 'REVENUE', {'id': 'Professional Visualization'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2022'}, 'REVENUE', {'id': 'Oem And Other'}]}, {'output': [{'id': 'Automotive'}, 'REVENUE', {'id': '2022'}, 'REVENUE', {'id': 'Gaming'}

In [13]:
print(kgNER_retriever(single_query))

names=['Nvidia']
[{'output': [{'id': 'Nvda'}, 'RELATED_TO', {'id': 'Issuer Purchases'}]}, {'output': [{'id': 'Nvda'}, 'LISTED_ON', {'id': 'Nasdaq Global Select Market'}]}, {'output': [{'id': 'Nvda'}, 'LISTED_ON', {'id': 'Nasdaq Global Select Market'}, 'ON', {'id': 'July 28, 2023'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}, 'FOUNDER', {'id': 'Jen-Hsun Huang'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}, 'CEO', {'id': 'Jen-Hsun Huang'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}, 'CEO', {'id': 'Jen-Hsun Huang'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}, 'CEO', {'id': 'Jen-Hsun Huang'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}, 'ACQUIRED', {'id': 'Mellanox'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corp

## Final combined retriever
Combine the vector and graph retriever to create the final context that will be passed to an LLM.

In [14]:
def final_combined_retriever(question: str) -> str:
    # print(f'Search query: {question}')   
    vecRAG_combined = vecCombined_retriever(question)
    # print(vecRAG_combined)  ## 
    graphRAG = kgNER_retriever(question)
    # print(graphRAG)  ## 
    final_data = f"""
    {vecRAG_combined}\n\n
    graphRAG:
    {graphRAG}
    """
    # print(final_data)
    return final_data   

## RAG Chain
#### initialize the RAG chains with the default settings

In [15]:
# Prompt
template = """Answer the question based only on the following context:
{context}
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
If you have the answer, you MUST ALWAYS:
    - keep it in bulletpoints and concise in markdown format, 
    - also give a short explanation of your answer and citation at the end of your answer.

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

def gen_RAG_chain(retriever) -> BaseQAWithSourcesChain:
    chain = (
        RunnableParallel(
            {
                "context": retriever,
                "question": RunnablePassthrough(),
            }
        )
        | prompt
        | llm   
        | StrOutputParser()
    )
    return chain

In [16]:
# single vector RAG chain
vecRAG_chain_doc2graph = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm_rag_groq,  # llm_rag_groq, llm_rag_ollama
    chain_type="stuff", 
    retriever=retriever_doc2graph,
)

vecRAG_chain_form10k = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm_rag_groq,  # llm_rag_groq, llm_rag_ollama
    chain_type="stuff", 
    retriever=retriever_form10k,
)
# vecRAG_chain_doc2graph = gen_RAG_chain(retriever_doc2graph)
# vecRAG_chain_form10k = gen_RAG_chain(retriever_form10k)

# combined vector RAG chain
vegRAG_chain_combined = gen_RAG_chain(vecCombined_retriever)

# graph NER RAG chain
kgNER_chain = gen_RAG_chain(kgNER_retriever)

# vector-graph RAG chain
finalCombined_chain = gen_RAG_chain(final_combined_retriever)


In [17]:
vecRAG_chain_doc2graph, vecRAG_chain_form10k, vegRAG_chain_combined, kgNER_chain, finalCombined_chain

 {
   context: RunnableLambda(vecCombined_retriever),
   question: RunnablePassthrough()
 }
 | ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="Answer the question based only on the following context:\n{context}\nIf you don't know the answer, just say that you don't know. Don't try to make up an answer.\nIf you have the answer, you MUST ALWAYS:\n    - keep it in bulletpoints and concise in markdown format, \n    - also give a short explanation of your answer and citation at the end of your answer.\n\nQuestion: {question}\n"))])
 | ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x13fc22a10>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x13fc23910>, temperature=1e-08, groq_api_key=SecretStr('**********'))
 | StrOutputParser(),
 {
   context: RunnableLambda(kgNER_retriever),
   question: RunnablePassthrough()
 }
 | Ch

In [18]:
result = vecRAG_chain_form10k.invoke(single_query)

In [19]:
type(result), result

(dict,
 {'question': "What's Nvidia's revenue growth",
  'answer': "Nvidia's revenue for fiscal year 2024 was $60.9 billion, up 126% from the previous year. The company's Data Center revenue for fiscal year 2024 was up 217%, driven by strong demand for enterprise software and consumer internet applications, and multiple industry verticals including automotive, financial services, and healthcare. Customers across industry verticals access NVIDIA AI infrastructure both through the cloud and on-premises. Data Center compute revenue was up 244% in the fiscal year. Networking revenue was up 133% in the fiscal year. Gaming revenue for fiscal year 2024 was up 15%. The increase reflects higher sell-in to partners following the normalization of channel inventory levels and growing demand. Professional Visualization revenue for fiscal year 2024 was up 1%. Automotive revenue for the fiscal year 2024 was up 21%. The increase primarily reflected growth in self-driving platforms. Gross margin increa

In [20]:
result = vegRAG_chain_combined.invoke(single_query)

In [21]:
type(result), result

(str,
 "- Nvidia's revenue for fiscal year 2024 was $60.9 billion, up 126% from a year ago.\n- Data Center revenue for fiscal year 2024 was up 217%.\n- Gaming revenue for fiscal year 2024 was up 15%.\n- Professional Visualization revenue for fiscal year 2024 was up 1%.\n- Automotive revenue for the fiscal year 2024 was up 21%.\n\nExplanation:\nThe revenue growth for Nvidia can be seen in the increase in revenue for each of its market platforms. The largest growth was seen in Data Center revenue, which was up 217% in fiscal year 2024. Gaming revenue was up 15%, Professional Visualization revenue was up 1%, and Automotive revenue was up 21%. The overall revenue growth for Nvidia was 126% in fiscal year 2024.\n\nCitation:\nNVIDIA Corporation and Subsidiaries. (2024). NVIDIA Corporation. Annual Report (Form 10-K), p. 54. Retrieved from <https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm>")

In [22]:
result = kgNER_chain.invoke(single_query)

names=['Nvidia']


In [23]:
type(result), result

(str,
 "- Nvidia's revenue growth can be tracked through the 'Issuer Purchases' output in the provided context.\n- On July 28, 2023, Nvidia was listed on the Nasdaq Global Select Market and had a revenue of $11.72 billion, an increase of 66.1% compared to the same quarter in the previous year.\n- Nvidia's revenue growth can also be seen in the 'Inventories' output, which shows that Nvidia owns inventories worth $3.23 billion as of July 28, 2023, an increase of 52.3% compared to the same quarter in the previous year.\n\nSources:\n[{'output': [{'id': 'Nvda'}, 'RELATED_TO', {'id': 'Issuer Purchases'}]}, {'output': [{'id': 'Nvda'}, 'LISTED_ON', {'id': 'Nasdaq Global Select Market'}]}, {'output': [{'id': 'Nvda'}, 'LISTED_ON', {'id': 'Nasdaq Global Select Market'}, 'ON', {'id': 'July 28, 2023'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}]}, {'output': [{'id': 'Nvidia Vgpu'}, 'DEVELOPER', {'id': 'Nvidia Corporation'}, 'FOUNDER', {'id': 'Jen-Hsun Huang'}]}, 

In [24]:
result = finalCombined_chain.invoke(single_query)

names=['Nvidia']


In [25]:
type(result), result

(str,
 "* Nvidia's revenue for fiscal year 2024 was $60.9 billion, up 126% from a year ago.\n* Data Center revenue for fiscal year 2024 was up 217%.\n* Gaming revenue for fiscal year 2024 was up 15%.\n* Professional Visualization revenue for fiscal year 2024 was up 1%.\n* Automotive revenue for the fiscal year 2024 was up 21%.\n\nSource:\n\n* [NVIDIA Corporation and Subsidiaries Consolidated Financial Statements for the Fiscal Year Ended January 28, 2024](https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm)\n* [NVIDIA Corporation and Subsidiaries Consolidated Statements of Comprehensive Income for the Fiscal Year Ended January 28, 2024](https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm)\n* [NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets for the Fiscal Year Ended January 28, 2024](https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm)\n* [NVIDIA Corporation and Subsidiarie

# Main

In [None]:
def prettychain(question: str, chain) -> str:
    """Pretty print the chain's response to a question"""
    result = chain.invoke(question)
    if isinstance(result, dict):
        response = result['answer']
    elif isinstance(result, str):
        response = result
    print("    "+textwrap.fill(response, 120))
    return response

In [26]:
# artificial queries
sample_queries1 = [
    "Where is Nvidia headquartered?",
    "What is Nvidia primary business?",
    "What's Nvidia's revenue growth?",
    "Nvidia's revenue and gross margin growth.",
    "What's Nvidia's major markets",
    "Nvidia's key financial indicators in fiscal year 2024",
    "What are major factors to Nvidia's Data Center financial performance and why",
    "How does China impact Nvidia's financial indicators in fiscal year 2024",
    "What financial assets does Nvidia Corporation have?",
    "What can we know about Nvidia's Data Center?",
    "What insights of demand can we get from `IMPACTS` as relationship?",
    "How are Nvidia Corporation and Registrant related?",
    "What insights can we get from `Stock award`?",
    "What insights can we get from `IMPOSE` as relationship?",
    "How are Nvidia and Xiaomi related",
]

len(sample_queries1)

15

### queries generated by ChatGPT-4o
##### query set 1

In [27]:
# Simple Queries (Suitable for Pure Vector RAG)
simple_queries = [
    "What was Nvidia’s total revenue in fiscal year 2024?",
	"What are Nvidia’s major markets?",
	"How much did Nvidia spend on R&D in fiscal year 2024?",
	"What is Nvidia’s net income for fiscal year 2024?",
	"List the key financial indicators for Nvidia in fiscal year 2024.",
	"What is Nvidia’s market share in the data center segment?",
	"How did Nvidia’s gaming revenue perform in fiscal year 2024?",
	"What are the main products of Nvidia in the automotive market?",
	"How many employees does Nvidia have?",
	"What is the gross profit margin for Nvidia in fiscal year 2024?",
]

# Moderate Complexity Queries (Suitable for Graph RAG)
moderate_queries = [
    "Describe the impact of U.S. export restrictions on Nvidia’s business.",
	"What are the primary reasons for the decline in Nvidia’s gaming revenue?",
	"How has Nvidia’s data center business grown in the past year?",
	"Explain the importance of Nvidia’s new product releases in fiscal year 2024.",
	"What are the challenges Nvidia faces in the Chinese market?",
	"Summarize Nvidia’s strategy for handling supply chain disruptions.",
	"How does Nvidia’s expenditure on sales and marketing compare to its R&D spending?",
	"What are the regulatory impacts on Nvidia’s gaming segment in China?",
	"Describe Nvidia’s approach to developing AI and large language models.",
	"How has Nvidia’s professional visualization market been performing?",
]

# High Complexity Queries (Suitable for Combined RAG from Both Vector and Graph)
complex_queries = [
    "Analyze the combined effects of economic conditions and regulatory changes on Nvidia’s overall performance.",
	"How does Nvidia’s financial performance in fiscal year 2024 compare to its performance in the previous year?",
	"What are the specific contributions of new product innovations to Nvidia’s revenue growth?",
	"Detail Nvidia’s competitive landscape in the data center market.",
	"How have geopolitical tensions influenced Nvidia’s strategic decisions?",
	"Explain the impact of Nvidia’s acquisition costs on its operating expenses.",
	"What are the potential risks and opportunities for Nvidia in the AI market?",
	"How has Nvidia’s capital return strategy affected its financial stability?",
	"Assess Nvidia’s long-term growth prospects in the context of current market challenges.",
	"Describe the interplay between Nvidia’s revenue growth and its investments in emerging technologies.",
]

sample_queries2 = simple_queries + moderate_queries + complex_queries
len(sample_queries2)

30

##### query set 2

In [28]:
# Basic Queries (Simple Fact Retrieval)
basic_queries = [
    "What was Nvidia’s total revenue in FY 2024?",
	"How much did Nvidia spend on Research and Development in FY 2024?",
	"What is the name of Nvidia’s latest GPU architecture?",
	"Who are Nvidia’s major customers in the Data Center segment?",
	"What are the primary markets Nvidia serves?",
]

# Intermediate Queries (Fact Synthesis and Relation Extraction)
intermediate_queries = [
    "Describe the impact of the U.S. export restrictions on Nvidia’s sales to China.",
	"How has Nvidia’s Gaming revenue changed in FY 2024 compared to the previous year?",
	"What are the key products in Nvidia’s Professional Visualization segment?",
	"Explain Nvidia’s strategy for mitigating the impact of supply chain disruptions.",
	"What new products did Nvidia introduce in FY 2024?",
]

# Advanced Queries (Complex Relationships and Analysis)
advanced_queries = [
    "How did Nvidia’s acquisition termination costs affect their financial performance?",
	"Compare Nvidia’s revenue growth in the Data Center and Automotive segments.",
	"What measures has Nvidia taken to comply with regulatory changes in China?",
	"Analyze the trend in Nvidia’s operating expenses over the last three fiscal years.",
	"How has Nvidia’s approach to AI and machine learning evolved in FY 2024?",
]

# Vector RAG-Specific Queries
vector_rag_queries = [
    "List the major factors driving growth in Nvidia’s Data Center segment.",
	"What challenges did Nvidia face in the Gaming segment in FY 2024?",
	"Summarize Nvidia’s financial performance in FY 2024 in terms of net income and gross profit.",
	"Identify Nvidia’s key financial indicators and their significance.",
	"What were the major economic conditions affecting Nvidia in FY 2024?",
]

# Graph RAG-Specific Queries (Cypher Queries)
graph_rag_queries = [
    "Find the relationships between Nvidia’s different business segments and their respective revenue contributions.",
	"Identify the interconnections between Nvidia’s new product launches and their market impact.",
	"Trace the regulatory changes in China and their effects on Nvidia’s sales strategy.",
	"Map out the supply chain disruptions and their cascading effects on Nvidia’s logistics.",
	"Explore the dependencies between Nvidia’s R&D investments and their product innovation cycle.",
]

# Combined RAG Queries (Complex Multi-Source Queries)
combined_rag_queries = [
    "How have Nvidia’s partnerships and collaborations influenced their market position in the Data Center segment?",
	"What are the long-term implications of Nvidia’s investment in AI technologies for their financial health?",
	"Analyze the combined impact of economic conditions, regulatory changes, and supply chain issues on Nvidia’s overall performance.",
	"How do Nvidia’s efforts in sustainability and environmental responsibility reflect in their financial statements?",
	"Provide a comprehensive overview of Nvidia’s strategic initiatives and their outcomes in FY 2024.",
]

sample_queries3 = basic_queries + intermediate_queries + advanced_queries + vector_rag_queries + graph_rag_queries + combined_rag_queries
len(sample_queries3)

30

In [29]:
# initialize dataframe
df_rag = pd.DataFrame(columns=["query", "level", "vec_doc2graph", "vec_form10k", "vec_combined", "kg_ner", "kg_2cypher", "kg_combined", "vec_kg_combined"])
# df_rag["query"] = sample_queries1

# df_rag["query"] = sample_queries2
# df_rag.loc[:10, "level"] = "basic"
# df_rag.loc[10:20, "level"] = "moderate"
# df_rag.loc[20:, "level"] = "complex"

df_rag["query"] = sample_queries3
df_rag.loc[:5, "level"] = "basic"
df_rag.loc[5:10, "level"] = "intermediate"
df_rag.loc[10:15, "level"] = "advanced"
df_rag.loc[15:20, "level"] = "vector_rag"
df_rag.loc[20:25, "level"] = "graph_rag"
df_rag.loc[25:, "level"] = "combined"
df_rag

Unnamed: 0,query,level,vec_doc2graph,vec_form10k,vec_combined,kg_ner,kg_2cypher,kg_combined,vec_kg_combined
0,What was Nvidia’s total revenue in FY 2024?,basic,,,,,,,
1,How much did Nvidia spend on Research and Deve...,basic,,,,,,,
2,What is the name of Nvidia’s latest GPU archit...,basic,,,,,,,
3,Who are Nvidia’s major customers in the Data C...,basic,,,,,,,
4,What are the primary markets Nvidia serves?,basic,,,,,,,
5,Describe the impact of the U.S. export restric...,intermediate,,,,,,,
6,How has Nvidia’s Gaming revenue changed in FY ...,intermediate,,,,,,,
7,What are the key products in Nvidia’s Professi...,intermediate,,,,,,,
8,Explain Nvidia’s strategy for mitigating the i...,intermediate,,,,,,,
9,What new products did Nvidia introduce in FY 2...,intermediate,,,,,,,


In [30]:
for i, query in enumerate(sample_queries3):
    print(i, query)
    print(">>> vecRAG doc2graph")
    df_rag.loc[i, "vec_doc2graph"] = prettychain(query, vecRAG_chain_doc2graph)
    print(">>> vecRAG form10k")
    df_rag.loc[i, "vec_form10k"] = prettychain(query, vecRAG_chain_form10k)
    print(">>> vecRAG combined")
    time.sleep(60)
    df_rag.loc[i, "vec_combined"] = prettychain(query, vegRAG_chain_combined)
    print("\n")

0 What was Nvidia’s total revenue in FY 2024?
>>> vecRAG doc2graph
    Nvidia's total revenue for FY 2024 was $60,922 million.
>>> vecRAG form10k
    Nvidia's total revenue in FY 2024 was $29,830 million.
>>> vecRAG combined
    - Nvidia's total revenue in FY 2024 was $60,922 million. - This information can be found in the document on page 78, in
the section "Revenue by geographic areas." - The revenue by geographic areas is designated based on the billing location
of the customer. - Revenue from sales to customers outside of the United States accounted for 56% of total revenue for
fiscal year 2024. - The document also provides information on Nvidia's revenue by specialized markets, with Data Center
revenue being $47,525 million in FY 2024. - The document is a 10-K filing by Nvidia Corporation with the SEC.


1 How much did Nvidia spend on Research and Development in FY 2024?
>>> vecRAG doc2graph
    Nvidia did not disclose the amount they spent on Research and Development in FY 2024 i

In [31]:
df_rag.to_csv("Nvidia_form10k_df_rag_groq_mixtral_QAchain_sample3.csv", index=False)

In [34]:
# time.sleep(60)
for i, query in enumerate(sample_queries3):
    print(i, query)    
    print(">>> kg ner RAG")
    df_rag.loc[i, "kg_ner"] = prettychain(query, kgNER_chain)
    print(">>> vector-graph combined")
    df_rag.loc[i, "vec_kg_combined"] = prettychain(query, finalCombined_chain)
    print("\n")
    time.sleep(60)

0 What was Nvidia’s total revenue in FY 2024?
>>> kg ner RAG
names=['Nvidia', 'total revenue', 'FY 2024']
    I don't have the exact total revenue for Nvidia in FY 2024, as the context provided does not include any financial
information about Nvidia's revenue. You would need to refer to Nvidia's official financial statements or reports for
accurate revenue information.
>>> vector-graph combined
names=['Nvidia', 'total revenue', 'FY 2024']
    - Nvidia's total revenue in FY 2024 was $60,922 million.  Explanation: The total revenue for Nvidia in FY 2024 can be
found in the 'vecRAG_doc2graph' document, specifically in the 'page_content' of the second document. The revenue is
listed under the 'Revenue: (In millions)' header for the year ended January 28, 2024.


1 How much did Nvidia spend on Research and Development in FY 2024?
>>> kg ner RAG
names=['Nvidia', 'Research and Development', 'FY 2024']
    I don't have enough information to provide an accurate answer to your question. The prov

In [38]:
df_rag

Unnamed: 0,query,level,vec_doc2graph,vec_form10k,vec_combined,kg_ner,kg_2cypher,kg_combined,vec_kg_combined
0,What was Nvidia’s total revenue in FY 2024?,basic,"Nvidia's total revenue for FY 2024 was $60,922...","Nvidia's total revenue in FY 2024 was $29,830 ...","- Nvidia's total revenue in FY 2024 was $60,92...",I don't have the exact total revenue for Nvidi...,,,"- Nvidia's total revenue in FY 2024 was $60,92..."
1,How much did Nvidia spend on Research and Deve...,basic,Nvidia did not disclose the amount they spent ...,Nvidia did not disclose the amount spent on Re...,- Nvidia spent $102 million on Research and De...,I don't have enough information to provide an ...,,,"- Nvidia spent $11,329 million on operating ex..."
2,What is the name of Nvidia’s latest GPU archit...,basic,Nvidia's latest GPU architecture is called Hop...,Nvidia's latest GPU architecture is not explic...,The name of Nvidia's latest GPU architecture i...,"Based on the provided context, there is no dir...",,,I don't have real-time access to external data...
3,Who are Nvidia’s major customers in the Data C...,basic,NVIDIA's major customers in the Data Center se...,Nvidia's major customers in the Data Center se...,- Major cloud providers and server makers\n- O...,- Nvidia Corporation\n\nExplanation:\nThe cont...,,,- The major customers in Nvidia’s Data Center ...
4,What are the primary markets Nvidia serves?,basic,"NVIDIA serves various markets, including the g...",Nvidia serves the following primary markets: D...,- Data Center: NVIDIA's data center platforms ...,"Based on the provided context, there is no dir...",,,- Nvidia serves four primary markets: Data Cen...
5,Describe the impact of the U.S. export restric...,intermediate,Export restrictions on Nvidia's sales to China...,The U.S. export restrictions on Nvidia's sales...,- The U.S. export restrictions on Nvidia's sal...,"Based on the provided context, the text does n...",,,- The U.S. export restrictions on certain gami...
6,How has Nvidia’s Gaming revenue changed in FY ...,intermediate,Nvidia's Gaming revenue has not changed signif...,Nvidia's Gaming revenue for FY 2024 was not pr...,- Nvidia's Gaming revenue increased in FY 2024...,"Based on the provided context, there is not en...",,,"Based on the provided context, there is not en..."
7,What are the key products in Nvidia’s Professi...,intermediate,The key products in Nvidia’s Professional Visu...,The key products in Nvidia’s Professional Visu...,- Quadro/NVIDIA RTX GPUs for enterprise workst...,- Nvidia VGPU\n\nExplanation:\nBased on the pr...,,,- Quadro/NVIDIA RTX GPUs for enterprise workst...
8,Explain Nvidia’s strategy for mitigating the i...,intermediate,Nvidia's strategy for mitigating the impact of...,Nvidia's strategy for mitigating the impact of...,"- Nvidia has a platform strategy, which involv...",- Nvidia has adopted a multi-faceted strategy ...,,,- Nvidia has a strategy of paying premiums and...
9,What new products did Nvidia introduce in FY 2...,intermediate,Nvidia did not introduce any new products in F...,"Nvidia introduced new products in FY 2024, inc...",- NVIDIA DGX Cloud services\n- NVIDIA AI Found...,"Based on the provided context, there is no inf...",,,- NVIDIA DGX Cloud services\n- NVIDIA AI Found...


In [44]:
df_rag.copy().to_csv("Nvidia_form10k_df_rag_groq_mixtral_QAchain_sample3.csv", index=False)

In [50]:
df_rag.to_csv("sample3.csv")