* https://pypi.org/project/sec-api/#description
* https://sec-api.io/profile

In [2]:
# SEC API
from sec_api import QueryApi, XbrlApi, ExtractorApi, MappingApi
# environment variables
import os
from dotenv import load_dotenv
# data processing
import json
import textwrap
from bs4 import BeautifulSoup
import pandas as pd

from tqdm import tqdm

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
# from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq

load_dotenv()

True

## 1. Get Form 10K - run just ONCE!!

#### Get 10K url

In [2]:
queryApi = QueryApi(os.getenv('SEC_API_KEY2'))

In [3]:
# Company whose annual report is fetched; the same ticker is reused later for the mapping lookup
ticker = "NVDA"
# Search request handed to queryApi.get_filings()
query = {
  "query": { "query_string": { 
      "query": f"formType:\"10-K\" AND ticker:{ticker}", # restrict matches to 10-K filings of this ticker
  }},
  "from": "0", # offset of the first match to return (0 = start at the first matching filing)
  "size": "1"  # return just one filing
}
# NOTE(review): no sort order is specified; the recorded response shows the most
# recently filed 10-K coming back first — confirm that ordering is guaranteed.

In [4]:
response = queryApi.get_filings(query)

In [5]:
response

{'total': {'value': 26, 'relation': 'eq'},
 'query': {'from': 0, 'size': 1},
 'filings': [{'id': '946a233fb0d9d28c95b372bdda12d004',
   'accessionNo': '0001045810-24-000029',
   'cik': '1045810',
   'ticker': 'NVDA',
   'companyName': 'NVIDIA CORP',
   'companyNameLong': 'NVIDIA CORP (Filer)',
   'formType': '10-K',
   'description': 'Form 10-K - Annual report [Section 13 and 15(d), not S-K Item 405]',
   'filedAt': '2024-02-21T16:36:57-05:00',
   'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/0001045810-24-000029.txt',
   'linkToHtml': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/0001045810-24-000029-index.htm',
   'linkToXbrl': '',
   'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm',
   'entities': [{'companyName': 'NVIDIA CORP (Filer)',
     'cik': '1045810',
     'irsNo': '943177549',
     'stateOfIncorporation': 'DE',
     'fiscalYearEnd': '0128',
     'type': '10-

In [6]:
response["total"]

{'value': 26, 'relation': 'eq'}

In [7]:
filings = response["filings"][0]
filings

{'id': '946a233fb0d9d28c95b372bdda12d004',
 'accessionNo': '0001045810-24-000029',
 'cik': '1045810',
 'ticker': 'NVDA',
 'companyName': 'NVIDIA CORP',
 'companyNameLong': 'NVIDIA CORP (Filer)',
 'formType': '10-K',
 'description': 'Form 10-K - Annual report [Section 13 and 15(d), not S-K Item 405]',
 'filedAt': '2024-02-21T16:36:57-05:00',
 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/0001045810-24-000029.txt',
 'linkToHtml': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/0001045810-24-000029-index.htm',
 'linkToXbrl': '',
 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm',
 'entities': [{'companyName': 'NVIDIA CORP (Filer)',
   'cik': '1045810',
   'irsNo': '943177549',
   'stateOfIncorporation': 'DE',
   'fiscalYearEnd': '0128',
   'type': '10-K',
   'act': '34',
   'fileNo': '000-23985',
   'filmNo': '24660316',
   'sic': '3674 Semiconductors &amp; Related Devices'}]

In [8]:
url_10k = filings["linkToFilingDetails"]
url_10k

'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm'

#### Get 10K financial statements

In [9]:
xbrlApi = XbrlApi(os.getenv('SEC_API_KEY2'))
financial_statements_json = xbrlApi.xbrl_to_json(url_10k)

In [10]:
type(financial_statements_json)

dict

In [11]:
for k,v in financial_statements_json.items():
    print(k, type(v))

CoverPage <class 'dict'>
AuditInformation <class 'dict'>
StatementsOfIncome <class 'dict'>
StatementsOfComprehensiveIncome <class 'dict'>
BalanceSheets <class 'dict'>
BalanceSheetsParenthetical <class 'dict'>
StatementsOfShareholdersEquity <class 'dict'>
StatementsOfShareholdersEquityParenthetical <class 'dict'>
StatementsOfCashFlows <class 'dict'>
OrganizationandSummaryofSignificantAccountingPolicies <class 'dict'>
BusinessCombination <class 'dict'>
Leases <class 'dict'>
StockBasedCompensation <class 'dict'>
NetIncomePerShare <class 'dict'>
Goodwill <class 'dict'>
AmortizableIntangibleAssets <class 'dict'>
CashEquivalentsandMarketableSecurities <class 'dict'>
FairValueofFinancialAssetsandLiabilitiesandInvestmentsinNonAffiliatedEntities <class 'dict'>
BalanceSheetComponents <class 'dict'>
DerivativeFinancialInstruments <class 'dict'>
Debt <class 'dict'>
CommitmentsandContingencies <class 'dict'>
IncomeTaxes <class 'dict'>
ShareholdersEquity <class 'dict'>
EmployeeRetirementPlans <class

In [12]:
# # Access income statement, balance sheet and cash flow statement
# print(financial_statements_json["StatementsOfIncome"])
# print(financial_statements_json["BalanceSheets"])
# print(financial_statements_json["StatementsOfCashFlows"])

In [13]:
with open("nvda_10k_2023_fin.json", "w") as f:
    json.dump(financial_statements_json, f, indent=4)

##### Standard 10K ToC text
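The section extractor only accepts the item codes listed below; any other code is rejected with:
`"message": "10-K item type not supported. Supported items are: {1, 1A, 1B, 1C, 2, 3, 4, 5, 6, 7, 7A, 8, 9, 9A, 9B, 9C, 10, 11, 12, 13, 14, 15}"`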

In [14]:
# Item codes of a standard 10-K table of contents
toc_key = [
    "1", "1A", "1B", "1C", "2", "3", "4", "5", "6", "7", "7A", "8",
    "9", "9A", "9B", "9C", "10", "11", "12", "13", "14", "15",
]
# Each code gets the label "item<code>", e.g. "1A" -> "item1A"
toc_val = [f"item{code}" for code in toc_key]
toc_sectors = {code: label for code, label in zip(toc_key, toc_val)}
len(toc_sectors),


(22,)

In [15]:
extractorApi = ExtractorApi(os.getenv('SEC_API_KEY2'))

# One extractor call per table-of-contents entry; the plain-text section is
# stored under its "item<code>" label.
sectors = {}
for item_code, section_label in tqdm(toc_sectors.items()):
    sectors[section_label] = extractorApi.get_section(url_10k, item_code, return_type='text')
    

  0%|          | 0/22 [00:00<?, ?it/s]

100%|██████████| 22/22 [00:28<00:00,  1.27s/it]


In [16]:
type(sectors)

dict

##### Get Company mapping (e.g. cik, sic)

In [17]:
mappingApi = MappingApi(os.getenv('SEC_API_KEY2'))

In [18]:
mapping = mappingApi.resolve("ticker", ticker)[0]

In [19]:
# type(mapping), 
mapping

{'name': 'NVIDIA CORP',
 'ticker': 'NVDA',
 'cik': '1045810',
 'cusip': '67066G104',
 'exchange': 'NASDAQ',
 'isDelisted': False,
 'category': 'Domestic Common Stock',
 'sector': 'Technology',
 'industry': 'Semiconductors',
 'sic': '3674',
 'sicSector': 'Manufacturing',
 'sicIndustry': 'Semiconductors & Related Devices',
 'famaSector': '',
 'famaIndustry': 'Electronic Equipment',
 'currency': 'USD',
 'location': 'California; U.S.A',
 'id': '4a73b69083f93d38e05e0b76219875c9'}

##### Save to file

In [20]:
# Flat record written to disk: company mapping fields first, then the filing
# URL, then one "item<code>" entry per extracted section. On a key clash the
# later source wins, exactly as with successive dict.update() calls.
nvda_10k_text_json = {**mapping, 'url': url_10k, **sectors}

In [21]:
for k,v in nvda_10k_text_json.items():
    print(k, type(v))

name <class 'str'>
ticker <class 'str'>
cik <class 'str'>
cusip <class 'str'>
exchange <class 'str'>
isDelisted <class 'bool'>
category <class 'str'>
sector <class 'str'>
industry <class 'str'>
sic <class 'str'>
sicSector <class 'str'>
sicIndustry <class 'str'>
famaSector <class 'str'>
famaIndustry <class 'str'>
currency <class 'str'>
location <class 'str'>
id <class 'str'>
url <class 'str'>
item1 <class 'str'>
item1A <class 'str'>
item1B <class 'str'>
item1C <class 'str'>
item2 <class 'str'>
item3 <class 'str'>
item4 <class 'str'>
item5 <class 'str'>
item6 <class 'str'>
item7 <class 'str'>
item7A <class 'str'>
item8 <class 'str'>
item9 <class 'str'>
item9A <class 'str'>
item9B <class 'str'>
item9C <class 'str'>
item10 <class 'str'>
item11 <class 'str'>
item12 <class 'str'>
item13 <class 'str'>
item14 <class 'str'>
item15 <class 'str'>


In [22]:
with open("nvda_10k_2023.json", "w") as f:
    json.dump(nvda_10k_text_json, f, indent=4)

## 2. Check financial statements
https://medium.com/@jan_5421/extracting-financial-statements-from-sec-filings-xbrl-to-json-f83542ade90

In [23]:
with open("nvda_10k_2023_fin.json", "r") as f:
    financial_statements_json = json.load(f)

##### Income statement

In [24]:
# convert XBRL-JSON of income statement to pandas dataframe
def get_income_statement(financial_statements_json):
    """Convert the XBRL-JSON income statement into a pandas DataFrame.

    Args:
        financial_statements_json: dict with a 'StatementsOfIncome' entry that
            maps each US GAAP item name to a list of fact dicts. Every fact
            used here has period.startDate and period.endDate; 'value' and
            'segment' are optional.

    Returns:
        DataFrame with one row per US GAAP item and one column per
        "<startDate>-<endDate>" range. Values are kept exactly as found in
        the facts; a fact without a 'value' contributes 0.

    Raises:
        KeyError: if 'StatementsOfIncome' is missing or a non-segment fact
            lacks period.startDate / period.endDate.
    """
    income_statement_store = {}

    # iterate over each US GAAP item in the income statement
    for usGaapItem, facts in financial_statements_json['StatementsOfIncome'].items():
        values = []
        indicies = []

        for fact in facts:
            # only consider items without segment. not required for our analysis.
            if 'segment' in fact:
                continue

            index = fact['period']['startDate'] + '-' + fact['period']['endDate']
            # ensure no index duplicates are created: the first fact per period wins
            if index in indicies:
                continue

            # nil facts carry no 'value'; record 0 like the balance-sheet and
            # cash-flow converters do instead of raising KeyError
            values.append(fact.get('value', 0))
            indicies.append(index)

        income_statement_store[usGaapItem] = pd.Series(values, index=indicies)

    income_statement = pd.DataFrame(income_statement_store)
    # switch columns and rows so that US GAAP items are rows and each column header represents a date range
    return income_statement.T

income_statement = get_income_statement(financial_statements_json)

income_statement

Unnamed: 0,2023-01-30-2024-01-28,2022-01-31-2023-01-29,2021-02-01-2022-01-30
Revenues,60922000000.0,26974000000.0,26914000000.0
CostOfRevenue,16621000000.0,11618000000.0,9439000000.0
GrossProfit,44301000000.0,15356000000.0,17475000000.0
ResearchAndDevelopmentExpense,8675000000.0,7339000000.0,5268000000.0
SellingGeneralAndAdministrativeExpense,2654000000.0,2440000000.0,2166000000.0
BusinessCombinationAdvancedConsiderationWrittenOff,0.0,1353000000.0,0.0
OperatingExpenses,11329000000.0,11132000000.0,7434000000.0
OperatingIncomeLoss,32972000000.0,4224000000.0,10041000000.0
InvestmentIncomeInterest,866000000.0,267000000.0,29000000.0
InterestExpense,257000000.0,262000000.0,236000000.0


##### Balance sheet

In [25]:
# convert XBRL-JSON of balance sheet to pandas dataframe
def get_balance_sheet(financial_statements_json):
    """Convert the XBRL-JSON balance sheet into a pandas DataFrame.

    Args:
        financial_statements_json: dict with a 'BalanceSheets' entry that maps
            each US GAAP item name to a list of fact dicts. Every fact used
            here has period.instant; 'value' and 'segment' are optional.

    Returns:
        DataFrame with one row per US GAAP item and one column per date
        instant. A fact without a 'value' contributes 0. Items with no
        usable fact appear as a row without values, matching the income
        statement and cash-flow converters.

    Raises:
        KeyError: if 'BalanceSheets' is missing or a non-segment fact lacks
            period.instant.
    """
    balance_sheet_store = {}

    for usGaapItem, facts in financial_statements_json['BalanceSheets'].items():
        values = []
        indicies = []

        for fact in facts:
            # only consider items without segment.
            if 'segment' in fact:
                continue

            index = fact['period']['instant']

            # avoid duplicate indicies with same values: the first fact per instant wins
            if index in indicies:
                continue

            # add 0 if value is nil
            values.append(fact.get('value', 0))
            indicies.append(index)

        # build the Series once per item, after all of its facts were scanned
        # (it used to be rebuilt on every fact of the inner loop)
        balance_sheet_store[usGaapItem] = pd.Series(values, index=indicies)

    balance_sheet = pd.DataFrame(balance_sheet_store)
    # switch columns and rows so that US GAAP items are rows and each column header represents a date instant
    return balance_sheet.T

balance_sheet = get_balance_sheet(financial_statements_json)
balance_sheet

Unnamed: 0,2021-01-31,2022-01-30,2023-01-29,2024-01-28
CashAndCashEquivalentsAtCarryingValue,,,3389000000.0,7280000000.0
MarketableSecuritiesCurrent,,,9907000000.0,18704000000.0
AccountsReceivableNetCurrent,,,3827000000.0,9999000000.0
InventoryNet,,,5159000000.0,5282000000.0
PrepaidExpenseAndOtherAssetsCurrent,,,791000000.0,3080000000.0
AssetsCurrent,,,23073000000.0,44345000000.0
PropertyPlantAndEquipmentNet,,,3807000000.0,3914000000.0
OperatingLeaseRightOfUseAsset,,,1038000000.0,1346000000.0
Goodwill,,,4372000000.0,4430000000.0
IntangibleAssetsNetExcludingGoodwill,,,1676000000.0,1112000000.0


##### Cash flow statement

In [26]:
def get_cash_flow_statement(financial_statements_json):
    """Turn the XBRL-JSON cash flow statement into a pandas DataFrame.

    Reads financial_statements_json['StatementsOfCashFlows'], which maps each
    US GAAP item name to a list of fact dicts. Facts carrying a 'segment' are
    ignored. A fact is labelled by period.instant when present, otherwise by
    "<startDate>-<endDate>"; only the first fact per label is kept, and a
    fact without a 'value' counts as 0.

    Returns:
        DataFrame with one row per US GAAP item and one column per period label.
    """
    statement = financial_statements_json['StatementsOfCashFlows']
    series_by_item = {}

    for item_name, facts in statement.items():
        period_labels = []
        amounts = []

        for fact in facts:
            # segment breakdowns are not part of the consolidated view
            if 'segment' in fact:
                continue

            period = fact['period']
            # point-in-time facts use the instant, flows use the date range
            if "instant" in period:
                label = period['instant']
            else:
                label = period['startDate'] + '-' + period['endDate']

            # first fact for a given period wins
            if label in period_labels:
                continue

            amounts.append(fact.get('value', 0))
            period_labels.append(label)

        series_by_item[item_name] = pd.Series(amounts, index=period_labels)

    # items as rows, period labels as columns
    return pd.DataFrame(series_by_item).T
    
cash_flows = get_cash_flow_statement(financial_statements_json)
cash_flows

Unnamed: 0,2021-01-31,2021-02-01-2022-01-30,2022-01-30,2022-01-31-2023-01-29,2023-01-29,2023-01-30-2024-01-28,2024-01-28
NetIncomeLoss,,9752000000.0,,4368000000.0,,29760000000.0,
ShareBasedCompensation,,2004000000.0,,2709000000.0,,3549000000.0,
DepreciationDepletionAndAmortization,,1174000000.0,,1544000000.0,,1508000000.0,
DeferredIncomeTaxExpenseBenefit,,-406000000.0,,-2164000000.0,,-2489000000.0,
GainLossOnInvestments,,100000000.0,,-45000000.0,,238000000.0,
BusinessCombinationAdvancedConsiderationWrittenOff,,0.0,,1353000000.0,,0.0,
OtherNoncashIncomeExpense,,-47000000.0,,7000000.0,,278000000.0,
IncreaseDecreaseInAccountsReceivable,,2215000000.0,,-822000000.0,,6172000000.0,
IncreaseDecreaseInInventories,,774000000.0,,2554000000.0,,98000000.0,
IncreaseDecreaseInPrepaidDeferredExpenseAndOtherAssets,,1715000000.0,,1517000000.0,,1522000000.0,


## 3. Generate graph nodes from 10K text

In [3]:
# Neo4j connection settings. Values already present in the environment (for
# example loaded from .env by load_dotenv()) take precedence; the literals
# below are only fallbacks, so they no longer overwrite real configuration.
# NOTE(review): the password fallback is a credential committed in the
# notebook — move it to .env and drop it from here once that is in place.
os.environ.setdefault("NEO4J_URL", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
os.environ.setdefault("NEO4J_PASSWORD", "neo4j123456")
os.environ.setdefault("NEO4J_DATABASE", "form10k")

# Global constants
VECTOR_INDEX_NAME = 'form_10k_chunks'          # name of the vector index on chunk nodes
VECTOR_NODE_LABEL = 'Chunk'                    # label of the nodes that hold text chunks
VECTOR_SOURCE_PROPERTY = 'text'                # node property containing the chunk text
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'    # node property containing the embedding vector

In [28]:
with open("nvda_10k_2023.json", "r") as f:
    filing_10k = json.load(f)

In [29]:
filing_10k.keys()

dict_keys(['name', 'ticker', 'cik', 'cusip', 'exchange', 'isDelisted', 'category', 'sector', 'industry', 'sic', 'sicSector', 'sicIndustry', 'famaSector', 'famaIndustry', 'currency', 'location', 'id', 'url', 'item1', 'item1A', 'item1B', 'item1C', 'item2', 'item3', 'item4', 'item5', 'item6', 'item7', 'item7A', 'item8', 'item9', 'item9A', 'item9B', 'item9C', 'item10', 'item11', 'item12', 'item13', 'item14', 'item15'])

##### Split text to chunks

In [30]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

**Text embedding**

In [9]:
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large:latest",  # 1024 dimensions
    # model="znbang/bge:large-en-v1.5-f16",   # 1024 dimensions
    # model="nomic-embed-text:latest",   # 768 dimensions
    num_gpu=1,
    num_thread=16,
)

embedded = embeddings.embed_query("some sample text chunk")

In [32]:
type(embedded), len(embedded)

(list, 1024)

In [33]:
extract_section_list = ['item1','item1A','item7','item7A', 'item15']

In [34]:
def split_form10k_data_from_file(file_as_object):
    """Split selected 10-K sections into chunk records with metadata and embeddings.

    For every key in the module-level `extract_section_list`, the section text
    is split with the module-level `text_splitter`; each resulting chunk is
    embedded with `embeddings.embed_query` and wrapped in a dict.

    Args:
        file_as_object: dict loaded from the saved 10-K JSON. Must contain
            'id', 'name', 'cik', 'cusip', 'url' and every key listed in
            `extract_section_list`.

    Returns:
        list of dicts with the keys 'text', 'f10kItem', 'chunkSeqId',
        'formId', 'chunkId', 'name', 'cik', 'cusip', 'source' and
        'textEmbedding', in section order and then chunk order.

    Raises:
        KeyError: if one of the required keys is missing from file_as_object.
    """
    # Identical for every chunk, so read it once. It is the 'id' field of the
    # JSON record, not something derived from a file name.
    # NOTE(review): in this notebook that field is filled from the company
    # mapping, so chunk IDs of two filings by the same company would collide —
    # confirm whether a filing-specific ID should be used instead.
    form_id = file_as_object['id']

    chunks_with_metadata = []  # accumulates one record per chunk
    for item in extract_section_list:  # pull these keys from the json
        print(f'Processing {item}')
        item_text_chunks = text_splitter.split_text(file_as_object[item])
        for chunk_seq_id, chunk in enumerate(item_text_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                # position of the chunk within the form
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # identifiers built from the record id, section and position
                'formId': f'{form_id}',
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata copied from the record
                'name': file_as_object['name'],
                'cik': file_as_object['cik'],
                'cusip': file_as_object['cusip'],
                'source': file_as_object['url'],
                # one embedding call per chunk; this is the slow part of the loop
                'textEmbedding': embeddings.embed_query(chunk)
            })
        print(f'\tSplit into {len(item_text_chunks)} chunks')
    return chunks_with_metadata

In [35]:
filing_10k_chunks = split_form10k_data_from_file(filing_10k)
print(f'Total {len(filing_10k_chunks)} chunks')

Processing item1
	Split into 32 chunks
Processing item1A
	Split into 75 chunks
Processing item7
	Split into 25 chunks
Processing item7A
	Split into 2 chunks
Processing item15
	Split into 72 chunks
Total 206 chunks


In [36]:
filing_10k_chunks[22]

{'text': 'Compliance with laws, rules, and regulations has not otherwise had a material effect upon our capital expenditures, results of operations, or competitive position and we do not currently anticipate material capital expenditures for environmental control facilities. Compliance with existing or future governmental regulations, including, but not limited to, those pertaining to IP ownership and infringement, taxes, import and export requirements and tariffs, anti-corruption, business acquisitions, foreign exchange controls and cash repatriation restrictions, data privacy requirements, competition and antitrust, advertising, employment, product regulations, cybersecurity, environmental, health and safety requirements, the responsible use of AI, climate change, cryptocurrency, and consumer laws, could increase our costs, impact our competitive position, and otherwise may have a material adverse impact on our business, financial condition and results of operations in subsequent per

### 3.1 Create graph nodes using text chunks

In [4]:
kg = Neo4jGraph(
    url=os.getenv("NEO4J_URL"), 
    username=os.getenv("NEO4J_USERNAME"), 
    password=os.getenv("NEO4J_PASSWORD"), 
    database=os.getenv("NEO4J_DATABASE"),
)

In [5]:
kg.schema

'Node properties:\nChunk {textEmbedding: LIST, name: STRING, cusip: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, formId: STRING, chunkId: STRING, cik: STRING, source: STRING}\nForm {formId: STRING, cusip: STRING, cik: STRING, source: STRING, name: STRING}\nRelationship properties:\nSECTION {f10kItem: STRING}\nThe relationships:\n(:Chunk)-[:NEXT]->(:Chunk)\n(:Form)-[:SECTION]->(:Chunk)'

In [38]:
# test purpose, delete existing data to reset
# Only runs the delete when at least one relationship path exists. The pattern
# ()-->() matches connected nodes only, so nodes without any relationship are
# not touched by this cell.
if kg.query("match p=()-->() return count(p) as isExist")[0]['isExist'] > 0:
    kg.query(
        """
        MATCH p=()-->()
        DETACH DELETE p
        """
    )
    # re-count to show how many paths are left after the delete
    print(kg.query("match p=()-->() return count(p) as isExist")[0]['isExist'])

# delete existing indexes
# Needs the APOC plugin. Asserting an empty schema with the last argument set
# to true also removes existing constraints, not just indexes.
kg.query("CALL apoc.schema.assert({},{},true) YIELD label, key RETURN *")
# kg.query("DROP INDEX vector")    # use this cypher in neo4j browser to delete index one by one

[]

**Create a uniqueness constraint to avoid duplicate chunks**

In [39]:
index_name = "unique_chunk"

kg.query(f"""
CREATE CONSTRAINT {index_name} IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")

[]

In [40]:
# check if constraint/index created successfully
# The name is passed as a query parameter rather than formatted into the
# Cypher text, the same way the vector-index check in this notebook does it.
kg.query("""
SHOW INDEXes yield * 
where name CONTAINS $index_name
return name, createStatement
""", params={"index_name": index_name})

[{'name': 'unique_chunk',
  'createStatement': 'CREATE CONSTRAINT `unique_chunk` FOR (n:`Chunk`) REQUIRE (n.`chunkId`) IS UNIQUE'}]

In [41]:
kg.query("SHOW CONSTRAINT")

[{'id': 4,
  'name': 'unique_chunk',
  'type': 'UNIQUENESS',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['chunkId'],
  'ownedIndex': 'unique_chunk',
  'propertyType': None}]

**Create nodes for chunks**

In [42]:
# test purpose, delete existing data to reset
# Count once and reuse the result instead of sending the same query twice.
n_nodes = kg.query("match (n) return count(n) as isExist")[0]['isExist']
if n_nodes > 0:
    print(f"{n_nodes} nodes exist.")
    print("Deleting existing nodes...")
    # DETACH DELETE removes each node together with its relationships
    kg.query(
        """
        MATCH (n)
        DETACH DELETE n
        """
    )
    print("All nodes deleted.")

**Calculate embedding vectors for chunks and populate index**
- The embedding vector of each chunk is already computed in Python when the chunks are built; this query stores it as a property called `textEmbedding` on each `Chunk` node.

In [43]:
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.name = $chunkParam.name,
        mergedChunk.formId = $chunkParam.formId, 
        mergedChunk.cik = $chunkParam.cik, 
        mergedChunk.cusip = $chunkParam.cusip, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.f10kItem = $chunkParam.f10kItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text

    WITH mergedChunk, $chunkParam.textEmbedding as vector
    CALL db.create.setNodeVectorProperty(mergedChunk, "textEmbedding", vector)        
RETURN mergedChunk
"""

In [44]:
# Send one merge query per chunk record; node_count ends up as the number of
# records processed (0 when there are none).
node_count = 0
for node_count, chunk in enumerate(filing_10k_chunks, start=1):
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0000
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0001
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0002
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0003
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0004
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0005
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0006
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0007
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0008
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0009
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b76219875c9-item1-chunk0010
Creating `:Chunk` node for chunk ID 4a73b69083f93d38e05e0b7621987

### 3.2 Create vector index on textEmbedding

In [45]:
dimensions = len(embedded)    # nomic 768, mxbai 1024, openai 1536

kg.query("""
         CREATE VECTOR INDEX $VECTOR_INDEX_NAME IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: $dimensions,
            `vector.similarity_function`: 'cosine'    
         }}
""", params = {
    "VECTOR_INDEX_NAME": VECTOR_INDEX_NAME, 
    "dimensions": dimensions})

[]

In [46]:
kg.query("""
SHOW INDEXES yield * 
where name CONTAINS $VECTOR_INDEX_NAME
return name, createStatement
""", params={"VECTOR_INDEX_NAME": VECTOR_INDEX_NAME, })

[{'name': 'form_10k_chunks',
  'createStatement': "CREATE VECTOR INDEX `form_10k_chunks` FOR (n:`Chunk`) ON (n.`textEmbedding`) OPTIONS {indexConfig: {`vector.dimensions`: 1024,`vector.similarity_function`: 'COSINE'}, indexProvider: 'vector-2.0'}"}]

In [47]:
kg.query("SHOW VECTOR INDEXES")

[{'id': 6,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': None}]

In [6]:
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {textEmbedding: LIST, name: STRING, cusip: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, formId: STRING, chunkId: STRING, cik: STRING, source: STRING}
Form {formId: STRING, cusip: STRING, cik: STRING, source: STRING, name: STRING}
Relationship properties:
SECTION {f10kItem: STRING}
The relationships:
(:Chunk)-[:NEXT]->(:Chunk)
(:Form)-[:SECTION]->(:Chunk)


**Use similarity search to find relevant chunks**
- Set up a helper function to perform similarity search using the vector index

In [7]:
def neo4j_vector_search(question, top_k=10):
  """Search for similar nodes using the Neo4j vector index.

  The question is embedded with the module-level `embeddings` object and the
  vector is handed to the index named by `VECTOR_INDEX_NAME`.

  Args:
    question: text to search for.
    top_k: number of nearest nodes requested from the index. Defaults to 10,
      the value that used to be fixed inside the function.

  Returns:
    The rows returned by `kg.query`; each row exposes `score`, `chunkId`
    and `text`.
  """
  vector_search_query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) yield node, score
    RETURN score, node.chunkId as chunkId, node.text AS text
  """
  similar = kg.query(vector_search_query, 
                     params={ 
                      'index_name': VECTOR_INDEX_NAME, 
                      'top_k': top_k,
                      'question_embedding': embeddings.embed_query(question)
                      })
  return similar

In [50]:
search_results = neo4j_vector_search(
    'In a single sentence, tell me about Nvidia gross margin.'
)

In [51]:
search_results[:2]

[{'score': 0.8009248375892639,
  'chunkId': '4a73b69083f93d38e05e0b76219875c9-item15-chunk0052',
  'text': 'case, which NVIDIA had filed on October 10, 2023. On November 21, 2023, NVIDIA filed a motion with the Ninth Circuit for a stay of the mandate pending NVIDIA&#8217;s petition for a writ of certiorari in the Supreme Court of the United States and the Supreme Court&#8217;s'},
 {'score': 0.7841192483901978,
  'chunkId': '4a73b69083f93d38e05e0b76219875c9-item7-chunk0000',
  'text': 'Item 7. Management\'s Discussion and Analysis of Financial Condition and Results of Operations \n\nThe following discussion and analysis of our financial condition and results of operations should be read in conjunction with &#8220;Item 1A. Risk Factors&#8221;, our Consolidated Financial Statements and related Notes thereto, as well as other cautionary statements and risks described elsewhere in this Annual Report on Form 10-K, before deciding to purchase, hold or sell shares of our common stock. \n\nOver

##### RAG chat with the form

In [10]:
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    url=os.getenv("NEO4J_URL"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
    database=os.getenv("NEO4J_DATABASE"),
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)

In [11]:
retriever = neo4j_vector_store.as_retriever()

* Set up a RetrievalQAWithSourcesChain to carry out question answering

In [12]:
chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatGroq(temperature=0), 
    chain_type="stuff", 
    retriever=retriever
)

In [13]:
def prettychain(question: str) -> None:
    """Ask the QA chain a question and print the answer wrapped at 60 columns.

    Args:
        question: Natural-language question, passed to ``chain`` under the
            ``"question"`` key.

    Returns:
        None. The wrapped answer is printed to stdout, not returned.
    """
    response = chain.invoke(
        {
            # NOTE(review): the chain is built without a custom prompt, so a
            # "system" input is presumably not consumed by the prompt template
            # and this instruction likely never reaches the model -- confirm.
            # Later cells repeat the same instruction inside the question text.
            "system": "If you don't know the answer, just say you don't know.",
            "question": question,
        },
        return_only_outputs=True,
    )
    # Only the 'answer' field is shown; anything else in the response is ignored.
    print(textwrap.fill(response['answer'], 60))

In [14]:
# Simple factual question as a first check of the QA chain.
prettychain(question="Where is Nvidia headquartered?")

Nvidia is headquartered in Santa Clara, California.


In [57]:
# Broader descriptive question about the company.
prettychain(question="What is Nvidia primary business?")

Nvidia's primary business is pioneering accelerated
computing to help solve complex computational problems in
several large and important computationally intensive
fields. They have leveraged their GPU architecture to create
platforms for accelerated computing, AI solutions,
scientific computing, data science, AV, robotics, metaverse
and 3D internet applications. Their two main operating
segments are "Compute & Networking" and "Graphics."


In [58]:
# Numeric question; the answer depends on which chunks the retriever returns.
prettychain(question="Nvidia's gross margin in 2023")

There is no information provided in the sources that allows
me to determine Nvidia's gross margin in 2023.


## 4. Adding relationships to the Form 10-K graph

**Create a Form 10-K node**
- Create a node to represent the entire Form 10-K
- Populate with metadata taken from a single chunk of the form

In [59]:
# Take a single Chunk node (LIMIT 1, no ordering) and project the
# form-level metadata stored on it (name, source, formId, cik, cusip)
# into a map returned as `formInfo`.
cypher = """
  MATCH (anyChunk:Chunk) 
  WITH anyChunk LIMIT 1
  RETURN anyChunk { .name, .source, .formId, .cik, .cusip } as formInfo
"""
form_info_list = kg.query(cypher)

# Display the raw query result (a list of rows).
form_info_list

[{'formInfo': {'cik': '1045810',
   'source': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm',
   'formId': '4a73b69083f93d38e05e0b76219875c9',
   'name': 'NVIDIA CORP',
   'cusip': '67066G104'}}]

In [60]:
# Unwrap the first result row to get the metadata map itself.
form_info = form_info_list[0]['formInfo']
form_info

{'cik': '1045810',
 'source': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581024000029/nvda-20240128.htm',
 'formId': '4a73b69083f93d38e05e0b76219875c9',
 'name': 'NVIDIA CORP',
 'cusip': '67066G104'}

In [61]:
# Create the Form node if it does not exist yet, keyed by formId.
# All four properties are assigned in a single comma-separated
# `ON CREATE SET`. In Cypher every `SET` keyword starts a new clause, so
# with one `SET` per property only the first belongs to `ON CREATE` and
# the rest run on every execution, including when the node already exists.
cypher = """
    MERGE (f:Form {formId: $formInfoParam.formId })
      ON CREATE SET
        f.name = $formInfoParam.name,
        f.source = $formInfoParam.source,
        f.cik = $formInfoParam.cik,
        f.cusip = $formInfoParam.cusip
"""

# The query has no RETURN clause, so the displayed result is an empty list.
kg.query(cypher, params={'formInfoParam': form_info})

[]

In [62]:
# Count the Form nodes currently in the graph.
kg.query("MATCH (f:Form) RETURN count(f) as formCount")

[{'formCount': 1}]

**Create a linked list of Chunk nodes for each section (e.g. item1, item1A)**
- Start by identifying chunks from the same section

In [63]:
# List every Chunk whose formId matches the form, projecting only the
# identifying fields (formId, f10kItem, chunkId, chunkSeqId).
cypher = """
  MATCH (from_same_form:Chunk)
    WHERE from_same_form.formId = $formIdParam
  RETURN from_same_form {.formId, .f10kItem, .chunkId, .chunkSeqId } as chunkInfo
"""

same_form_params = {'formIdParam': form_info['formId']}
kg.query(cypher, params=same_form_params)

[{'chunkInfo': {'formId': '4a73b69083f93d38e05e0b76219875c9',
   'f10kItem': 'item1',
   'chunkId': '4a73b69083f93d38e05e0b76219875c9-item1-chunk0001',
   'chunkSeqId': 1}},
 {'chunkInfo': {'formId': '4a73b69083f93d38e05e0b76219875c9',
   'f10kItem': 'item1',
   'chunkId': '4a73b69083f93d38e05e0b76219875c9-item1-chunk0002',
   'chunkSeqId': 2}},
 {'chunkInfo': {'formId': '4a73b69083f93d38e05e0b76219875c9',
   'f10kItem': 'item1',
   'chunkId': '4a73b69083f93d38e05e0b76219875c9-item1-chunk0003',
   'chunkSeqId': 3}},
 {'chunkInfo': {'formId': '4a73b69083f93d38e05e0b76219875c9',
   'f10kItem': 'item1',
   'chunkId': '4a73b69083f93d38e05e0b76219875c9-item1-chunk0004',
   'chunkSeqId': 4}},
 {'chunkInfo': {'formId': '4a73b69083f93d38e05e0b76219875c9',
   'f10kItem': 'item1',
   'chunkId': '4a73b69083f93d38e05e0b76219875c9-item1-chunk0005',
   'chunkSeqId': 5}},
 {'chunkInfo': {'formId': '4a73b69083f93d38e05e0b76219875c9',
   'f10kItem': 'item1',
   'chunkId': '4a73b69083f93d38e05e0b7621987

**Add a NEXT relationship between subsequent chunks within each section**
- Use the apoc.nodes.link procedure from Neo4j's APOC library to link an ordered list of Chunk nodes with NEXT relationships
- Loop through all sections

In [64]:
# For one section of one form: collect its chunks ordered by chunkSeqId
# and connect consecutive chunks with NEXT via apoc.nodes.link
# (avoidDuplicates is set to true in the call).
cypher = """
  MATCH (from_same_section:Chunk)
  WHERE from_same_section.formId = $formIdParam
    AND from_same_section.f10kItem = $f10kItemParam
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )  
  RETURN size(section_chunk_list)
"""

# Run the linking query once per section name; the returned sizes are discarded.
for section in extract_section_list:
    link_params = {
        'formIdParam': form_info['formId'],
        'f10kItemParam': section,
    }
    kg.query(cypher, params=link_params)

**Create a SECTION relationship from the Form node to the first chunk of each section**

In [65]:
# Pair each Form with the chunks that share its formId and have
# chunkSeqId = 0, then MERGE a SECTION relationship from the Form to that
# chunk. The relationship carries the chunk's f10kItem as a property.
# Returns the number of relationships matched or created.
cypher = """
  MATCH (first:Chunk), (f:Form)
  WHERE first.formId = f.formId
    AND first.chunkSeqId = 0
  WITH first, f
    MERGE (f)-[r:SECTION {f10kItem: first.f10kItem}]->(first)
  RETURN count(r)
"""

kg.query(cypher)

[{'count(r)': 5}]

In [66]:
# Re-read the schema so the new Form node and NEXT / SECTION relationships show up.
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {textEmbedding: LIST, name: STRING, cusip: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, formId: STRING, chunkId: STRING, cik: STRING, source: STRING}
Form {formId: STRING, cusip: STRING, cik: STRING, source: STRING, name: STRING}
Relationship properties:
SECTION {f10kItem: STRING}
The relationships:
(:Chunk)-[:NEXT]->(:Chunk)
(:Form)-[:SECTION]->(:Chunk)


**Check that the relationships work**
- Return the first chunk of the Item 7 section

In [67]:
# Follow the Form's SECTION relationship for a given item to reach the
# first chunk of that section; return its id and text.
cypher = """
  MATCH (f:Form)-[r:SECTION]->(first:Chunk)
    WHERE f.formId = $formIdParam
        AND r.f10kItem = $f10kItemParam
  RETURN first.chunkId as chunkId, first.text as text
"""

# Look up the Item 7 section of the form.
section_lookup_params = {
    'formIdParam': form_info['formId'],
    'f10kItemParam': 'item7',
}
section_rows = kg.query(cypher, params=section_lookup_params)
first_chunk_info = section_rows[0]

first_chunk_info

{'chunkId': '4a73b69083f93d38e05e0b76219875c9-item7-chunk0000',
 'text': 'Item 7. Management\'s Discussion and Analysis of Financial Condition and Results of Operations \n\nThe following discussion and analysis of our financial condition and results of operations should be read in conjunction with &#8220;Item 1A. Risk Factors&#8221;, our Consolidated Financial Statements and related Notes thereto, as well as other cautionary statements and risks described elsewhere in this Annual Report on Form 10-K, before deciding to purchase, hold or sell shares of our common stock. \n\nOverview \n\nOur Company and Our Businesses \n\nNVIDIA pioneered accelerated computing to help solve the most challenging computational problems. Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields. NVIDIA has leveraged its GPU architecture to create platforms for accelerated computing, AI solutions, scientific computing, data science, AV, ro

In [68]:
# Starting from a chunk id, hop one NEXT relationship to the following chunk.
cypher = """
  MATCH (first:Chunk)-[:NEXT]->(nextChunk:Chunk)
    WHERE first.chunkId = $chunkIdParam
  RETURN nextChunk.chunkId as chunkId, nextChunk.text as text
"""

next_lookup_params = {'chunkIdParam': first_chunk_info['chunkId']}
next_rows = kg.query(cypher, params=next_lookup_params)
next_chunk_info = next_rows[0]

next_chunk_info

{'chunkId': '4a73b69083f93d38e05e0b76219875c9-item7-chunk0001',
 'text': 'Recent Developments, Future Objectives and Challenges \n\nDemand and Supply, Product Transitions, and New Products and Business Models \n\nDemand for our data center systems and products surged in fiscal year 2024. Entering fiscal year 2025, we are gathering customer demand indications across several product transitions. We have demand visibility for our new data center products ramping later in fiscal year 2025. We have increased our supply and capacity purchases with existing suppliers, added new vendors and entered into prepaid manufacturing and capacity agreements. These increased purchase volumes, the number of suppliers, and the integration of new vendors into our supply chain may create more complexity and execution risk. Our purchase commitments and obligations for inventory and manufacturing capacity at the end of fiscal year 2024 were impacted by shortening lead times for certain components. We may cont

In [69]:
# Show the two consecutive chunk ids, one per line.
print(first_chunk_info['chunkId'], next_chunk_info['chunkId'], sep='\n')

4a73b69083f93d38e05e0b76219875c9-item7-chunk0000
4a73b69083f93d38e05e0b76219875c9-item7-chunk0001


**Return a window of three chunks**

In [70]:
# Match the chunk immediately before and immediately after the chunk
# with the given id (c2 sits in the middle of the pattern).
cypher = """
    MATCH (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c2.chunkId = $chunkIdParam
    RETURN c1.chunkId, c2.chunkId, c3.chunkId
    """

window_params = {'chunkIdParam': next_chunk_info['chunkId']}
kg.query(cypher, params=window_params)

[{'c1.chunkId': '4a73b69083f93d38e05e0b76219875c9-item7-chunk0000',
  'c2.chunkId': '4a73b69083f93d38e05e0b76219875c9-item7-chunk0001',
  'c3.chunkId': '4a73b69083f93d38e05e0b76219875c9-item7-chunk0002'}]

**windowPathLength**
- number of relationships in the path

In [71]:
# Bind the whole three-chunk pattern to a path variable and report its
# length; here the given chunk id anchors c1, the start of the path.
cypher = """
    MATCH window = (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c1.chunkId = $chunkIdParam
    RETURN length(window) as windowPathLength
    """

window_params = {'chunkIdParam': next_chunk_info['chunkId']}
kg.query(cypher, params=window_params)

[{'windowPathLength': 2}]

**Finding variable length windows**

In [72]:
# Make both NEXT hops variable-length (0..1 before the anchor chunk,
# 0..2 after it) so shorter windows also match; one row is returned per
# matching path.

cypher = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(c:Chunk)-[:NEXT*0..2]->(:Chunk) 
    WHERE c.chunkId = $chunkIdParam
  RETURN length(window)
  """

window_params = {'chunkIdParam': next_chunk_info['chunkId']}
kg.query(cypher, params=window_params)

[{'length(window)': 0},
 {'length(window)': 1},
 {'length(window)': 2},
 {'length(window)': 1},
 {'length(window)': 2},
 {'length(window)': 3}]

In [73]:
# Keep only the longest matching window: sort the candidate paths by
# length (descending) and take the first one.

cypher = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(c:Chunk)-[:NEXT*0..1]->(:Chunk)
    WHERE c.chunkId = $chunkIdParam
  WITH window as longestChunkWindow 
      ORDER BY length(window) DESC LIMIT 1
  RETURN length(longestChunkWindow)
  """

window_params = {'chunkIdParam': next_chunk_info['chunkId']}
kg.query(cypher, params=window_params)

[{'length(longestChunkWindow)': 2}]

In [74]:
# The question text (including its embedded newline and indentation) is
# passed to the chain exactly as written.
prettychain(
    question="""What risk factors will impact Nvidia and why?
            If you don't know, just say you don't know."""
)

Nvidia's risk factors include demand and supply challenges,
product transitions, and new products and business models.
These risks could impact the company's financial condition
and results of operations. Additionally, the termination of
the Arm Share Purchase Agreement resulted in a termination
cost of $1.4 billion in fiscal year 2023. Nvidia operates in
two segments: Compute & Networking and Graphics, addressing
four large markets where its computing platforms can provide
significant acceleration for applications.


In [75]:
# The question text (including its embedded newline and indentation) is
# passed to the chain exactly as written.
prettychain(
    question="""How did Nvidia do in terms of its financial performance in 2023?
            If you don't know, just say you don't know."""
)

I don't have the specific information about Nvidia's
financial performance in 2023. The provided content includes
financial statements and reports up to the year 2024, but it
does not include the financial performance details for the
year 2023.


In [19]:
# Ask about two names that refer to the filer, using the full company name.
prettychain(question="How are Nvidia Corporation and Registrant related")

Nvidia Corporation and Registrant are the same entity. The
term "Registrant" is used to refer to Nvidia Corporation in
the provided document, which is a filing made by Nvidia
Corporation with the U.S. Securities and Exchange Commission
(SEC).


In [20]:
# Same question with the short company name, to compare the answers.
prettychain(question="How are Nvidia and Registrant related")

Based on the provided content, NVIDIA Corporation is a
company that operates in the technology sector, specializing
in markets where its computing platforms can provide
significant acceleration for applications. The company
operates in four large markets: Data Center, Gaming,
Professional Visualization, and Automotive. There is no
direct mention of a "Registrant" in the provided content,
and the relationship between NVIDIA and a "Registrant" is
not clear from the context.


In [17]:
# Ask about a relationship with another company (Xiaomi).
prettychain(question="How are Nvidia and Xiaomi related")

There is no direct relationship between Nvidia and Xiaomi
mentioned in the provided sources.
