# Outline for things to do

## 1. Loading documents for each dataset into Pinecone
- Use dataset name as metadata -> Add 1 column besides the query so that we can search by metadata -> Faster and more accurate

## 2. Create a dataframe of query:
- Columns: query_id, query, dataset_name

## 3. Things to test:

### 3.1 Simple RAG submission:
- Use raw vectore store to retrieve the top-10 most relevant documents to the query

### 3.2 Vector Search + RAG Fusion

### 3.3 Hybrid search + Reranking



In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from financerag.task import *
from financerag.retrieval import DenseRetrieval

In [2]:
load_dotenv(".env")
GG_API_KEY = os.environ.get('GOOGLE_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
LANGSMITH_API_KEY = os.environ.get('LANGSMITH_API_KEY')

In [11]:
# Set up vector database
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
embeddings = GoogleGenerativeAIEmbeddings(model = "models/text-embedding-004")
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [5]:
# Get dataset name first
import os
dataset_names = []
for f in os.listdir('finance_dataset'):
    if f.endswith("tsv"):
       dataset_names.append(f.split('_')[0])
dataset_names  

['MultiHeirtt',
 'FinQA',
 'FinanceBench',
 'ConvFinQA',
 'FinQABench',
 'TATQA',
 'FinDER']

In [6]:
for dataset_name in dataset_names:
    task_variable = f"{dataset_name.lower()}_task"
    script_string = f"""
    # {dataset_name} Task
    {task_variable} = {dataset_name}Task()
    {task_variable}.load()
    {task_variable}_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = {task_variable}.metadata.dataset_name)
    {task_variable}_retriever.load_corpus_for_searching({task_variable}.corpus, saved_index = True)
    {task_variable}.retrieve(retriever = {task_variable}_retriever)
    {task_variable}.save_retrieved_results()
    """
    print(script_string)


    # MultiHeirtt Task
    multiheirtt_task = MultiHeirttTask()
    multiheirtt_task.load()
    multiheirtt_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = multiheirtt_task.metadata.dataset_name)
    multiheirtt_task_retriever.load_corpus_for_searching(multiheirtt_task.corpus, saved_index = True)
    multiheirtt_task.retrieve(retriever = multiheirtt_task_retriever)
    multiheirtt_task.save_retrieved_results()
    

    # FinQA Task
    finqa_task = FinQATask()
    finqa_task.load()
    finqa_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = finqa_task.metadata.dataset_name)
    finqa_task_retriever.load_corpus_for_searching(finqa_task.corpus, saved_index = True)
    finqa_task.retrieve(retriever = finqa_task_retriever)
    finqa_task.save_retrieved_results()
    

    # FinanceBench Task
    financebench_task = FinanceBenchTask()
    financebench_task.load()
    financebench_task_retriever = DenseRetrieval(vector_store = vector_st

In [9]:
# MultiHeirtt Task
multiheirtt_task = MultiHeirttTask()
multiheirtt_task.load()
multiheirtt_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = multiheirtt_task.metadata.dataset_name)
multiheirtt_task_retriever.load_corpus_for_searching(multiheirtt_task.corpus, saved_index = True)
multiheirtt_task.retrieve(retriever = multiheirtt_task_retriever)
multiheirtt_task.save_retrieved_results()


Loading document:: 100%|██████████| 10475/10475 [00:00<00:00, 42579.02it/s]


Successfully saved index of vector store to path : faiss_index/multiheirtt_index
Saved result successfully to ./financerag_result/multiheirtt_result.csv!


In [12]:
# FinQA Task
finqa_task = FinQATask()
finqa_task.load()
finqa_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = finqa_task.metadata.dataset_name)
finqa_task_retriever.load_corpus_for_searching(finqa_task.corpus, saved_index = True)
finqa_task.retrieve(retriever = finqa_task_retriever)
finqa_task.save_retrieved_results()


Loading document: 100%|██████████| 2789/2789 [00:00<00:00, 151775.10it/s]


Successfully saved index of vector store to path : faiss_index/finqa_index


Retrieving result:: 100%|██████████| 1147/1147 [07:21<00:00,  2.60it/s]

Saved result successfully to ./financerag_result/finqa_result.csv!





In [13]:

# FinanceBench Task
financebench_task = FinanceBenchTask()
financebench_task.load()
financebench_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = financebench_task.metadata.dataset_name)
financebench_task_retriever.load_corpus_for_searching(financebench_task.corpus, saved_index = True)
financebench_task.retrieve(retriever = financebench_task_retriever)
financebench_task.save_retrieved_results()




Loading document: 100%|██████████| 180/180 [00:00<00:00, 131942.45it/s]


Successfully saved index of vector store to path : faiss_index/financebench_index


Retrieving result:: 100%|██████████| 150/150 [01:02<00:00,  2.42it/s]

Saved result successfully to ./financerag_result/financebench_result.csv!





In [14]:
# ConvFinQA Task
convfinqa_task = ConvFinQATask()
convfinqa_task.load()
convfinqa_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = convfinqa_task.metadata.dataset_name)
convfinqa_task_retriever.load_corpus_for_searching(convfinqa_task.corpus, saved_index = True)
convfinqa_task.retrieve(retriever = convfinqa_task_retriever)
convfinqa_task.save_retrieved_results()


Loading document: 100%|██████████| 2066/2066 [00:00<00:00, 129481.68it/s]


Successfully saved index of vector store to path : faiss_index/convfinqa_index


Retrieving result:: 100%|██████████| 421/421 [02:37<00:00,  2.67it/s]

Saved result successfully to ./financerag_result/convfinqa_result.csv!





In [15]:

# FinQABench Task
finqabench_task = FinQABenchTask()
finqabench_task.load()
finqabench_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = finqabench_task.metadata.dataset_name)
finqabench_task_retriever.load_corpus_for_searching(finqabench_task.corpus, saved_index = True)
finqabench_task.retrieve(retriever = finqabench_task_retriever)
finqabench_task.save_retrieved_results()


Loading document: 100%|██████████| 92/92 [00:00<00:00, 89592.75it/s]


Successfully saved index of vector store to path : faiss_index/finqabench_index


Retrieving result:: 100%|██████████| 100/100 [00:36<00:00,  2.75it/s]

Saved result successfully to ./financerag_result/finqabench_result.csv!





In [16]:

# TATQA Task
tatqa_task = TATQATask()
tatqa_task.load()
tatqa_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = tatqa_task.metadata.dataset_name)
tatqa_task_retriever.load_corpus_for_searching(tatqa_task.corpus, saved_index = True)
tatqa_task.retrieve(retriever = tatqa_task_retriever)
tatqa_task.save_retrieved_results() 


Loading document: 100%|██████████| 2756/2756 [00:00<00:00, 182551.12it/s]


Successfully saved index of vector store to path : faiss_index/tatqa_index


Retrieving result:: 100%|██████████| 1663/1663 [15:14<00:00,  1.82it/s] 

Saved result successfully to ./financerag_result/tatqa_result.csv!





In [17]:

# FinDER Task
finder_task = FinDERTask()
finder_task.load()
finder_task_retriever = DenseRetrieval(vector_store = vector_store, dataset_name = finder_task.metadata.dataset_name)
finder_task_retriever.load_corpus_for_searching(finder_task.corpus, saved_index = True)
finder_task.retrieve(retriever = finder_task_retriever)
finder_task.save_retrieved_results()

Loading document: 100%|██████████| 13862/13862 [00:00<00:00, 20056.98it/s]


Successfully saved index of vector store to path : faiss_index/finder_index


Retrieving result:: 100%|██████████| 216/216 [01:57<00:00,  1.84it/s]

Saved result successfully to ./financerag_result/finder_result.csv!





In [None]:
1: Recusplitter(chunk_size = 1000, overlap_size = 20)
2 :SemanticSplitter()Hello how are you

1: hello,how , are you
2. hello | how are you

In [19]:
final_result = pd.DataFrame(columns = ['query_id', 'corpus_id'])

for dataset_name in dataset_names:
    df = pd.read_csv(f"financerag_result/{dataset_name.lower()}_result.csv")
    final_result = pd.concat([final_result, df], axis = 0)

final_result

Unnamed: 0,query_id,corpus_id
0,q82d4c6ec,d8e404704
1,q82d4c6ec,d87914156
2,q82d4c6ec,d8a3219a8
3,q82d4c6ec,d8c22a07a
4,q82d4c6ec,d88e1c5b2
...,...,...
2139,q00218,GOOGL20231203
2140,q00218,JPM20234729
2141,q00218,V20231584
2142,q00218,JPM20237319


In [20]:
pd.read_csv('finance_dataset/sample_submission_.csv')

Unnamed: 0,query_id,corpus_id
0,qd496c6a0,dd4b92b32
1,qd496c6a0,dd4ba2a5a
2,qd496c6a0,dd4be1f98
3,qd496c6a0,dd4ba07d2
4,qd496c6a0,dd4ba02f0
...,...,...
46681,q1a741e68,d1b3afaba
46682,q1a741e68,d1b34e18e
46683,q1a741e68,d1b36065e
46684,q1a741e68,d1b33d05a


In [21]:
final_result.to_csv('submission.csv', index = False)

In [22]:
final_result.drop_duplicates(subset = ['query_id'])

Unnamed: 0,query_id,corpus_id
0,q82d4c6ec,d8e404704
10,q855a35a0,d89a6ea36
20,q85384530,d8ce7fc30
30,q842c8af2,d88465f0a
40,q85451756,d8646ec7e
...,...,...
2094,q00214,BRK.A20230009
2104,q00215,BRK.A20230404
2114,q00216,BRK.A20232401
2124,q00217,BRK.A20230062


In [28]:
final_queries = pd.DataFrame(columns = ['_id', 'title', 'text'])

for dataset_name in dataset_names:
    df = pd.read_json(f"finance_dataset/{dataset_name.lower()}_queries.jsonl/queries.jsonl", lines = True)
    final_queries = pd.concat([final_queries, df], axis = 0)

final_queries

Unnamed: 0,_id,title,text
0,q82d4c6ec,,What was the sum of Fourth Quarter without tho...
1,q855a35a0,,In which section is Interest income smaller th...
2,q85384530,,If Total Forward Hedged Revenues develops with...
3,q842c8af2,,what was the ratio of the purchase in december...
4,q85451756,,what is the highest total amount of segment in...
...,...,...,...
211,q00214,,How many distinct insurance underwriting group...
212,q00215,,What is the ticker symbol for Berkshire Hathaw...
213,q00216,,What is the largest operating segment of the B...
214,q00217,,Source of invested assets of insurance busines...
