In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import os
import torch

if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

CUDA device: NVIDIA GeForce RTX 4050 Laptop GPU


## data Loading

In [2]:
def data_loading(filetype, folder = "../data/pdf"):
    if(filetype == "pdf"):
        loader = PyPDFLoader(folder)
        
    docs = loader.load()
    
    return docs

## Chuck Spliting

In [3]:
def Chuck_Spliting(docs):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    # print(f"split document into {len(splits)} part")
    # print(splits)
    return splits

## Embedding & retriever

In [4]:
from embedded_utils import get_embedding_model
embedding = get_embedding_model()

INFO: Loading embedding model utility...
INFO: Initializing HuggingFaceEmbeddings model: all-MiniLM-L6-v2


  embeddings = HuggingFaceEmbeddings(


In [5]:
def q_fin_dict():
    if not os.path.exists("../vectorDB/q-fin.topic"):
        os.makedirs("../vectorDB/q-fin.topic")
        # print(f"Created directory: q-fin.topic")
    
    q_fin_dict = {
        'Risk Management': 'q-fin.RM',
        'Computational Finance': 'q-fin.CP',
        'Statistical Finance': 'q-fin.ST',
        'Trading and Market Microstructure': 'q-fin.TR',
        'Economics': 'q-fin.EC',
        'General Finance': 'q-fin.GN',
        'Mathematical Finance': 'q-fin.MF',
        'Portfolio Management': 'q-fin.PM',
        'Pricing of Securities': 'q-fin.PR'
    }
    key_list = [key for key in q_fin_dict]
    vector_store = Chroma.from_texts(
        texts=key_list,
        embedding=embedding,
        persist_directory="../vectorDB/q-fin.topic"
    )
q_fin_dict()

In [6]:
dir = '../data'
# print(f"Recursively listing all items in: {os.path.abspath(dir)}\n")

for dirpath, dirnames, filenames in os.walk(dir):
    if(len(dirnames) != 0): continue
    docname = dirpath.split('/')[-1]
    # print(f"docname : ", docname)
    des_doc = "../vectorDB" + dirpath[7:]

    for name in filenames:
        des_der = f"{des_doc}/{name}"

        if not os.path.exists(des_der):
            os.makedirs(des_der)
        
        try:
            folder = os.path.join(dirpath,name)
            split = name.split(".")
            print(split)
            filetype = split[1]
            filename = split[0]
            docs = data_loading(folder = folder, filetype=filetype)
            splits = Chuck_Spliting(docs)
            texts_from_splits = [doc.page_content for doc in splits]
            metadatas_from_splits = [doc.metadata for doc in splits]
            
            # vector transform
            print(des_der)
            vector_store = Chroma.from_texts(
                texts=texts_from_splits,
                embedding=embedding,
                metadatas=metadatas_from_splits,
                persist_directory=des_der
            )
        except:
            print(f"skip {name}")
    print("-" * 20)


['PrinciplesofFinance-WEB', 'pdf']
../vectorDB\base/PrinciplesofFinance-WEB.pdf
--------------------
['Americanoptionsvaluationintime-dependentjump-diffusionmodelsviaintegralequationsandcharacteristicfunctions', 'pdf']
../vectorDB\q-fin.CP/Americanoptionsvaluationintime-dependentjump-diffusionmodelsviaintegralequationsandcharacteristicfunctions.pdf
['EmpiricalModelsoftheTimeEvolutionofSPXOptionPrices', 'pdf']
../vectorDB\q-fin.CP/EmpiricalModelsoftheTimeEvolutionofSPXOptionPrices.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 93 0 (offset 0)
Ignoring wrong pointing object 95 0 (offset 0)
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 99 0 (offset 0)
Ignoring wrong pointing object 101 0 (offset 0)
Ignoring wrong pointing object 176 0 (offset 0)
Ignoring wrong pointing object 178 0 (offset 0)
Ignoring wrong pointing object 180 0 (offset 0)
Ignoring wrong pointing object 185 0 (offset 0)
Ignoring w

--------------------
['AIistheStrategy']
skip AIistheStrategy
['AnAI-poweredToolforCentralBankBusinessLiaisons']
skip AnAI-poweredToolforCentralBankBusinessLiaisons
['Anewequilibrium']
skip Anewequilibrium
['ArtificialIntelligence,LeanStartupMethod,andProductInnovations', 'pdf']
../vectorDB\q-fin.EC/ArtificialIntelligence,LeanStartupMethod,andProductInnovations.pdf
['Identifyingeconomicnarrativesinlargetextcorpora--AnintegratedapproachusingLargeLanguageModels', 'pdf']
../vectorDB\q-fin.EC/Identifyingeconomicnarrativesinlargetextcorpora--AnintegratedapproachusingLargeLanguageModels.pdf
['OptimalRegulationandInvestmentIncentivesinFinancialNetworks', 'pdf']
../vectorDB\q-fin.EC/OptimalRegulationandInvestmentIncentivesinFinancialNetworks.pdf
['SocialGroupBiasinAIFinance', 'pdf']
../vectorDB\q-fin.EC/SocialGroupBiasinAIFinance.pdf
['SocialMediaCanReduceMisinformationWhenPublicScrutinyisHigh', 'pdf']
../vectorDB\q-fin.EC/SocialMediaCanReduceMisinformationWhenPublicScrutinyisHigh.pdf
['TheEco