In [10]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import os

## data Loading

In [11]:
def data_loading(filetype, folder = "../data/pdf"):
    if(filetype == "pdf"):
        loader = PyPDFLoader(folder)
        
    docs = loader.load()
    print(f"This Docs has {len(docs)} pages")
    
    return docs

## Chuck Spliting

In [12]:
def Chuck_Spliting(docs):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    print(f"split document into {len(splits)} part")
    print(splits)
    return splits

## Embedding & retriever

In [13]:
from embedded_utils import get_embedding_model
embedding = get_embedding_model()

INFO: Initializing HuggingFaceEmbeddings model: all-MiniLM-L6-v2


In [None]:
def q_fin_dict():
    if not os.path.exists("../vectorDB/q-fin.topic"):
        os.makedirs("../vectorDB/q-fin.topic")
        print(f"Created directory: q-fin.topic")
    
    q_fin_dict = {
        'Risk Management': 'q-fin.RM',
        'Computational Finance': 'q-fin.CP',
        'Statistical Finance': 'q-fin.ST',
        'Trading and Market Microstructure': 'q-fin.TR',
        'Economics': 'q-fin.EC',
        'General Finance': 'q-fin.GN',
        'Mathematical Finance': 'q-fin.MF',
        'Portfolio Management': 'q-fin.PM',
        'Pricing of Securities': 'q-fin.PR'
    }
    key_list = [key for key in q_fin_dict]
    vector_store = Chroma.from_texts(
        texts=key_list,
        embedding=embedding,
        persist_directory="../vectorDB/q-fin.topic"
    )
    

Created directory: q_fin_topic


In [14]:
dir = '../data'
print(f"Recursively listing all items in: {os.path.abspath(dir)}\n")

for dirpath, dirnames, filenames in os.walk(dir):
    if(len(dirnames) != 0): continue
    docname = dirpath.split('/')[-1]
    # print(f"docname : ", docname)
    des_doc = "../vectorDB" + dirpath[7:]

    for name in filenames:
        des_der = f"{des_doc}/{name}"
        
        if not os.path.exists(des_der):
            os.makedirs(des_der)
        
        folder = os.path.join(dirpath,name)
        split = name.split(".")
        filetype = split[1]
        filename = split[0]
        docs = data_loading(folder = folder, filetype=filetype)
        splits = Chuck_Spliting(docs)
        texts_from_splits = [doc.page_content for doc in splits]
        metadatas_from_splits = [doc.metadata for doc in splits]
        
        # vector transform
        print(des_der)
        vector_store = Chroma.from_texts(
            texts=texts_from_splits,
            embedding=embedding,
            metadatas=metadatas_from_splits,
            persist_directory=des_der
        )
    print("-" * 20)


Recursively listing all items in: /Users/faiisu/Documents/Project/RAGdevelop/data

This Docs has 22 pages
split document into 57 part
[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:)', 'creationdate': '', 'author': 'Eckhard Platen', 'doi': 'https://doi.org/10.48550/arXiv.2506.16264', 'license': 'http://creativecommons.org/licenses/by/4.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'title': 'Pricing under the Benchmark Approach', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2506.16264v1', 'source': '../data/q-fin.GN/PricingundertheBenchmarkApproach.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}, page_content='arXiv:2506.16264v1  [q-fin.MF]  19 Jun 2025\nPricing under the Benchmark Approach\nEckhard Platen1,\nJune 23, 2025\nThe paper summarizes key results of the benchmark approach with a\nfocus on the concept of benchmark-neutral pricing. It applies these re-\nsults

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 93 0 (offset 0)
Ignoring wrong pointing object 95 0 (offset 0)
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 99 0 (offset 0)
Ignoring wrong pointing object 101 0 (offset 0)
Ignoring wrong pointing object 176 0 (offset 0)
Ignoring wrong pointing object 178 0 (offset 0)
Ignoring wrong pointing object 180 0 (offset 0)
Ignoring wrong pointing object 185 0 (offset 0)
Ignoring w

This Docs has 64 pages
split document into 238 part
[Document(metadata={'producer': 'macOS Version 15.3.2 (Build 24D81) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20250403212453Z00'00'", 'moddate': "D:20250403212453Z00'00'", 'source': '../data/q-fin.EC/ArtificialIntelligence,LeanStartupMethod,andProductInnovations.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1'}, page_content='1 \nLynn Wu The Wharton School, University of Pennsylvania Xiaoning Wang Jindal School of Management, University of Texas at Dallas'), Document(metadata={'producer': 'macOS Version 15.3.2 (Build 24D81) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20250403212453Z00'00'", 'moddate': "D:20250403212453Z00'00'", 'source': '../data/q-fin.EC/ArtificialIntelligence,LeanStartupMethod,andProductInnovations.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1'}, page_content='Artificial Intelligence, Lean Startup Method, and Product Innovations    Abstract: Although AI has the potential t

KeyboardInterrupt: 