In [67]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

## Data Loading

In [None]:
def data_loading(filename, filetype):
    """Load a document from the local data directory.

    Args:
        filename: File name without its extension; resolved under
            ``../data/pdf/``.
        filetype: File extension. Only ``"pdf"`` is supported.

    Returns:
        The list produced by ``PyPDFLoader.load()`` (its length is reported
        as the page count).

    Raises:
        ValueError: If ``filetype`` is anything other than ``"pdf"``.
    """
    if filetype != "pdf":
        # Without this guard `loader` was never assigned and the function
        # failed later with an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported filetype {filetype!r}; only 'pdf' is supported"
        )

    loader = PyPDFLoader(f"../data/pdf/{filename}.{filetype}")
    docs = loader.load()
    print(f"This Docs has {len(docs)} pages")

    return docs

## Chunk Splitting

In [69]:
def Chuck_Spliting(docs, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        docs: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(docs)
    # Report only the count; printing every chunk floods the cell output.
    print(f"split document into {len(splits)} part")
    return splits

## Embedding & retriever

In [70]:
def Embedding_Model(
    model_name = "all-MiniLM-L6-v2",
    model_kwargs = {"device": 'cpu'},
    encode_kwargs = {'normalize_embeddings': False}
):
    """Build a ``HuggingFaceEmbeddings`` instance.

    Args:
        model_name: Forwarded as ``model_name``.
        model_kwargs: Forwarded as ``model_kwargs``; the default requests
            the ``cpu`` device.
        encode_kwargs: Forwarded as ``encode_kwargs``; the default sets
            ``normalize_embeddings`` to ``False``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    # All three arguments are passed through unchanged.
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

In [71]:
# Source document: resolved by data_loading as ../data/pdf/<filename>.<filetype>
filename, filetype = "test01", "pdf"

# Load the file, then cut it into chunks.
docs = data_loading(filename, filetype)
splits = Chuck_Spliting(docs)

# Separate each chunk's text from its metadata for the vector store.
texts_from_splits = [chunk.page_content for chunk in splits]
metadatas_from_splits = [chunk.metadata for chunk in splits]

# Embedding model built with the defaults of Embedding_Model.
embedding = Embedding_Model()

This Docs has 28 pages
split document into 68 part


## Vector transform

In [72]:
# Build a Chroma store from the chunk texts and their metadata, using
# ../vectorDB/<filename> as the persist directory.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again -- confirm before relying on repeated runs.
vector_store = Chroma.from_texts(
    texts_from_splits,
    embedding,
    metadatas=metadatas_from_splits,
    persist_directory=f"../vectorDB/{filename}",
)

In [73]:
# Read the stored entries back, requesting embeddings, documents and metadata.
retrieved_data = vector_store.get(
    include=["embeddings", "documents", "metadatas"],
)