#### 0. Setup

In [15]:
import pandas as pd
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

import config 

OPENAI_APIKEY = config.OPENAI_API_KEY
embeddings_model = OpenAIEmbeddings(api_key=OPENAI_APIKEY, model='text-embedding-3-large', max_retries=100, chunk_size=16, show_progress_bar=False)


#### 1. Extract information from Text

In [16]:
file_path = (
    "data/2312.10997.pdf"
)
loader = PyPDFLoader(file_path)
docs = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=64)
text_splits = text_splitter.split_documents(docs)

#### 2. Extract information from Image

In [24]:
##extrac image from pdf
# from util.extract_image_pdf import extract
# extract("data/2312.10997.pdf", "data/image")

##generate image description
# import pandas as pd
# from util.get_image_description import get_description
# images = os.listdir("data/image")
# images_filepath = [os.path.join("data/image", image) for image in images]

# df = pd.DataFrame(columns=["image", "description"])
# df["source"] = images_filepath
# df["description"] = df["image"].apply(get_description)
# df.to_csv("data/image_description.csv", index=False)

##load image description
from langchain_community.document_loaders import DataFrameLoader
df = pd.read_csv("data/image_description.csv")
records = DataFrameLoader(df, 'description','pandas')

text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=64)
image_splits = text_splitter.split_documents(records.load())

#### 3. Upsert

In [33]:
print(len(text_splits))
print(len(image_splits))
print(len(text_splits+image_splits))


726
77
803


In [35]:
vectorstore = Chroma.from_documents(documents=text_splits+image_splits, embedding=embeddings_model, persist_directory="rag_db")