In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from pydantic import BaseModel


import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv

In [4]:
# Load variables from a local .env file into the process environment so the
# OpenAI credentials read in the following cells can be found.
load_dotenv()

True

In [5]:
# API key for OpenAI; None when the variable is not set in the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [6]:
# Chat model used by the notebook. No key is passed explicitly here: the client
# is expected to pick up OPENAI_API_KEY from the environment.
llm = ChatOpenAI(model = "gpt-4o-mini")

In [9]:
# Read the sample PDF; `pages` is the list of documents returned by the loader
# and is what the text splitter consumes in the next step.
# NOTE(review): the path is relative to the notebook's working directory.
loader = PyPDFLoader("../data/Sample.pdf")
pages = loader.load()

Ignoring wrong pointing object 7 0 (offset 0)


In [17]:
# Chunk-size trade-off: chunks that are too large tend to carry redundant
# context, while chunks that are too small may not give the LLM enough
# information to produce a useful answer.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_overlap=200,
    chunk_size=1500,
)
chunks = text_splitter.split_documents(pages)

In [18]:
def get_embedding_function():
    """Build the OpenAI embedding client used by this notebook.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        ``text-embedding-ada-002`` model and given the module-level
        ``OPENAI_API_KEY`` as its API key.
    """
    return OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_key=OPENAI_API_KEY,
    )

# Create the embedding client once; later cells reuse this object.
embedding_function = get_embedding_function()
# Smoke test: embed one short string to check the client works.
# NOTE(review): presumably this issues a request to the OpenAI API — confirm;
# `test_vector` is not used by any cell visible here.
test_vector = embedding_function.embed_query("name")

In [19]:
from langchain.evaluation import load_evaluator

# Load LangChain's "embedding_distance" evaluator, backed by the same embedding
# client as the rest of the notebook.
# NOTE(review): `evaluator` is not used by any cell visible here — confirm it is needed.
evaluator = load_evaluator(evaluator = "embedding_distance", embeddings = embedding_function)


Vector database: Chroma (it's open source).
Pipeline: documents -> chunks -> embeddings -> vector database -> search engine

In [22]:
import uuid
import re

def clean_filename(filename):
    """Remove " (N)"-style numeric suffixes from a file name.

    Every occurrence of a single whitespace character followed by a
    parenthesised run of digits is deleted (the kind of counter often appended
    to duplicate copies of a file); everything else is left untouched.

    Args:
        filename: The file name to normalise, e.g. "Sample (2).pdf".

    Returns:
        The name with those suffixes removed, e.g. "Sample.pdf".
    """
    copy_counter = re.compile(r'\s\(\d+\)')
    return copy_counter.sub('', filename)

def create_vectorstore(chunks, embedding_function, file_name, vector_store_path="db"):
    """Embed de-duplicated chunks into a persisted Chroma collection.

    Each chunk gets a deterministic id (UUIDv5 of its ``page_content``), so
    chunks with identical text collapse to a single entry; only the first
    occurrence is kept.

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding object handed to Chroma as ``embedding``.
        file_name: Source file name; passed through ``clean_filename`` to form
            the collection name.
        vector_store_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # Map id -> first chunk seen with that id. A dict keeps insertion order, so
    # the ids and documents passed to Chroma stay index-aligned. (Building the
    # id list from a set, whose iteration order is arbitrary, would store
    # documents under ids that belong to other chunks.)
    chunk_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        chunk_by_id.setdefault(chunk_id, chunk)

    # Create a new Chroma database from the de-duplicated documents.
    vectorstore = Chroma.from_documents(
        documents=list(chunk_by_id.values()),
        collection_name=clean_filename(file_name),
        embedding=embedding_function,
        ids=list(chunk_by_id.keys()),
        persist_directory=vector_store_path,
    )

    # The database should save automatically after creation; persist() forces
    # the save explicitly.
    vectorstore.persist()

    return vectorstore