Import libraries

In [7]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore

import os
from dotenv import load_dotenv
from supabase.client import Client, create_client

Load the PDF file

In [2]:
# Load the textbook PDF through PyMuPDFLoader with image extraction switched on.
# `pages` holds the loader's result and is indexed and iterated by later cells.
loader = PyMuPDFLoader(
    "Algebra-and-Trigonometry-2e-WEB.pdf",
    extract_images=True,
)
pages = loader.load()

In [3]:
# Spot-check the extraction: display the text of the entry at index 12 of `pages`.
pages[12].page_content

'Pedagogical Foundations and Features\nLearning Objectives\nEach chapter is divided into multiple sections (or modules), each of which is organized around a set of learning\nobjectives. The learning objectives are listed explicitly at the beginning of each section and are the focal point of every\ninstructional element\nNarrative text\nNarrative text is used to introduce key concepts, terms, and definitions, to provide real-world context, and to provide\ntransitions between topics and examples. Throughout this book, we rely on a few basic conventions to highlight the\nmost important ideas:\n•\nKey terms are boldfaced, typically when first introduced and/or when formally defined.\n•\nKey concepts and definitions are called out in a blue box for easy reference.\nExamples\nEach learning objective is supported by one or more worked examples, that demonstrate the problem-solving\napproaches that students must master. The multiple Examples model different approaches to the same type of probl

Split text into smaller chunks

In [4]:
# Stitch the text of every entry in `pages` into one string, with no separator
# added between pages, so the splitter works on the document as a whole.
full_text = "".join(page.page_content for page in pages)

# Splitter configured with chunk_size=1000 and chunk_overlap=200; lengths are
# measured in characters because length_function is the built-in len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# `chunks` is the splitter's output for the concatenated text.
chunks = text_splitter.split_text(full_text)

Embedding

Reference: [LangChain Supabase vector store integration guide](https://python.langchain.com/v0.2/docs/integrations/vectorstores/supabase/)

In [5]:
# Hugging Face model identifier and constructor options for the embedding model;
# the 'device' option is set to 'cpu'.
model_name = "mixedbread-ai/mxbai-embed-large-v1"
model_kwargs = dict(device='cpu')

# `hf` is the embeddings object handed to the vector store further down.
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Load variables from a local .env file (if one is found) into the environment.
load_dotenv()

# Supabase connection settings are read from the environment.
PUBLIC_SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL')
PUBLIC_SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY')

# os.getenv returns None for an unset variable. Stop here with a message that
# names exactly what is missing instead of passing None/empty values on to
# create_client.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ('NEXT_PUBLIC_SUPABASE_URL', PUBLIC_SUPABASE_URL),
        ('NEXT_PUBLIC_SUPABASE_ANON_KEY', PUBLIC_SUPABASE_ANON_KEY),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in a .env file."
    )

supabase: Client = create_client(PUBLIC_SUPABASE_URL, PUBLIC_SUPABASE_ANON_KEY)

In [10]:
# Set to True to embed `chunks` and write them to the Supabase table.
# Leave False when the table already holds the documents and their embeddings,
# in which case the store is simply attached to the existing table.
REINGEST_CHUNKS = False

if REINGEST_CHUNKS:
    # Embeds every chunk with `hf` and stores the results in `documents`.
    # NOTE(review): chunk_size=512 here is presumably the vector store's insert
    # batch size, not the text splitter's chunk size; confirm against the
    # SupabaseVectorStore documentation.
    vector_store = SupabaseVectorStore.from_texts(
        chunks,
        embedding=hf,
        client=supabase,
        table_name="documents",
        query_name="match_documents",
        chunk_size=512,
    )
else:
    # Documents with embeddings already exist in the database: instantiate the
    # store directly without re-embedding or re-uploading anything.
    vector_store = SupabaseVectorStore(
        embedding=hf,
        client=supabase,
        table_name="documents",
        query_name="match_documents",
    )