In [5]:
from pathlib import Path

# Sanity check: confirm the data directory is reachable and see what it holds.
DATA_PATH = "/Users/andreimagno/HackathonSDSU/backend"
p = Path(DATA_PATH)

directory_found = p.exists()
print("Directory exists:", directory_found)

# "*.pdf" only matches PDFs directly inside the folder, not in subfolders.
pdf_paths = list(p.glob("*.pdf"))
print("PDFs in folder:", pdf_paths)

entry_names = [entry.name for entry in p.iterdir()]
print("All files:", entry_names)

Directory exists: True
PDFs in folder: []
All files: ['.DS_Store', 'app', 'uv.lock', 'pyproject.toml', 'README.md', '.env', '.python-version', 'main.py']


In [6]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

DATA_PATH = "/Users/andreimagno/HackathonSDSU/backend"

def load_documents(data_path: str | None = None):
    """Load the PDFs found under a directory via PyPDFDirectoryLoader.

    Args:
        data_path: Directory to read PDFs from. When omitted, the
            module-level ``DATA_PATH`` is used, so existing calls with no
            arguments behave as before.

    Returns:
        The result of ``PyPDFDirectoryLoader(...).load()``.
        NOTE(review): this appears to be one Document per PDF page - confirm.
    """
    # Resolve the default at call time so re-running the cell that sets
    # DATA_PATH is picked up without redefining this function.
    directory = DATA_PATH if data_path is None else data_path
    return PyPDFDirectoryLoader(directory).load()

In [3]:
# Load the handbook and print a single entry to eyeball the extracted text.
documents = load_documents()
sample_page = documents[2]
print(sample_page)
# TODO: the loader does not extract images from the PDFs;
# figure out how to capture those as well.

page_content='2 
 
TABLE OF CONTENTS 
 
GENERAL INFORMATION  _____________________________________________________________________ pg. 4 
 Student Organization Banking Policy 
 Starting the Student Organization Recognition Process 
 A.S. Banking Contact Information 
BANKING REQUIREMENTS  ___________________________________________________________________ pg. 5 
 Banking with Associated Students 
 Banking Options Benefits 
ON CAMPUS BANKING  ________________________________________________________________________ pg. 6 
 Banking Training 
 Account Application & Agreement Form 
OFF CAMPUS BANKING  _______________________________________________________________________ pg. 7 
 IRS 990 
 National Chapter Affiliation Letter 
CENTER FOR STUDENT ORGANIZATIONS & ACTIVITIES  ________________________________ pg. 8 
 General Information  
 Student Life Advising 
 Contact Information 
FINANCE  _________________________________________________________________________________ pgs. 9 - 15 
 Deposit a

In [7]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(
    documents: list[Document],
    chunk_size: int = 800,
    chunk_overlap: int = 80,
):
    """Split documents into overlapping text chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 800, the value previously hard-coded here.
        chunk_overlap: Characters shared between consecutive chunks.
            Defaults to 80, the value previously hard-coded here.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [8]:
# Reload the source documents, chunk them, and inspect one chunk.
documents = load_documents()
chunks = split_documents(documents)

chunk_count = len(chunks)
print(f"Number of chunks: {chunk_count}")
print(chunks[3])


Number of chunks: 68
page_content='IRS 990 
 National Chapter Affiliation Letter 
CENTER FOR STUDENT ORGANIZATIONS & ACTIVITIES  ________________________________ pg. 8 
 General Information  
 Student Life Advising 
 Contact Information 
FINANCE  _________________________________________________________________________________ pgs. 9 - 15 
 Deposit and Revenue 
 Purchases and Reimbursements 
 Check Request Deadline 
 Commonly Used Expense Codes 
 Donation, Fundraising, and Gifts in Kind 
 Account Information Request/Statement 
 Financial Tracking 
STUDENT ORGANIZATION FUNDING  _______________________________________________ pgs. 16 - 17 
General Information 
Funding Opportunities 
Reimbursements and Purchasing' metadata={'producer': 'Adobe PDF Library 24.3.86', 'creator': 'Acrobat PDFMaker 24 for Word', 'creationdate': '2024-09-03T11:20:44-07:00', 'author': 'Minh Pham', 'comments': '', 'company': '', 'keywords': '', 'moddate': '2024-11-07T10:07:15-08:00', 'sourcemodified': 'D:202409031

In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Embedding objects already built in this kernel, keyed by model name.
_EMBEDDINGS_BY_MODEL = {}


def get_embeddings(model_name: str = "intfloat/e5-base-v2"):
    """Return a HuggingFaceEmbeddings instance for ``model_name``.

    The instance is created on first use and reused afterwards, so callers
    that invoke this once per upload or per query do not construct a new
    embedding object every time.

    Args:
        model_name: Hugging Face model identifier. Defaults to the model
            this notebook has always used.

    Returns:
        The cached ``HuggingFaceEmbeddings`` object for that model.
    """
    if model_name not in _EMBEDDINGS_BY_MODEL:
        _EMBEDDINGS_BY_MODEL[model_name] = HuggingFaceEmbeddings(model_name=model_name)
    return _EMBEDDINGS_BY_MODEL[model_name]

In [10]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase import create_client
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the environment.
load_dotenv()

# Table that stores the embedded handbook chunks.
SUPABASE_TABLE_NAME = "banking_handbook"

# Connection settings; each is None when the variable is not set.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")

supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)



In [11]:
import re
from langchain_core.documents import Document

# Control characters below 0x20, excluding tab (\t), newline (\n) and
# carriage return (\r), which are legitimate whitespace in extracted text.
_CTRL_CHARS = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F]")


def clean_text(s: str) -> str:
    """Strip NUL bytes and blank out other control characters.

    Non-string input is converted with ``str()`` first. NUL characters are
    removed outright; every other character matched by ``_CTRL_CHARS`` is
    replaced with a single space. Tabs, newlines and carriage returns are
    kept as they are.
    """
    text = s if isinstance(s, str) else str(s)
    without_nul = text.replace("\x00", "")
    return _CTRL_CHARS.sub(" ", without_nul)

def clean_meta(m: dict) -> dict:
    """Return a copy of a metadata dict that is safe to store as JSON.

    String values are passed through ``clean_text``. Any other value is kept
    as-is when ``json.dumps`` accepts it and is replaced by ``str(value)``
    when it does not (for example bytes or other custom objects).

    Args:
        m: Metadata mapping; ``None`` or an empty mapping yields ``{}``.

    Returns:
        A new dict with the same keys as ``m``.
    """
    import json  # local import so this cell does not depend on another cell's imports

    out = {}
    for k, v in (m or {}).items():
        if isinstance(v, str):
            out[k] = clean_text(v)
            continue
        # A plain assignment can never fail, so serializability has to be
        # probed explicitly before deciding whether to keep the value.
        try:
            json.dumps(v)
        except (TypeError, ValueError):
            out[k] = str(v)
        else:
            out[k] = v
    return out

def sanitize_chunks(chunks: list[Document]) -> list[Document]:
    """Build new Documents with cleaned text and cleaned metadata.

    Each input chunk yields one output Document whose ``page_content`` went
    through ``clean_text`` and whose ``metadata`` went through ``clean_meta``.
    The input Documents are not modified.
    """
    return [
        Document(
            page_content=clean_text(chunk.page_content),
            metadata=clean_meta(chunk.metadata),
        )
        for chunk in chunks
    ]

In [12]:
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_core.documents import Document

def add_to_supabase(chunks: list[Document], ids: list[str] | None = None):
    """Embed chunks and add them to the Supabase vector table.

    Args:
        chunks: Documents to store.
        ids: Optional ids forwarded to ``add_documents``.

    The target table is ``SUPABASE_TABLE_NAME`` for both the write and the
    confirmation message, so the two cannot drift apart.
    """
    vector_store = SupabaseVectorStore(
        client=supabase_client,
        table_name=SUPABASE_TABLE_NAME,
        embedding=get_embeddings(),
    )
    vector_store.add_documents(chunks, ids=ids)
    print(f"✅ Added {len(chunks)} chunks to '{SUPABASE_TABLE_NAME}'")
# Clean the chunks first, then upload the cleaned copies.
sanitized_chunks = sanitize_chunks(chunks)
add_to_supabase(chunks=sanitized_chunks)

  embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")


KeyboardInterrupt: 

In [13]:
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate

# Prompt skeleton with two placeholders, {context} and {question}.
# The trailing .strip() drops the leading and trailing newlines that the
# triple-quoted literal would otherwise carry.
PROMPT_TEMPLATE = """
You are an SDSU AI companion designed to assist students in registered student organizations
by providing accurate and helpful information based on the SDSU Banking Handbook.
Answer ONLY using the context. If the answer cannot be found in the context, say you don't know.

Context:
{context}

Question:
{question}

Answer:
""".strip()

def query_supabase_rpc(query_text: str, k: int = 5):
    """Retrieve matching handbook chunks and build the prompt around them.

    Args:
        query_text: The question to embed and search for.
        k: Value sent to the RPC as ``match_count``.

    Returns:
        A ``(prompt, docs)`` tuple: the formatted prompt string and the list
        of Documents built from the RPC rows.
    """
    # Embed the question with the same embedding object used for the chunks.
    query_vector = get_embeddings().embed_query(query_text)

    # Ask the database function for the closest rows.
    rpc_args = {"query_embedding": query_vector, "match_count": k}
    response = supabase_client.rpc("match_banking_handbook", rpc_args).execute()

    docs = []
    for row in response.data or []:
        docs.append(
            Document(page_content=row["content"], metadata=(row.get("metadata") or {}))
        )

    # One labelled section per document, separated by a horizontal rule.
    context_text = "\n\n---\n\n".join(
        f"[source: {doc.metadata.get('source', 'unknown')} | page: {doc.metadata.get('page', '–')}]\n{doc.page_content}"
        for doc in docs
    )

    template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = template.format(context=context_text, question=query_text)
    return prompt, docs


In [14]:
# Example retrieval: keyword-style query about the recognition process.
q = (
    "recognition requirements for student organizations; "
    "Starting the Student Organization Recognition Process; "
    "prerequisites; training; forms"
)
prompt, results = query_supabase_rpc(q, k=3)
print(prompt)

Human: You are an SDSU AI companion designed to assist students in registered student organizations
by providing accurate and helpful information based on the SDSU Banking Handbook.
Answer ONLY using the context. If the answer cannot be found in the context, say you don't know.

Context:
[source: /Users/andreimagno/HackathonSDSU/backend/data/BankingHandbook-24-25.pdf | page: 4]
4 
 
GENERAL INFORMATION 
 
This Banking Handbook will provide information regarding the RSO recognition process as well as 
banking procedures and A.S. funding opportunities for student organizations. 
For an organization to become officially recognized, there are two requirements:  
Requirement 1: Complete banking requirements through Associated Students. 
Requirement 2: Complete recognition requirements through the Center for Student 
Organizations and Activities. 
 
STUDENT ORGANIZATION BANKING POLICY  
Effective July 1, 2016 California State University guidelines require Recognized Student 
Organizations (R