In [3]:
import requests
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Fetch and extract text from a URL
def fetch_and_extract(url, timeout=30):
    """Download a PDF and return the text of all its pages.

    The PDF is parsed straight from the response bytes, so no scratch file
    is written to the working directory.

    Args:
        url: Address of the PDF document to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        The extracted text of every page concatenated in page order (pages
        with no extractable text contribute nothing), or an empty string if
        the download or the PDF parsing fails.
    """
    import io  # local import so this definition works on its own

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # PdfReader accepts a binary stream, so the bytes never touch disk.
        reader = PdfReader(io.BytesIO(response.content))
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberately best-effort: one bad URL or malformed PDF is reported
        # and skipped so the remaining documents can still be processed.
        print(f"Error fetching {url}: {e}")
        return ""

# List of BC post-secondary policy and finance PDFs to index (expand this list).
# URLs are kept free of tracking query strings because the last path segment
# is what identifies the document.
bc_policy_urls = [
    # University of British Columbia
    "https://universitycounsel.ubc.ca/files/2022/05/Research-Policy_LR2.pdf",  # UBC Research Policy
    "https://universitycounsel.ubc.ca/files/2022/05/Research-Over-Expenditure-Policy_FM4.pdf",  # UBC Research Over-Expenditure Policy
    "https://universitycounsel.ubc.ca/files/2022/05/Contract-Employees-Fund-Policy_FM7.pdf",  # UBC Contract Employees Fund Policy
    "https://universitycounsel.ubc.ca/files/2024/11/Financial-Investigations-Policy_SC15-Consultation-Draft.pdf",  # UBC Financial Investigations Policy (consultation draft)
    "https://universitycounsel.ubc.ca/files/2022/05/Financial-Aid-Policy_LR10.pdf",  # UBC Financial Aid Policy
    # Simon Fraser University
    "https://www.sfu.ca/content/dam/sfu/finance/publications-news/publications/budgetbook/2024-25%20SFU%20Budget_Final%20Apr%2012.pdf",
    "https://www.sfu.ca/content/dam/sfu/finance/publications-news/publications/annualreport/financial_report_2024_v16.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/10_series/A10-01.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/10_series/A10-02.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/10_series/A10-03.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/10_series/A10-06.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/12_series/A12-04.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/13_series/A13-03.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/32_series/A32.01%20Policy%20-%20Awards%20for%20Excellence%20in%20Teaching%20-%2001sept22.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/administrative_policies/3_series/AD3-10.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/administrative_policies/9_series/AD9-01.pdf",
    "https://www.sfu.ca/content/dam/sfu/policies/files/administrative_policies/9_series/AD9-09.pdf",
    # Douglas College (the doubled slashes are kept exactly as in the original links)
    "https://www.douglascollege.ca/sites/default/files/docs/governance/A19%20Bullying%20and%20Harassment%20Prevention%20and%20Response_0.pdf",
    "https://www.douglascollege.ca/sites/default/files/docs//Academic%20Integrity%20Policy.pdf",
    "https://www.douglascollege.ca/sites/default/files/docs//Academic%20Performance%20Policy.pdf",
    "https://www.douglascollege.ca/sites/default/files/docs//English%20Language%20Competency%20Standards%20Policy.pdf",
    "https://www.douglascollege.ca/sites/default/files/docs/finance-dates-and-deadlines/Grading%20Policy%20May%202019.pdf",
    "https://www.douglascollege.ca/sites/default/files/docs//A62%20Investment%20Policy.pdf",
    "https://www.douglascollege.ca/sites/default/files/docs/governance/a20-student-non-academic-misconduct-policy.pdf",
    "https://www.douglascollege.ca/sites/default/files/docs/finance/consolidated-financial-statements-march-31-2024.pdf",
    # UBC budget report 2024/25 (analytics tracking query removed from the link)
    "https://vpfo-finance-2024.sites.olt.ubc.ca/files/2024/11/2024_25_UBCBudgetReport.pdf",
    # University of Victoria
    "https://www.uvic.ca/budget/_assets/docs/framework/planning-budget-framework-2025.pdf",  # UVic planning and budget framework (2025)
    "https://www.uvic.ca/universitysecretary/assets/docs/policies/HR6100_1100_.pdf",
    "https://www.uvic.ca/universitysecretary/assets/docs/policies/GV0200_1105_.pdf",
    "https://www.uvic.ca/universitysecretary/assets/docs/policies/HR6115_1110_.pdf",
    "https://www.uvic.ca/universitysecretary/assets/docs/policies/GV0205_1150_.pdf",
    "https://www.uvic.ca/universitysecretary/assets/docs/policies/AC1205_2340.pdf",
    # Add more URLs from BC institutions (e.g., Langara, BCIT)
]

# Process all documents: download each PDF, split its text into overlapping
# chunks, and collect the chunks for embedding.
all_chunks = []
failed_urls = []  # URLs that yielded no text (download error or empty PDF)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

for url in bc_policy_urls:
    text = fetch_and_extract(url)
    if not text:
        failed_urls.append(url)
        continue
    chunks = text_splitter.split_text(text)
    # Prefix every chunk with its file name so the source is visible in the
    # retrieved context. Any query string is dropped so tracking parameters
    # do not end up in the label.
    source_name = url.split('/')[-1].split('?', 1)[0]
    chunks_with_meta = [f"Source: {source_name}\n\n{chunk}" for chunk in chunks]
    all_chunks.extend(chunks_with_meta)
    print(f"Processed {url}: {len(chunks)} chunks")

# Make skipped documents visible instead of silently shrinking the corpus.
if failed_urls:
    print(f"Skipped {len(failed_urls)} of {len(bc_policy_urls)} URLs (no text extracted):")
    for url in failed_urls:
        print(f"  {url}")

# FAISS.from_texts cannot build an index from zero texts; fail with a clear
# message rather than an opaque error from inside the library.
if not all_chunks:
    raise RuntimeError("No text could be extracted from any URL; vector store was not built.")

# Create and save vector store
vector_store = FAISS.from_texts(all_chunks, embeddings)
vector_store.save_local("bc_policy_db")
print("Database saved as 'bc_policy_db'")

Processed https://universitycounsel.ubc.ca/files/2022/05/Research-Policy_LR2.pdf: 64 chunks
Processed https://universitycounsel.ubc.ca/files/2022/05/Research-Over-Expenditure-Policy_FM4.pdf: 24 chunks
Processed https://universitycounsel.ubc.ca/files/2022/05/Contract-Employees-Fund-Policy_FM7.pdf: 17 chunks
Processed https://universitycounsel.ubc.ca/files/2024/11/Financial-Investigations-Policy_SC15-Consultation-Draft.pdf: 64 chunks
Processed https://universitycounsel.ubc.ca/files/2022/05/Financial-Aid-Policy_LR10.pdf: 29 chunks
Processed https://www.sfu.ca/content/dam/sfu/finance/publications-news/publications/budgetbook/2024-25%20SFU%20Budget_Final%20Apr%2012.pdf: 117 chunks
Processed https://www.sfu.ca/content/dam/sfu/finance/publications-news/publications/annualreport/financial_report_2024_v16.pdf: 307 chunks
Processed https://www.sfu.ca/content/dam/sfu/policies/files/academic_policies/10_series/A10-01.pdf: 66 chunks
Processed https://www.sfu.ca/content/dam/sfu/policies/files/academ

In [4]:
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import requests

class OpenRouterLLM(LLM):
    """LangChain LLM wrapper that sends prompts to OpenRouter's chat-completions API.

    Each prompt is sent as a single user message and the text of the first
    returned choice is used as the completion.
    """

    # Bearer token sent in the Authorization header.
    api_key: str
    # OpenRouter model identifier used for every request.
    model: str = "meta-llama/llama-3.1-8b-instruct"
    # Upper bound on generated tokens per request.
    max_tokens: int = 500
    # Seconds to wait for the HTTP response; prevents a stalled request from
    # hanging the caller indefinitely.
    request_timeout: float = 60.0

    def __init__(self, api_key: str, model: str = None, **kwargs):
        """Create the wrapper.

        Args:
            api_key: OpenRouter API key.
            model: Optional model identifier; a falsy value keeps the class default.
            **kwargs: Any other field values, forwarded to the base class.
        """
        # Hand the model to the base constructor so it is validated with the
        # other fields rather than being assigned after construction.
        if model:
            kwargs["model"] = model
        super().__init__(api_key=api_key, **kwargs)

    @property
    def _llm_type(self) -> str:
        """Short identifier for this LLM implementation."""
        return "openrouter"

    def _call(self, prompt: str, stop: Optional[List[str]] = None,
              run_manager: Any = None, **kwargs: Any) -> str:
        """Send ``prompt`` to OpenRouter and return the generated text.

        Args:
            prompt: Text sent as the single user message.
            stop: Optional stop sequences, forwarded to the API when given.
            run_manager: Accepted for compatibility with the base-class
                signature; not used here.
            **kwargs: Accepted for compatibility; ignored.

        Returns:
            The message content of the first choice in the response.

        Raises:
            requests.HTTPError: If the API answers with an error status code.
            ValueError: If the response body does not contain a completion.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": self.max_tokens
        }
        if stop:
            payload["stop"] = stop
        response = requests.post("https://openrouter.ai/api/v1/chat/completions",
                                 json=payload, headers=headers,
                                 timeout=self.request_timeout)
        response.raise_for_status()
        data = response.json()
        try:
            return data["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as exc:
            # A successful status can still carry an error payload; surface
            # it rather than failing with a bare lookup error.
            detail = data.get("error", data) if isinstance(data, dict) else data
            raise ValueError(f"Unexpected OpenRouter response: {detail}") from exc

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Parameters that distinguish one configured instance from another."""
        return {"model": self.model, "max_tokens": self.max_tokens}

In [5]:
import streamlit as st
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

import os  # standard library; used only to read the API key from the environment

# Initialize embeddings and LLM
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# The API key is read from the environment so it is never stored in the
# notebook or in version control. Any key that was previously written into
# this file must be treated as leaked and rotated.
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENROUTER_API_KEY environment variable before running this app.")
llm = OpenRouterLLM(api_key=api_key)

# Load the pre-built database
# NOTE(review): allow_dangerous_deserialization=True loads pickled data from
# "bc_policy_db"; only load an index that was built locally by a trusted run.
vector_store = FAISS.load_local("bc_policy_db", embeddings, allow_dangerous_deserialization=True)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3})
)

# Streamlit app
st.title("BC Policy Analyzer")
query = st.text_input("Ask a Question About BC Institutions", value="What can you tell me about funding?")
if st.button("Analyze"):
    if not query.strip():
        # Avoid spending an LLM call on an empty question.
        st.warning("Please enter a question first.")
    else:
        with st.spinner("Analyzing across BC institutions..."):
            result = qa_chain.run(query)
            st.write("Answer:", result)

2025-03-25 14:07:54.718 
  command:

    streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-03-25 14:07:54.719 Session state does not function when running a script without `streamlit run`
