In [1]:
import getpass
import os

# Ask (without echoing input) for every credential that is not already
# present in the process environment, in a fixed order.
_credential_prompts = {
    "OPENAI_API_KEY": "OpenAI API Key:",
    "SUPABASE_URL": "Supabase URL:",
    "SUPABASE_SERVICE_KEY": "Supabase Service Key:",
}
for _env_name, _prompt_text in _credential_prompts.items():
    if _env_name in os.environ:
        continue  # already configured; do not prompt again
    os.environ[_env_name] = getpass.getpass(_prompt_text)

In [2]:
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_openai import OpenAIEmbeddings
from supabase.client import Client, create_client

# Read the Supabase connection settings from the environment (either value
# is None when its variable is unset) and build the client from them.
supabase_url, supabase_key = (
    os.environ.get(env_name)
    for env_name in ("SUPABASE_URL", "SUPABASE_SERVICE_KEY")
)
supabase: Client = create_client(supabase_url, supabase_key)

# Embedding model shared by the cells that follow.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")


In [15]:
# Parameters
BATCH_SIZE = 3  # OpenAI recommends ~100 or fewer depending on text length
DELAY_SECONDS = 60  # Avoid rate limit: delay after each batch
PAGE_SIZE = 1000  # rows requested from Supabase per call

# Fetch every row of `courses_new` that has no embedding yet.
#
# A single `.range(0, 100000)` request relies on the server returning all
# rows at once, but Supabase caps the rows per response (1,000 by default),
# which silently truncates larger result sets. Page through the table instead:
# order by `id` so consecutive pages are stable, and advance by the number of
# rows actually received so the loop works whatever the server-side cap is.
rows = []
page_start = 0
while True:
    response = (
        supabase.table("courses_new")
        .select("*")
        .is_("embedding", "null")  # only rows with no embedding
        .order("id")
        .range(page_start, page_start + PAGE_SIZE - 1)
        .execute()
    )
    page_rows = response.data or []
    if not page_rows:
        break  # an empty page means everything has been read
    rows.extend(page_rows)
    page_start += len(page_rows)

# Columns that must not become part of the text that gets embedded.
EXCLUDED_COLUMNS = ("id", "terms_offered")

rows_no_id = [
    {k: v for k, v in row.items() if k not in EXCLUDED_COLUMNS} for row in rows
]

# One "key: value, key: value, ..." string per row; None values are skipped.
texts_to_embed = [
    ", ".join(
        f"{key}: {value}" for key, value in r.items() if value is not None
    )
    for r in rows_no_id
]

# Number of rows still waiting for an embedding.
print(len(rows))

0


In [None]:
# --- Disabled cell: everything below is commented out and does not run. ---
# When enabled, the code walks `texts_to_embed` in chunks of BATCH_SIZE
# (starting at batch index START_BATCH), embeds each chunk with
# `embedding_model.embed_documents`, and writes every vector back to the
# `embedding` column of `courses_new`, matching rows by `id`.
# A batch whose embedding call raises is reported, followed by a 60 s sleep,
# and then skipped (`continue`); it is not retried within the same loop.
# NOTE(review): the sleeps are hard-coded (60 and 0.5 seconds); DELAY_SECONDS
# from the parameters cell is not referenced here — confirm which is intended.
# print(texts_to_embed[0])
# print(texts_to_embed[1])
# import time 
# ids = [row["id"] for row in rows]
# START_BATCH = 0

# # Process in batches
# for i in range(START_BATCH * BATCH_SIZE, len(texts_to_embed), BATCH_SIZE):
#     if i == 0 :
#         print(texts_to_embed[i])
#     batch_texts = texts_to_embed[i : i + BATCH_SIZE]
#     batch_ids = ids[i : i + BATCH_SIZE]

#     try:
#         embeddings = embedding_model.embed_documents(batch_texts)
#     except Exception as e:
#         print(f"Error on batch {i}: {e}")
#         time.sleep(60)
#         continue
    
#     # Upload each embedding back to Supabase
#     for row_id, emb in zip(batch_ids, embeddings):
#         supabase.table("courses_new").update({"embedding": emb}).eq(
#             "id", row_id
#         ).execute()
#     time.sleep(0.5)
#     print(f"Uploaded batch {i // BATCH_SIZE + 1}")


course_name: DANCE 131, subject_name: SOMATIC THEORY AND PRACTICES, description: Integration of body and mind is explored through various somatic practices., prerequisites: None, satisfies: DANCE330, students: 255, avg_gpa: 3.81, grade_a: 193, grade_ab: 32, grade_b: 16, grade_bc: 0, grade_c: 6, grade_d: 0, grade_f: 1
course_name: GENBUS 891, subject_name: TEXT MINING AND GENERATION FOR BUSINESS ANALYTICS, description: An introduction to text mining and generation for business applications. Includes an overview of text data and approaches for making text data useful for descriptive and predictive analytics. Also, includes key applications of natural language processing, such as chatbots and recommender systems., prerequisites: GEN BUS 657(or GEN BUS 888 prior to Fall 2025) andGEN BUS 883, satisfies: , students: 0, grade_a: 0, grade_ab: 0, grade_b: 0, grade_bc: 0, grade_c: 0, grade_d: 0, grade_f: 0
course_name: DANCE 131, subject_name: SOMATIC THEORY AND PRACTICES, description: Integrati

In [16]:
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_openai import OpenAIEmbeddings
# Embedding model handed to the vector store below.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# The store is bound to the `courses_new` table and runs its searches through
# the `match_courses_new` Postgres function, which you must define yourself.
_store_settings = {
    "table_name": "courses_new",
    "query_name": "match_courses_new",
}
vectorstore = SupabaseVectorStore(
    client=supabase,
    embedding=embedding_model,
    **_store_settings,
)


In [29]:
# Look up the `description` column of the course named "SLAVIC 102";
# `maybe_single()` is used because at most one matching row is expected.
_slavic_102_query = (
    supabase.table("courses_new")
    .select("description")
    .eq("course_name", "SLAVIC 102")
    .maybe_single()
)
desc_row = _slavic_102_query.execute()

# Free-text query passed to the similarity search.
query_text = "Deep learning NO GRADUATES"


In [31]:
# Fetch up to 10 documents most similar to `query_text` and print them in
# rank order, numbered from 1.
results = vectorstore.similarity_search(query_text, k=10)
for rank, doc in enumerate(results, start=1):
    print(f"\nResult #{rank}")
    print(doc)

# Print the lowest-ranked hit once more. Indexing `results[-1]` on an empty
# list raises IndexError, so handle the "no matches" case explicitly.
if results:
    print(results[-1])
else:
    print("No results returned for query:", query_text)


Result #1
page_content='STAT 453' metadata={'course_name': 'STAT 453', 'description': "Deep learning is a field that specializes in discovering and extracting intricate structures in large, unstructured datasets for parameterizing artificial neural networks with many layers. Since deep learning has pushed the state-of-the-art in many research and application areas, it's become indispensable for modern technology. Focuses on a understanding deep, artificial neural networks by connecting it to related concepts in statistics. Beyond covering deep learning models for predictive modeling, focus on deep generative models. Besides explanations on a mathematical and conceptual level, emphasize the practical aspects of deep learning. Open-source computing provides hands-on experience for implementing deep neural nets, working on supervised learning tasks, and applying generative models for dataset synthesis.", 'subject_name': 'INTRODUCTION TO DEEP LEARNING AND GENERATIVE MODELS'}

Result #2
pa

In [18]:

# Print the whole `results` list at once (the raw repr of every entry).
print(results)

[Document(metadata={}, page_content='INTER-HE 202'), Document(metadata={}, page_content='COMPSCI 570'), Document(metadata={}, page_content='AMERIND 405'), Document(metadata={}, page_content='ASIANAM 441'), Document(metadata={}, page_content='HEBR-MOD 310')]
