In [1]:
from langchain_community.document_loaders import PyPDFLoader

In [2]:
import pymupdf # imports the pymupdf library
from langchain_core.documents import Document

def read_document():
  reader = pymupdf.open("guide.pdf") # open a document
  print("Checkpoint 1")
  docs = []
  for i, page in enumerate(reader):
    if i == 100 or i == 1000:
      print(i)
    text = page.get_text() # get plain text encoded as UTF-8
    docs.append(Document(page_content=text, metadata={"page": i}))
  
  return docs

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=5000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

In [None]:
d = read_document()
c = split_text(d)

5478


In [46]:
import time
from openai import RateLimitError
from supabase import create_client
from langchain_openai import OpenAIEmbeddings
import os
import getpass

# 1. Connect to Supabase
if not os.environ.get("SUPABASE_URL"):
    os.environ["SUPABASE_URL"] = getpass.getpass("Enter Supabase URL: ")
if not os.environ.get("SUPABASE_SERVICE_ROLE_KEY"):
    os.environ["SUPABASE_SERVICE_ROLE_KEY"] = getpass.getpass(
        "Enter Supabase Service Role Key: "
    )

url = os.environ["SUPABASE_URL"]
key = os.environ["SUPABASE_SERVICE_ROLE_KEY"]  # Or anon key if permissions allow
supabase = create_client(url, key)

# 2. Initialize embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. Loop over chunks and insert
for i, doc in enumerate(c):
    try:
        content = doc.page_content
        metadata = doc.metadata

        embedding = embedding_model.embed_query(doc.page_content)
    except RateLimitError as e:
        print("⚠️ Rate limited, waiting 1 second...")
        time.sleep(1)  # or slightly more
        # Optional: retry
        embedding = embedding_model.embed_query(doc.page_content)

        # 4. Insert into Supabase
    supabase.table("documents").insert(
        {"content": content, "metadata": metadata, "embedding": embedding}
    ).execute()

    if i % 50 == 0:
        print(f"Inserted {i+1}/{len(c)} chunks")

Inserted 1/2169 chunks
Inserted 51/2169 chunks
Inserted 101/2169 chunks
Inserted 151/2169 chunks
⚠️ Rate limited, waiting 1 second...
Inserted 201/2169 chunks
Inserted 251/2169 chunks
Inserted 301/2169 chunks
Inserted 351/2169 chunks
⚠️ Rate limited, waiting 1 second...
⚠️ Rate limited, waiting 1 second...
Inserted 401/2169 chunks
Inserted 451/2169 chunks
Inserted 501/2169 chunks
Inserted 551/2169 chunks
⚠️ Rate limited, waiting 1 second...
⚠️ Rate limited, waiting 1 second...
Inserted 601/2169 chunks
Inserted 651/2169 chunks
⚠️ Rate limited, waiting 1 second...
Inserted 701/2169 chunks
⚠️ Rate limited, waiting 1 second...
⚠️ Rate limited, waiting 1 second...
⚠️ Rate limited, waiting 1 second...
⚠️ Rate limited, waiting 1 second...
Inserted 751/2169 chunks
⚠️ Rate limited, waiting 1 second...
Inserted 801/2169 chunks
⚠️ Rate limited, waiting 1 second...
Inserted 851/2169 chunks
⚠️ Rate limited, waiting 1 second...
Inserted 901/2169 chunks
⚠️ Rate limited, waiting 1 second...
⚠️ Rate li

KeyboardInterrupt: 

In [78]:
import csv

from httpx import HTTPStatusError
from openai import RateLimitError
import traceback
import json 

#check if the data folder exists, else create it
def check_and_create_folder(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created.")
    else:
        print(f"Folder '{folder_path}' already exists.")
check_and_create_folder('./batch_files')

#create the batch files
start_index = 1054
model_name = "text-embedding-3-small"
batch_size = 20000
batch_content  =  c[start_index:]
batch_file_name = 'guide_batch'
num_files = len(batch_content) // batch_size + (1 if len(batch_content) % batch_size != 0 else 0)

for num_file in range(num_files):
    output_file = f'./batch_files/{batch_file_name}_part{num_file}.jsonl'

    # make sure that the file does not exist, so don't add to an existing file
    if os.path.exists(output_file):
        os.remove(output_file)
    #write each embedding entry to a new line 
    with open(output_file, 'a') as file:
        for i, doc in enumerate(batch_content[batch_size * num_file : batch_size * (num_file + 1)]): 
            index = start_index + batch_size * num_file + i
        
            payload = {
                "custom_id":f"custom_id_{index}",
                "method": "POST",
                "url": "/v1/embeddings",
                "body": {
                    "input": doc.page_content,
                    "model": model_name,
                    "encoding_format": "float",
                    'dimensions':1024
                }
            }
            file.write(json.dumps(payload) + '\n')

    # Sanity check, print the first 2 lines
    with open(output_file, 'r') as file:
        for line in file.readlines()[:2]:
            print(line)

Folder './batch_files' created.
{"custom_id": "custom_id_1054", "method": "POST", "url": "/v1/embeddings", "body": {"input": "study of politics, economics, security, and culture. The goal is to provide\nstudents with the necessary tools to understand global processes in their\ntotality and how they are situated and lived in specific regions.\nThe major provides an integrated program of courses that lays the\nfoundation for professional training in a wide variety of areas. Such a\nfoundation can be invaluable in securing a place in competitive graduate\nor professional schools, which, in turn, prepare students for government\nservice, or for other careers with an international focus, including those\nin multinational corporations, international finance, non-governmental\norganizations, and institutions of teaching and research.\nThe International Studies major complements numerous majors across\ncampus. Many students choose to double major or enhance their studies", "model": "text-embed

In [90]:
input_path = './batch_files/guide_batch_part0.jsonl'
output_dir = './batch_files/split'
os.makedirs(output_dir, exist_ok=True)

# Read all lines from original JSONL file
with open(input_path, 'r') as f:
    lines = f.readlines()

# Split in half
mid = len(lines) // 2
parts = [lines[:mid], lines[mid:]]

# Write to new files
for i, part in enumerate(parts):
    output_path = os.path.join(output_dir, f'guide_batch_part0_split{i}.jsonl')
    with open(output_path, 'w') as f:
        f.writelines(part)
    print(f"✅ Wrote {len(part)} lines to {output_path}")

✅ Wrote 557 lines to ./batch_files/split/guide_batch_part0_split0.jsonl
✅ Wrote 558 lines to ./batch_files/split/guide_batch_part0_split1.jsonl


In [93]:
import os
import getpass
from openai import OpenAI

# Set your OpenAI API key safely
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI Key: ")

client = OpenAI()

# Folder containing batch JSONL files
batch_folder = "./batch_files"
batch_input_files = []

# Upload all batch files to OpenAI
for file_name in os.listdir(batch_folder):
    file_path = os.path.join(batch_folder, file_name)
    if file_name.endswith(".jsonl"):
        print(f"📤 Uploading {file_name}...")
        uploaded_file = client.files.create(file=open(file_path, "rb"), purpose="batch")
        batch_input_files.append(uploaded_file)

# Create batch jobs
job_creations = []
for i, uploaded_file in enumerate(batch_input_files):
    print(f"🛠️ Creating batch job for {uploaded_file.filename}...")
    job = client.batches.create(
        input_file_id=uploaded_file.id,
        endpoint="/v1/embeddings",
        completion_window="24h",  # only valid option currently
        metadata={"description": f"guide_embeddings_part_{i}"},
    )
    job_creations.append(job)

# Print job details
print("\n✅ Batch Jobs Created:")
for job in job_creations:
    print(
        f"📦 ID: {job.id} | Status: {job.status} | File: {job.metadata['description']}"
    )

📤 Uploading guide_batch_part0_split1.jsonl...
📤 Uploading guide_batch_part0_split0.jsonl...
🛠️ Creating batch job for guide_batch_part0_split1.jsonl...
🛠️ Creating batch job for guide_batch_part0_split0.jsonl...

✅ Batch Jobs Created:
📦 ID: batch_6872cd80b8c8819091857db32c3a8806 | Status: validating | File: guide_embeddings_part_0
📦 ID: batch_6872cd80ff408190b3ada6ce6b43f12d | Status: validating | File: guide_embeddings_part_1


In [None]:
file = client.files.create(
    file=open('./batch_files/guide_batch_part0_split0.jsonl', 'rb'),  # or split0 path
    purpose='batch'
)
batch_job = client.batches.create(
    input_file_id=file.id,
    endpoint="/v1/embeddings",
    completion_window="24h",
    metadata={
        "description": "guide_embeddings_part0_split0_resubmit2"
    }
)
print(f"✅ Resubmitted: {batch_job.id} | Status: {batch_job.status}")

✅ Resubmitted: batch_6872d2f3648c819092435b9a515215dd | Status: validating


In [106]:
batches = client.batches.list(limit=10)
for batch in batches:
    print(f"{batch.id} | {batch.status} | {batch.metadata.get('description')} | {batch.usage.input_tokens}")
   

AttributeError: 'Batch' object has no attribute 'usage'

In [101]:
import tiktoken, json

enc = tiktoken.encoding_for_model("text-embedding-3-small")

def estimate_tokens_in_file(path):
    total = 0
    with open(path, 'r') as f:
        for line in f:
            body = json.loads(line)["body"]
            total += len(enc.encode(body["input"]))
    return total

print(estimate_tokens_in_file("./batch_files/guide_batch_part0_split0.jsonl"))

544527


In [None]:
#WAIT UNTIL ATLEAST 5pm tomorrow before we can then batch only file guide_batch_part0_split0, part0_split1 is already done.