In [2]:
import pandas as pd
import pickle

In [4]:
# Load the pickled records from the local Downloads folder.
# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# pickle files that come from a trusted source.
records_path = r"C:\Users\Lenovo\Downloads\all_records.pkl"
with open(records_path, mode="rb") as pkl_file:
    all_records = pickle.load(pkl_file)

In [5]:
# Build a DataFrame from the loaded records, then preview the first five rows
# (the bare last expression is what the notebook renders).
df = pd.DataFrame(data=all_records)
df.head(n=5)

Unnamed: 0,findings,impression,image_base64,image_embedding,text_embedding
0,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA...,"[-0.20263421535491943, 0.14284241199493408, -0...","[-0.24461627006530762, 1.1459405422210693, -1...."
1,Lung volumes remain low. There are innumerable...,Low lung volumes and mild pulmonary vascular c...,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA...,"[-0.21884994208812714, 0.06916213035583496, 0....","[0.09711940586566925, 0.14580783247947693, 0.1..."
2,Lung volumes are low. This results in crowding...,Innumerable pulmonary metastases. Possible mil...,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA...,"[-0.14355027675628662, 0.06423193216323853, 0....","[0.16692262887954712, 0.4082597494125366, 0.24..."
3,In comparison to study performed on of there i...,New mild pulmonary edema with persistent small...,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA...,"[-0.1008208766579628, -0.06110411882400513, 0....","[0.9856616258621216, 0.0934327244758606, -0.82..."
4,The right costophrenic angle is not imaged. Ot...,An enteric tube courses below the level of the...,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA...,"[-0.23267525434494019, 0.06331181526184082, -0...","[0.4501332640647888, 1.5415682792663574, -1.37..."


In [6]:
## Drop the name `all_records` now that `df` has been built from it; the
## intent is to free the memory held by the raw loaded object.
## NOTE(review): this cell is not re-runnable on its own -- a second run raises
## NameError because the name no longer exists.
del all_records

In [7]:
## Take the first 1000 rows and convert them to a list of per-row dicts
## (one dict per row, keyed by column name). Despite its name, `data_dict`
## is a list.
first_rows = df.head(n=1000)
data_dict = first_rows.to_dict(orient="records")

In [None]:
import os
import psycopg2
from psycopg2.extras import execute_batch
from tqdm.auto import tqdm
import numpy as np

# Connection string for the Postgres database. It is read from the
# SUPABASE_CONNECTION_STRING environment variable so the credential never has
# to be saved inside the notebook; when the variable is unset this falls back
# to the previous empty-string default.
SUPABASE_CONNECTION_STRING = os.environ.get("SUPABASE_CONNECTION_STRING", "")

# Schema-qualified table that receives the inserted rows.
TABLE_NAME = "public.radiology_report"
# Number of records sent and committed per batch; tune by memory/network.
BATCH_SIZE = 100

def emb_to_pgvector_literal(emb):
    """Render an embedding as a pgvector text literal such as ``[0.1,0.2,0.3]``.

    Parameters
    ----------
    emb : array-like of numbers
        A non-empty, one-dimensional sequence (list, tuple, ndarray, ...) of
        finite values.

    Returns
    -------
    str
        The values cast to float32 and joined with commas inside square
        brackets, with no spaces.

    Raises
    ------
    ValueError
        If ``emb`` is not a non-empty 1-D sequence, or contains NaN/inf
        (including values that overflow float32). Such inputs would otherwise
        produce a string that is not a usable vector literal (nested brackets,
        ``[]``, ``nan``/``inf`` entries).
    """
    # Cast to float32 so the values are rounded to single precision before
    # being rendered. Note the rendered text itself is not shortened:
    # tolist() yields Python floats, which print with full double-precision digits.
    arr = np.asarray(emb, dtype=np.float32)

    # A scalar (ndim 0) would make the join below fail with an unrelated
    # TypeError, and a nested input would be rendered as "[[...],[...]]".
    if arr.ndim != 1 or arr.size == 0:
        raise ValueError(
            f"embedding must be a non-empty 1-D sequence, got shape {arr.shape}"
        )
    if not np.isfinite(arr).all():
        raise ValueError("embedding contains NaN or infinite values")

    # Create pgvector literal string like: '[0.1,0.2,...]'
    return "[" + ",".join(map(str, arr.tolist())) + "]"

def _record_to_row(rec):
    """Build the parameter tuple for one INSERT from a record dict."""
    return (
        # Text/image fields are optional: a missing key is inserted as NULL.
        rec.get("findings"),
        rec.get("impression"),
        rec.get("image_base64"),
        # Embeddings are required (KeyError if absent) and are sent as
        # pgvector literal strings, cast with ::vector in the SQL.
        emb_to_pgvector_literal(rec["image_embedding"]),
        emb_to_pgvector_literal(rec["text_embedding"]),
    )


def insert_records_direct(all_records, batch_size=BATCH_SIZE):
    """Insert records into ``TABLE_NAME`` in batches over a direct psycopg2 connection.

    Parameters
    ----------
    all_records : sequence of dict
        Sliceable sequence of records. Each record must contain
        ``image_embedding`` and ``text_embedding``; ``findings``,
        ``impression`` and ``image_base64`` are read with ``.get`` and may be
        absent.
    batch_size : int, optional
        Number of records sent and committed together. Must be >= 1.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1. Previously a negative value
        skipped every record yet still reported success, and zero failed with
        an unrelated ``range()`` error.
    KeyError
        If a record lacks one of the embedding keys.

    Notes
    -----
    Each batch is committed on its own, so if a later batch fails the earlier
    batches remain in the table. The statement is a plain INSERT, so calling
    this twice with the same records sends the same rows twice.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    sql = f"""
    INSERT INTO {TABLE_NAME} (findings, impression, image_base64, image_embedding, text_embedding)
    VALUES (%s, %s, %s, %s::vector, %s::vector)
    """

    conn = psycopg2.connect(SUPABASE_CONNECTION_STRING)
    try:
        # The cursor is opened inside the try block so the connection is still
        # closed if creating the cursor fails; the with-block closes the cursor.
        with conn.cursor() as cur:
            total = len(all_records)
            for start in tqdm(range(0, total, batch_size), desc="Inserting batches"):
                rows = [
                    _record_to_row(rec)
                    for rec in all_records[start:start + batch_size]
                ]
                # page_size follows batch_size so the parameter really controls
                # how many statements go out per round trip (execute_batch
                # otherwise pages by its own default).
                execute_batch(cur, sql, rows, page_size=batch_size)
                conn.commit()
        print("All rows inserted.")
    finally:
        # Closing without a commit leaves the in-flight batch uncommitted.
        conn.close()

# Run: insert the record sample held in `data_dict`.
# NOTE(review): the statement issued is a plain INSERT, so re-running this cell
# will presumably add the same rows again unless the table enforces
# uniqueness -- confirm before re-running.
insert_records_direct(data_dict)

Inserting batches:   0%|          | 0/10 [00:00<?, ?it/s]

All rows inserted.
