# Generate chunks

In [1]:
import pandas as pd

In [3]:
df_full_content = pd.read_csv("./scraped_content_with_status.csv")
df_full_content

Unnamed: 0,url,content,selector_used,status,client_id,date
0,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,Home\nCCS/HLS Atherosclerosis Research Award\n...,main#main-content,ok,ccs,2025-07-29T20:26:05
1,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,Home\nCCS/HLS Atherosclerosis Research Award\n...,main#main-content,ok,ccs,2025-07-29T20:26:05
2,https://ccs.ca/,Canadian Cardiovascular Society\nStrong heart ...,main#main-content,ok,ccs,2025-07-29T20:26:05
3,https://ccs.ca/ccs-research-awards/ccs-covid-1...,Home\nAbout\nAwards\nCCS Research Fellowships ...,main#main-content,ok,ccs,2025-07-29T20:26:05
4,https://ccs.ca/ccs-research-awards/ccs-covid-1...,Home\nAbout\nAwards\nCCS Research Fellowships ...,main#main-content,ok,ccs,2025-07-29T20:26:05
...,...,...,...,...,...,...
1131,https://ccs.ca/topic/vaccination-vaccination-p...,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1132,https://ccs.ca/topic/vasculaire/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1133,https://ccs.ca/topic/vascular/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1134,https://ccs.ca/topic/women-in-cv-sciences/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05


In [4]:
df_full_content = df_full_content[df_full_content['status'] == 'ok']
df_full_content 

Unnamed: 0,url,content,selector_used,status,client_id,date
0,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,Home\nCCS/HLS Atherosclerosis Research Award\n...,main#main-content,ok,ccs,2025-07-29T20:26:05
1,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,Home\nCCS/HLS Atherosclerosis Research Award\n...,main#main-content,ok,ccs,2025-07-29T20:26:05
2,https://ccs.ca/,Canadian Cardiovascular Society\nStrong heart ...,main#main-content,ok,ccs,2025-07-29T20:26:05
3,https://ccs.ca/ccs-research-awards/ccs-covid-1...,Home\nAbout\nAwards\nCCS Research Fellowships ...,main#main-content,ok,ccs,2025-07-29T20:26:05
4,https://ccs.ca/ccs-research-awards/ccs-covid-1...,Home\nAbout\nAwards\nCCS Research Fellowships ...,main#main-content,ok,ccs,2025-07-29T20:26:05
...,...,...,...,...,...,...
1131,https://ccs.ca/topic/vaccination-vaccination-p...,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1132,https://ccs.ca/topic/vasculaire/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1133,https://ccs.ca/topic/vascular/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1134,https://ccs.ca/topic/women-in-cv-sciences/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05


In [5]:
unique_url_count = df_full_content['url'].nunique()
unique_url_count

761

In [6]:
from tqdm import tqdm
import pandas as pd
from datetime import datetime
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Assuming these constants are already defined
TOKENIZER_MODEL_NAME = "gpt-4o"
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 100
today = datetime.now().isoformat(timespec='seconds')
output_chunks_csv_file = "url_chunked_markdown.csv"

chunked_rows = []
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name=TOKENIZER_MODEL_NAME,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

for _, row in tqdm(df_full_content.iterrows(), total=df_full_content.shape[0], desc="Chunking content"):
    url = row["url"]
    content_text = row["content"]
    client_id = row["client_id"]
    chunks = text_splitter.split_text(content_text)
    for idx, chunk in enumerate(chunks):
        chunked_rows.append({
            "url": url,
            "chunk_id": idx + 1,
            "chunk_text": chunk,
            "client_id": client_id,
            "date": today  # Assuming you have a `today` variable defined
        })

df_chunks = pd.DataFrame(chunked_rows)
df_chunks.to_csv(output_chunks_csv_file, index=False)
print(f"[✓] Chunked content mapping saved to: {output_chunks_csv_file}")


Chunking content: 100%|██████████| 1135/1135 [00:01<00:00, 724.82it/s]

[✓] Chunked content mapping saved to: url_chunked_markdown.csv





In [7]:
df_chunk = pd.read_csv("./url_chunked_markdown.csv")
df_chunk

Unnamed: 0,url,chunk_id,chunk_text,client_id,date
0,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,1,Home\nCCS/HLS Atherosclerosis Research Award\n...,ccs,2025-07-29T20:54:23
1,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,1,Home\nCCS/HLS Atherosclerosis Research Award\n...,ccs,2025-07-29T20:54:23
2,https://ccs.ca/,1,Canadian Cardiovascular Society\nStrong heart ...,ccs,2025-07-29T20:54:23
3,https://ccs.ca/ccs-research-awards/ccs-covid-1...,1,Home\nAbout\nAwards\nCCS Research Fellowships ...,ccs,2025-07-29T20:54:23
4,https://ccs.ca/ccs-research-awards/ccs-covid-1...,2,CCS funds will be dispensed in 1 installment.\...,ccs,2025-07-29T20:54:23
...,...,...,...,...,...
1655,https://ccs.ca/topic/vaccination-vaccination-p...,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23
1656,https://ccs.ca/topic/vasculaire/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23
1657,https://ccs.ca/topic/vascular/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23
1658,https://ccs.ca/topic/women-in-cv-sciences/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23


In [8]:
unique_chunk_url_count = df_chunk['url'].nunique()
unique_chunk_url_count

761

# Generate context for each chunk

In [9]:
from openai import OpenAI
from dotenv import load_dotenv
import os

client = OpenAI()

# Load environment variables (e.g. from a .env file)
load_dotenv()

# Retrieve your API key (ensure SUPABASE_KEY / OPENAI_API_KEY is set appropriately)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not set in environment variables.")

In [13]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

# Initialize OpenAI client
client = OpenAI()

# Merge the full content into the df_chunk based on 'url'
df_merged = pd.merge(df_chunk, df_full_content[['url', 'content']], on='url', how='left')


# Function to make the OpenAI API call
def get_context(full_doc, chunk_text):
    prompt = f"""
Document:
{full_doc}

Here is the chunk we want to situate within the whole document.
Chunk:
{chunk_text}

Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
Answer only with the succinct context and nothing else.
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4.1-nano",
            messages=[
                {"role": "system", "content": "You are an expert at summarizing contexts for search retrieval."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error with OpenAI API: {e}")
        return None

# Apply the function to each row with progress bar
enriched_contexts = []

for _, row in tqdm(df_chunk.iterrows(), total=df_chunk.shape[0], desc="Enriching chunks"):
    # Lookup full document content for this row
    full_content = df_full_content.loc[df_full_content['url'] == row['url'], 'content'].values
    full_content = full_content[0] if len(full_content) > 0 else ""
    chunk = row['chunk_text']
    enriched_context = get_context(full_content, chunk)
    
    if enriched_context is not None:
        final_text = f"{enriched_context} --- {chunk}"
    else:
        final_text = f"ERROR --- {chunk}"
    
    enriched_contexts.append(final_text)

df_chunk['chunk_text_with_context'] = enriched_contexts
df_chunk.to_csv("url_chunked_enriched.csv", index=False)
print("[✓] Enriched df_chunk saved to 'url_chunked_enriched.csv'")

Enriching chunks: 100%|██████████| 1660/1660 [31:48<00:00,  1.15s/it] 

[✓] Enriched df_chunk saved to 'url_chunked_enriched.csv'





In [29]:
df_chunk_with_context = pd.read_csv("./url_chunked_enriched.csv")
df_chunk_with_context

Unnamed: 0,url,chunk_id,chunk_text,client_id,date,chunk_text_with_context
0,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,1,Home\nCCS/HLS Atherosclerosis Research Award\n...,ccs,2025-07-29T20:54:23,This chunk provides an overview of the CCS/HLS...
1,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,1,Home\nCCS/HLS Atherosclerosis Research Award\n...,ccs,2025-07-29T20:54:23,This chunk provides an overview of the CCS/HLS...
2,https://ccs.ca/,1,Canadian Cardiovascular Society\nStrong heart ...,ccs,2025-07-29T20:54:23,Overview of the Canadian Cardiovascular Societ...
3,https://ccs.ca/ccs-research-awards/ccs-covid-1...,1,Home\nAbout\nAwards\nCCS Research Fellowships ...,ccs,2025-07-29T20:54:23,This chunk provides detailed information about...
4,https://ccs.ca/ccs-research-awards/ccs-covid-1...,2,CCS funds will be dispensed in 1 installment.\...,ccs,2025-07-29T20:54:23,"Details on Phase II funding, progress reportin..."
...,...,...,...,...,...,...
1655,https://ccs.ca/topic/vaccination-vaccination-p...,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational me...
1656,https://ccs.ca/topic/vasculaire/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational me...
1657,https://ccs.ca/topic/vascular/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational he...
1658,https://ccs.ca/topic/women-in-cv-sciences/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational he...


In [30]:
df_chunk_with_context["chunk_text_with_context"].iloc[0]


'This chunk provides an overview of the CCS/HLS Atherosclerosis Research Award, including its purpose, eligibility criteria, application process, award details, and review procedures, as described in the official call for applications and related guidelines. --- Home\nCCS/HLS Atherosclerosis Research Award\nThe Canadian Cardiovascular Society and HLS Therapeutics are proud to support the CCS/HLS Atherosclerosis Research Award. This research award will support innovative groundbreaking clinical research projects in the area of atherosclerosis. Priority will be given to research that has the potential to advance clinical knowledge of atherosclerosis process linked to inflammation, membrane stability and biomarkers.\nThis Award is intended for Canadian MDs in the first 5 years of their first clinical appointment in an academic department for a research project on the topic of atherosclerosis.\nA single award of $35,000 will be awarded based on an independent peer review process.\n2022 Cal

# Upload chunks to Supabase with Contextual Embedding (Anthropic)

In [18]:
import pandas as pd
from openai import OpenAI
from supabase import create_client, Client
from tqdm import tqdm

# Load environment variables (e.g. from a .env file)
load_dotenv()

# Retrieve your API key (ensure SUSUPABASE_URL is set appropriately)
SUPABASE_URL = os.getenv("SUPABASE_URL")
if not SUPABASE_URL:
    raise ValueError("SUPABASE_URL not set in environment variables.")

# Retrieve your API key (ensure SUPABASE_KEY is set appropriately)
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_KEY:
    raise ValueError("SUPABASE_KEY not set in environment variables.")


supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

In [28]:
import pandas as pd
from tqdm import tqdm

# Load your DataFrame
df = pd.read_csv("url_chunked_enriched.csv")

# Store successful uploads
uploaded_records = []

# Process each row
for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Uploading embeddings"):
    chunk_text_with_context = row['chunk_text_with_context']
    client_id = row["client_id"]
    
    # Create metadata
    metadata = {
        "url": row['url'],
        "chunk_id": row['chunk_id'],
        "client_id":row["client_id"],
        "date": row['date']
    }
    
    # Generate embedding
    try:
        response = client.embeddings.create(
            input=[chunk_text_with_context],
            model="text-embedding-3-small"
        )
        embedding = response.data[0].embedding
    except Exception as e:
        print(f"Embedding failed: {e}")
        continue  # skip to next row

    # Insert into Supabase
    try:
        data = {
            "content": chunk_text_with_context,
            "metadata": metadata,
            "embedding": embedding,
            "client_id": client_id
        }
        supabase.table("documents").insert(data).execute()
        
        # Save record locally after successful insert
        uploaded_record = {
            "url": row['url'],
            "chunk_id": row['chunk_id'],
            "date": row['date'],
            "chunk_text_with_context": chunk_text_with_context,
            "metadata": metadata,
            "embedding": embedding,
            "client_id": client_id
        }
        uploaded_records.append(uploaded_record)
        
    except Exception as e:
        print(f"Supabase insert failed: {e}")
        continue

# ✅ After loop ends, save the local copy:
if uploaded_records:
    uploaded_df = pd.DataFrame(uploaded_records)
    uploaded_df.to_csv("uploaded_chunks_embedding.csv", index=False)
    print("[✓] Local copy saved to 'uploaded_chunks_embedding.csv'")
else:
    print("[!] No records uploaded, no CSV created.")


Uploading embeddings: 100%|██████████| 1660/1660 [09:09<00:00,  3.02it/s] 


[✓] Local copy saved to 'uploaded_chunks_embedding.csv'


# URL report

In [20]:
!pip install anytree



In [21]:
df_report = pd.read_csv("./url_chunked_enriched.csv")
df_report

Unnamed: 0,url,chunk_id,chunk_text,client_id,date,chunk_text_with_context
0,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,1,Home\nCCS/HLS Atherosclerosis Research Award\n...,ccs,2025-07-29T20:54:23,This chunk provides an overview of the CCS/HLS...
1,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,1,Home\nCCS/HLS Atherosclerosis Research Award\n...,ccs,2025-07-29T20:54:23,This chunk provides an overview of the CCS/HLS...
2,https://ccs.ca/,1,Canadian Cardiovascular Society\nStrong heart ...,ccs,2025-07-29T20:54:23,Overview of the Canadian Cardiovascular Societ...
3,https://ccs.ca/ccs-research-awards/ccs-covid-1...,1,Home\nAbout\nAwards\nCCS Research Fellowships ...,ccs,2025-07-29T20:54:23,This chunk provides detailed information about...
4,https://ccs.ca/ccs-research-awards/ccs-covid-1...,2,CCS funds will be dispensed in 1 installment.\...,ccs,2025-07-29T20:54:23,"Details on Phase II funding, progress reportin..."
...,...,...,...,...,...,...
1655,https://ccs.ca/topic/vaccination-vaccination-p...,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational me...
1656,https://ccs.ca/topic/vasculaire/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational me...
1657,https://ccs.ca/topic/vascular/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational he...
1658,https://ccs.ca/topic/women-in-cv-sciences/,1,Skip to main content\nUtility Menu\nJoin Us\nN...,ccs,2025-07-29T20:54:23,This chunk represents the main navigational he...


In [22]:
from anytree import Node, RenderTree
from urllib.parse import urlparse

# Build nodes for each path
root = Node("root")
nodes = {}

for segments in df_report['url'].apply(lambda url: urlparse(url).path.strip("/").split("/")):
    current = root
    path = ""
    for seg in segments:
        path += "/" + seg
        if path not in nodes:
            nodes[path] = Node(seg, parent=current)
        current = nodes[path]

# Print tree
for pre, fill, node in RenderTree(root):
    print(f"{pre}{node.name}")

root
├── 2021
│   └── 07
│       └── 02
│           └── ccs-hls-atherosclerosis-research-award
├── 
├── ccs-research-awards
│   ├── ccs-covid-19-challenge-for-canada-initiative-ccs-c3i
│   ├── ccs-bayer-resident-vascular-award
│   ├── ccs-hls-atherosclerosis-research-award
│   ├── ccs-bms-pfizer-af-research-award
│   ├── ccs-pfizer-heart-failure-disparities-in-indigenous-communities-research-fellowship-award
│   ├── ccs-bms-hypertrophic-cardiomyopathy-hcm-research-award
│   ├── ccs-novonordisk-cardiometabolic-research-award
│   ├── ccs-pfizer-chf-alliance-research-fellowship-award-in-first-nations-inuit-and-metisfnim-communities-experiencing-heart-function-hf-inequities-handbook
│   ├── ccs-pfizer-attr-cm-research-trainee-award_handbook
│   └── ccs-pfizer-attr-cm-research-trainee-award
├── canadian-pediatric-cardiology-association
├── bursaries-grants-and-awards
├── calculators-and-forms
├── c3i-dashboard
├── canadian-cardiac-transplant-network
├── canadian-cardiovascular-congress-cong

In [23]:
from anytree import Node, RenderTree
from urllib.parse import urlparse
from collections import defaultdict, Counter

# Step 1: Extract segments
df_report['segments'] = df_report['url'].apply(lambda url: [seg for seg in urlparse(url).path.strip("/").split("/") if seg])

# Step 2: Build tree with counts
root = Node("root")
nodes = {"": root}
url_counts = Counter()

for seg_list in df_report['segments']:
    path = ""
    current = root
    for seg in seg_list:
        path += "/" + seg
        url_counts[path] += 1
        if path not in nodes:
            nodes[path] = Node(f"{seg}", parent=current)
        current = nodes[path]

# Step 3: Add counts to node names (and optionally limit children)
for path, node in nodes.items():
    if path != "":
        node.name = f"{node.name} ({url_counts[path]})"

# Optional: limit child expansion (e.g., top 5 only)
def prune_tree(node, top_n=5):
    if len(node.children) > top_n:
        # Sort children by count descending
        children = sorted(node.children, key=lambda x: int(x.name.split("(")[-1].strip(")")), reverse=True)
        for child in children[top_n:]:
            child.parent = None  # Detach from tree

    for child in node.children:
        prune_tree(child, top_n=top_n)

# Prune below root or any level you want
prune_tree(root, top_n=5)

# Step 4: Render
for pre, fill, node in RenderTree(root):
    print(f"{pre}{node.name}")


root
├── event (177)
│   ├── af-webinar-3-focus-on-screening-and-prevention (2)
│   ├── lipids-atherosclerosis-when-close-is-not-enough (2)
│   ├── canadian-cardiovascular-congress-2021 (2)
│   ├── canadian-perspectives-on-the-esc-2021 (2)
│   └── perspectives-in-clinical-cardiology-i-do-not-want-a-stroke-conundrums-in-anticoagulation-paroxysmal-atrial-fibrillation (2)
├── news (299)
│   ├── new-approach-to-heart-failure-therapy-upends-conventional-treatment-saves-lives (4)
│   ├── top-10-takeaways-lipids (6)
│   ├── sweeping-dyslipidemia-guidelines-urge-improved-screening-new-medications-and-lifestyle-counselling-but-hold-the-fish-oil-supplements (6)
│   ├── celebrating-asian-heritage-month-honoring-diversity-in-cardiovascular-care (6)
│   └── empowering-those-at-greatest-risk-of-heart-failure-to-pursue-testing-and-treatment (6)
├── guideline (455)
│   ├── 2021-lipids (33)
│   │   ├── chapter-1-introduction (2)
│   │   ├── chapter-2-definitions (1)
│   │   ├── chapter-6-conclusions (1

In [24]:
summary = (
    df_report['segments']
    .apply(lambda x: x[0] if x else 'root')
    .value_counts()
    .reset_index()
)
summary.columns = ['Top-Level Path', 'Total URLs']
summary['% of Total'] = (summary['Total URLs'] / summary['Total URLs'].sum() * 100).round(2)
summary


Unnamed: 0,Top-Level Path,Total URLs,% of Total
0,guideline,455,27.41
1,news,299,18.01
2,event,177,10.66
3,topic,69,4.16
4,news_topic,60,3.61
...,...,...,...
238,welcome-to-the-2022-virtual-cardiac-surgery-tr...,1,0.06
239,2021-pan-ontario-residents-bootcamp,1,0.06
240,2021-virtual-ccs-ccsa-pediatric-trainee-review...,1,0.06
241,membership-status-english,1,0.06


In [26]:
top_6 = summary[:6]
top_6

Unnamed: 0,Top-Level Path,Total URLs,% of Total
0,guideline,455,27.41
1,news,299,18.01
2,event,177,10.66
3,topic,69,4.16
4,news_topic,60,3.61
5,fr,36,2.17


In [None]:
top_6.to_csv("ccs_url_top_6_summary.csv", index=False)