In [2]:
%pip install requests beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [beautifulsoup4]
[1A[2KSuccessfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.


## 1. Parse the Sitemap with Namespaces

In [7]:
import requests
import xml.etree.ElementTree as ET
import urllib3
import pandas as pd
from datetime import datetime

# Suppress SSL warnings for self-signed certificates
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

sitemap_index_url = 'https://ccs.ca/sitemap_index.xml'
response = requests.get(sitemap_index_url, verify=False)
root = ET.fromstring(response.content)

namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

# Step 1: Get all child sitemap URLs
sitemap_urls = [sitemap.find('ns:loc', namespaces).text.strip()
                for sitemap in root.findall('ns:sitemap', namespaces)]

all_urls = []

# Step 2: Loop through each sitemap and collect <url> entries
for sitemap_url in sitemap_urls:
    resp = requests.get(sitemap_url, verify=False)
    child_root = ET.fromstring(resp.content)
    for url in child_root.findall('ns:url', namespaces):
        loc = url.find('ns:loc', namespaces).text.strip()
        all_urls.append(loc)

# Preview the first few entries
for item in all_urls[:10]:
    print(item)

print()
print(f"Total de Urls: {len(all_urls)}")

https://ccs.ca/2021/07/02/ccs-hls-atherosclerosis-research-award/
https://ccs.ca/2021/07/02/ccs-hls-atherosclerosis-research-award/
https://ccs.ca/
https://ccs.ca/ccs-research-awards/ccs-covid-19-challenge-for-canada-initiative-ccs-c3i/
https://ccs.ca/ccs-research-awards/ccs-covid-19-challenge-for-canada-initiative-ccs-c3i/
https://ccs.ca/ccs-research-awards/ccs-bayer-resident-vascular-award/
https://ccs.ca/ccs-research-awards/ccs-hls-atherosclerosis-research-award/
https://ccs.ca/ccs-research-awards/ccs-hls-atherosclerosis-research-award/
https://ccs.ca/ccs-research-awards/ccs-bms-pfizer-af-research-award/
https://ccs.ca/ccs-research-awards/ccs-bms-pfizer-af-research-award/

Total de Urls: 1136


In [10]:
import pandas as pd

df = pd.DataFrame(all_urls, columns=['url'])
df.to_csv('ccs_scraped_all_urls.csv', index=False)


In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import ast
import time
from tqdm import tqdm
from datetime import datetime

# Load your input file (adjust filename as needed)
df = pd.read_csv('ccs_scraped_all_urls.csv')

#df = df[30:61]

def extract_url(raw):
    try:
        return ast.literal_eval(raw)['url']
    except:
        return raw

df['url'] = df['url'].apply(extract_url)

# Define selectors with human-readable labels
selectors = [
    ("main#main-content", ('main', {'id': 'main-content'})),
    ("div#masterContentArea", ('div', {'id': 'masterContentArea'})),
    ("div.hs_cos_wrapper_type_rich_text", ('div', {'class': 'hs_cos_wrapper_type_rich_text'})),
    ("article", ('article', {})),
]

# Scraper logic with tracking
def extract_page_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    
    for label, (tag, attrs) in selectors:
        section = soup.find(tag, attrs)
        if section:
            return section.get_text(separator="\n", strip=True), label

    if soup.body:
        return soup.body.get_text(separator="\n", strip=True), "body"

    return "[NO CONTENT FOUND]", "none"

# Scrape all URLs
results = []
today = datetime.now().isoformat(timespec='seconds')

for url in tqdm(df['url']):
    try:
        response = requests.get(url, timeout=10, verify=False)
        response.raise_for_status()
        content, selector = extract_page_content(response.text)
        status = "ok" if content.strip() and selector != "none" else "no_content"
    except Exception as e:
        content = f"[ERROR] {e}"
        selector = "error"
        status = "error"

    results.append({
        'url': url,
        'content': content,
        'selector_used': selector,
        'status': status,
        'client_id': client_id,
        "date": today  # Add current date here
    })

    time.sleep(1)  # Be polite

# Save output
output_df = pd.DataFrame(results)
output_df.to_csv('scraped_content_with_status.csv', index=False)

print("✅ Done. Results saved to 'scraped_content_with_status.csv'")


100%|██████████| 1136/1136 [25:48<00:00,  1.36s/it] 

✅ Done. Results saved to 'scraped_content_with_status.csv'





In [17]:
df = pd.read_csv("./scraped_content_with_status.csv")
df

Unnamed: 0,url,content,selector_used,status,client_id,date
0,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,Home\nCCS/HLS Atherosclerosis Research Award\n...,main#main-content,ok,ccs,2025-07-29T20:26:05
1,https://ccs.ca/2021/07/02/ccs-hls-atherosclero...,Home\nCCS/HLS Atherosclerosis Research Award\n...,main#main-content,ok,ccs,2025-07-29T20:26:05
2,https://ccs.ca/,Canadian Cardiovascular Society\nStrong heart ...,main#main-content,ok,ccs,2025-07-29T20:26:05
3,https://ccs.ca/ccs-research-awards/ccs-covid-1...,Home\nAbout\nAwards\nCCS Research Fellowships ...,main#main-content,ok,ccs,2025-07-29T20:26:05
4,https://ccs.ca/ccs-research-awards/ccs-covid-1...,Home\nAbout\nAwards\nCCS Research Fellowships ...,main#main-content,ok,ccs,2025-07-29T20:26:05
...,...,...,...,...,...,...
1131,https://ccs.ca/topic/vaccination-vaccination-p...,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1132,https://ccs.ca/topic/vasculaire/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1133,https://ccs.ca/topic/vascular/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05
1134,https://ccs.ca/topic/women-in-cv-sciences/,Skip to main content\nUtility Menu\nJoin Us\nN...,body,ok,ccs,2025-07-29T20:26:05


In [18]:
df["selector_used"].value_counts()

selector_used
main#main-content    984
body                 151
error                  1
Name: count, dtype: int64

In [19]:
df["status"].value_counts()

status
ok       1135
error       1
Name: count, dtype: int64

In [20]:
# Filter rows where status is 'no_content'
error_content_urls = df[df['status'] == 'error']['url']

# Display them
error_content_urls


187    https://ccs.ca/kerr-award-past-recipients/
Name: url, dtype: object

In [21]:
# Filter rows where status is 'no_content'
no_content_urls = df[df['status'] == 'no_content']['url']

# Display them
no_content_urls

Series([], Name: url, dtype: object)

In [23]:
df[df["status"] == "ok"]["url"].to_csv("ccs_scraped_working_urls.csv", index=False)