# Transcript Extractor

## CoreIM

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [20]:
# Browser-like request headers passed explicitly to each session.get() call below.
headers = {
    # Adjacent string literals are concatenated into one User-Agent value.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "https://www.google.com/"
}

# Module-level session used by get_episode_links for all of its page requests.
session = requests.Session()

def get_episode_links(max_pg=24, timeout=30, delay=1.5):
    """Collect episode URLs from the CoreIM "episodes by topic" listing pages.

    Pages are fetched with the module-level ``session`` and ``headers``.

    Args:
        max_pg: Exclusive upper bound on the page number; pages
            ``1 .. max_pg - 1`` are requested (the default covers pages 1-23).
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises, so a stalled connection cannot hang the notebook.
        delay: Seconds to pause between consecutive page requests.

    Returns:
        list: ``href`` values of every ``h2.entry-title a`` anchor found, in
        page order. Pages that do not answer with HTTP 200 contribute nothing
        and are reported with a printed warning.
    """
    all_links = []
    for page in range(1, max_pg):
        if page > 1:
            # Throttle before every request after the first -- including the
            # one that follows a failed page -- so errors never cause a burst
            # of back-to-back requests.
            time.sleep(delay)
        url = f"https://www.coreimpodcast.com/category/episodes-by-topic/page/{page}/"
        print(f"Scraping page: {url}")
        res = session.get(url, headers=headers, timeout=timeout)
        if res.status_code != 200:
            print(f"Warning: Received status code {res.status_code} for page {page}")
            continue
        soup = BeautifulSoup(res.text, "html.parser")
        # `a[href]` skips anchors without an href attribute, which would
        # otherwise raise KeyError on the subscript below.
        all_links.extend(a["href"] for a in soup.select("h2.entry-title a[href]"))
    return all_links

In [21]:
# Smoke test: max_pg is an exclusive bound, so passing 2 requests page 1 only.
links = get_episode_links(2)

Scraping page: https://www.coreimpodcast.com/category/episodes-by-topic/page/1/


In [22]:
# Check what the requests-based scraper collected.
print(links)

[]


In [24]:
# Re-issue the page-1 request by hand to inspect the raw response.
url = f"https://www.coreimpodcast.com/category/episodes-by-topic/page/{1}/"
# Note: this rebinds the module-level `session` that get_episode_links uses.
session = requests.Session()
# Bound the wait so a stalled connection cannot hang the kernel.
res = session.get(url, headers=headers, timeout=30)

In [26]:
# Diagnose the empty link list: status code, response URL, and the start of the body.
print(res.status_code)
print(res.url)
print(res.text[:500])  # Peek at first 500 chars to see what it returns

202
https://www.coreimpodcast.com/category/episodes-by-topic/page/1/
<html><head><link rel="icon" href="data:;"><meta http-equiv="refresh" content="0;/.well-known/sgcaptcha/?r=%2Fcategory%2Fepisodes-by-topic%2Fpage%2F1%2F&y=ipc:99.106.63.80:1747424053.258"></meta></head></html>


In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import pandas as pd

def get_episode_links_selenium(num_pages=23):
    """Collect episode URLs from the listing pages using headless Chrome.

    Args:
        num_pages: Number of listing pages to visit; pages ``1 .. num_pages``
            (inclusive) are loaded in order.

    Returns:
        list: Non-empty ``href`` values of every ``h2.entry-title a`` element
        found, in page order.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)

    base_url = "https://www.coreimpodcast.com/category/episodes-by-topic/page/"

    all_links = []

    try:
        for page_num in range(1, num_pages + 1):
            url = f"{base_url}{page_num}/"
            print(f"Loading page: {url}")
            driver.get(url)
            time.sleep(3)  # wait for page to load fully

            # Find all episode links - they are inside h2 elements with class "entry-title"
            episode_elements = driver.find_elements(By.CSS_SELECTOR, "h2.entry-title a")
            # get_attribute returns None when the attribute is absent; keep
            # only real URLs so callers never receive None entries.
            page_links = [
                href
                for href in (elem.get_attribute("href") for elem in episode_elements)
                if href
            ]

            print(f"Found {len(page_links)} episodes on page {page_num}")
            all_links.extend(page_links)
    finally:
        # Always shut the browser down, even if a page load or lookup raises;
        # otherwise the headless Chrome process is left running.
        driver.quit()
    return all_links


In [29]:
# Trial run: num_pages is inclusive, so this loads listing pages 1 and 2.
links = get_episode_links_selenium(2)

Loading page: https://www.coreimpodcast.com/category/episodes-by-topic/page/1/
Found 0 episodes on page 1
Loading page: https://www.coreimpodcast.com/category/episodes-by-topic/page/2/
Found 0 episodes on page 2


In [30]:
# Display the links returned by the Selenium run.
links

[]