In [1]:
import time
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from chromedriver_py import binary_path
import pandas as pd
import requests

# Crawl settings.
MAX_PAGES = 300  # page count handed to fetch_metadata()
DELAY = 1  # base pause, in seconds, used with time.sleep() between requests

# Chrome launch flags, applied in the order listed.
chrome_opts = webdriver.ChromeOptions()
for _chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "window-size=1900,800",
):
    chrome_opts.add_argument(_chrome_flag)

def setup_browser():
    """Create a Chrome WebDriver configured with the module-level options.

    Returns:
        A ``webdriver.Chrome`` instance started through the driver executable
        at ``binary_path`` and configured with ``chrome_opts``.
    """
    return webdriver.Chrome(service=Service(binary_path), options=chrome_opts)

# Start the shared Chrome session; landing_page() uses this module-level
# `browser` object.
# NOTE(review): nothing in the visible code calls browser.quit() -- confirm
# the session is shut down elsewhere.
browser = setup_browser()

def landing_page():
    """Open the TED talks listing and accept the cookie prompt if possible.

    Loads the "newest" talks page in the shared ``browser``, waits
    ``3 * DELAY`` seconds, then tries to click the element with id
    ``onetrust-accept-btn-handler``. Any failure while locating or clicking
    it is reported on stdout and otherwise ignored. Finishes with one more
    ``DELAY``-second pause.
    """
    print("Opening TED homepage...")
    browser.get('https://www.ted.com/talks?sort=newest')
    time.sleep(3 * DELAY)
    try:
        browser.find_element(By.ID, 'onetrust-accept-btn-handler').click()
    except Exception as e:
        # Best effort: a missing or unclickable prompt must not stop the run.
        print(f"No cookie prompt to handle: {e}")
    time.sleep(DELAY)

# Load the talks listing once in the shared browser and try to accept the
# cookie prompt.
landing_page()

def fetch_metadata(limit, timeout=10):
    """Collect raw talk records from the TED search endpoint, page by page.

    Args:
        limit: Number of result pages to request (pages ``0`` to
            ``limit - 1``), 24 hits per page.
        timeout: Seconds to wait for each HTTP response before that page is
            abandoned and reported as an error.

    Returns:
        A flat list of the ``"hits"`` entries from every page that was
        fetched successfully. Failed pages are reported on stdout and
        skipped, so the list may be shorter than ``limit * 24``.
    """
    results = []
    for page in tqdm(range(limit), desc="Fetching metadata"):
        payload = [{"indexName": "newest", "params": {"hitsPerPage": 24, "page": page}}]
        try:
            response = requests.post(
                'https://zenith-prod-alt.ted.com/api/search',
                headers={'Content-type': 'application/json'},
                json=payload,
                # Without a timeout a stalled connection would block the
                # whole crawl indefinitely.
                timeout=timeout,
            )
            if response.status_code == 200:
                # A missing, null or empty "results" list means "no hits for
                # this page" rather than an error.
                result_sets = response.json().get('results') or []
                hits = (result_sets[0].get("hits") or []) if result_sets else []
                results.extend(hits)
            else:
                print(f"Failed page {page}: {response.status_code}")
        except Exception as err:
            # Best effort per page: report the failure and keep crawling.
            print(f"Error fetching page {page}: {err}")
        # Pause between requests to avoid hammering the endpoint.
        time.sleep(DELAY)
    return results

# Fetch the raw search hits for MAX_PAGES pages (performs network requests).
metadata = fetch_metadata(MAX_PAGES)


def process_metadata(raw_data):
    """Reduce raw search hits to the fields the scraper keeps.

    Args:
        raw_data: Iterable of dict-like hits; every field is read with
            ``.get`` so missing keys are tolerated.

    Returns:
        A list with one dict per hit, in input order, holding ``id``,
        ``slug``, ``title``, ``speakers`` and ``url``. Missing values fall
        back to ``""`` (``[]`` for speakers, ``"N/A"`` for the slug).
    """
    def _summarise(hit):
        # The slug is looked up once because it is also embedded in the URL.
        talk_slug = hit.get("slug", "N/A")
        return {
            'id': hit.get("objectID", ""),
            'slug': talk_slug,
            'title': hit.get("title", ""),
            'speakers': hit.get("speakers", []),
            'url': f'https://www.ted.com/talks/{talk_slug}',
        }

    return [_summarise(hit) for hit in raw_data]

# Keep only id, slug, title, speakers and url for each raw hit.
metadata_processed = process_metadata(metadata)

def fetch_talk_details(talks, timeout=10):
    """Fetch per-talk details from the TED GraphQL endpoint.

    Args:
        talks: Iterable of dicts, each with a ``"slug"`` key (a missing key
            raises ``KeyError``).
        timeout: Seconds to wait for each HTTP response before that talk is
            abandoned and reported as an error.

    Returns:
        A list of dicts with ``slug``, ``description``, ``duration``,
        ``presenter``, ``published_at`` and ``topics`` (a list of topic
        names). Talks whose request fails, or for which no video data comes
        back, are omitted; request failures are reported on stdout.
    """
    details = []
    for talk in tqdm(talks, desc="Fetching talk details"):
        slug = talk["slug"]
        query = f"""
        {{video(slug: "{slug}", language: "en") {{
            description duration presenterDisplayName publishedAt 
            topics {{nodes {{name}}}}
        }}}}
        """
        try:
            response = requests.post(
                'https://www.ted.com/graphql',
                headers={'Content-type': 'application/json'},
                json=[{"query": query}],
                # Without a timeout a stalled connection would block the
                # whole crawl indefinitely.
                timeout=timeout,
            )
            if response.status_code == 200:
                # dict.get's default only covers a missing key; a JSON null
                # comes back as None, hence the explicit `or` fallbacks.
                envelope = response.json()[0]
                data = (envelope.get("data") or {}).get("video") or {}
                if data:
                    topic_nodes = (data.get("topics") or {}).get("nodes") or []
                    details.append({
                        "slug": slug,
                        "description": data.get("description", ""),
                        "duration": data.get("duration", 0),
                        "presenter": data.get("presenterDisplayName", ""),
                        "published_at": data.get("publishedAt", ""),
                        "topics": [t["name"] for t in topic_nodes],
                    })
            else:
                # Report HTTP failures instead of dropping the talk silently.
                print(f"Failed details for {slug}: {response.status_code}")
        except Exception as err:
            # Best effort per talk: report the failure and keep going.
            print(f"Error fetching details for {slug}: {err}")
        # Pause between requests to avoid hammering the endpoint.
        time.sleep(DELAY)
    return details


# Look up details for every processed talk, one request per slug
# (performs network requests).
talk_details = fetch_talk_details(metadata_processed)



def save_data(metadata, details):
    """Write both result sets to CSV files in the current directory.

    Args:
        metadata: Rows written to ``ted_talks_metadata.csv``.
        details: Rows written to ``ted_talks_details.csv``.

    Both arguments are passed to ``pd.DataFrame`` and written without the
    index column. Prints a confirmation once both files are written.
    """
    outputs = (
        (metadata, 'ted_talks_metadata.csv'),
        (details, 'ted_talks_details.csv'),
    )
    for rows, filename in outputs:
        frame = pd.DataFrame(rows)
        frame.to_csv(filename, index=False)
    print("Data saved successfully.")

# Write the processed metadata and the per-talk details to their CSV files.
save_data(metadata_processed, talk_details)


Navigating to https://www.ted.com/talks?sort=newest


  0%|          | 0/300 [00:00<?, ?it/s]

Total talks scraped: 6864


  0%|          | 0/6864 [00:00<?, ?it/s]

Saving data to CSV files...
Scraping completed! Data saved to CSV files.
