In [1]:
!pip install selenium chromedriver-py tqdm pandas requests



In [2]:
# Import Libraries
import sys
import logging
import time
import json
from selenium.webdriver.remote.remote_connection import LOGGER
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from tqdm.notebook import tqdm
import pandas as pd
from chromedriver_py import binary_path
import requests

# Reduce log verbosity
LOGGER.setLevel(logging.WARNING)

# Browser Setup
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("window-size=1900,800")
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
)

def get_browser():
    service = Service(executable_path=binary_path)
    wd = webdriver.Chrome(service=service, options=chrome_options)
    return wd

# Initialize browser
browser = get_browser()

# Scraper Configuration
max_page = 300  # Set to -1 for unlimited pages
sleep_time = 1
final = []

# Function to scrape TEDx talks
def scrape_talks():
    print("Starting TEDx talks scraping...")
    for page in tqdm(range(0, max_page)):
        payload = [
            {
                "indexName": "newest",
                "params": {
                    "attributeForDistinct": "objectID",
                    "distinct": 1,
                    "facets": ["subtitle_languages", "tags"],
                    "highlightPostTag": "__/ais-highlight__",
                    "highlightPreTag": "__ais-highlight__",
                    "hitsPerPage": 24,
                    "maxValuesPerFacet": 500,
                    "page": page,
                    "query": "",
                    "tagFilters": ""
                },
            }
        ]
        try:
            r = requests.post(
                'https://zenith-prod-alt.ted.com/api/search',
                headers={
                    'Content-type': 'application/json; charset=UTF-8',
                    "User-Agent": "curl/7.64.1",
                },
                json=payload,
            )
            print(f"Page {page} - Status Code: {r.status_code}")
            if r.status_code == 200 and r.headers.get('Content-Type') == 'application/json':
                data = r.json()
                talks = data.get('results', [{}])[0].get("hits", [])
                final.extend(talks)
            else:
                print(f"Unexpected response on page {page}: {r.text}")
        except requests.RequestException as e:
            print(f"Request failed on page {page}: {e}")
        except JSONDecodeError as e:
            print(f"JSON decode error on page {page}: {e}")
        time.sleep(sleep_time)

# Parse talk details
def parse_talks():
    final_list = []
    for talk in final:
        slug = talk["slug"]
        final_list.append(
            {
                'id': talk["objectID"],
                'slug': talk["slug"],
                'speakers': talk["speakers"],
                'title': talk["title"],
                "url": f'https://www.ted.com/talks/{slug}',
            }
        )
    return final_list

# Scrape transcript for each talk
def scrape_transcripts(talks_list):
    transcripts = []
    for talk in tqdm(talks_list):
        url = talk["url"] + "/transcript"
        try:
            browser.get(url)
            time.sleep(sleep_time)
            transcript_elements = browser.find_elements(By.CLASS_NAME, "Grid__cell")
            transcript = " ".join([elem.text for elem in transcript_elements])
            talk["transcript"] = transcript
            transcripts.append(talk)
        except Exception as e:
            print(f"Failed to fetch transcript for {talk['url']}: {e}")
    return transcripts

# Save data to CSV
def save_to_csv(data, filename):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

# Run Scraper
scrape_talks()
talks = parse_talks()
transcripts = scrape_transcripts(talks)

# Save results
save_to_csv(talks, "tedx_talks.csv")
save_to_csv(transcripts, "tedx_transcripts.csv")

# Close the browser
browser.quit()
print("TEDx Scraping Completed!")

Starting TEDx talks scraping...


  0%|          | 0/300 [00:00<?, ?it/s]

JSONDecodeError: Expecting value: line 1 column 1 (char 0)