In [4]:
!pip install selenium 

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
   ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
   --------------- ------------------------ 3.7/9.4 MB 31.3 MB/s eta 0:00:01
   -------------------- ------------------- 4.7/9.4 MB 13.6 MB/s eta 0:00:01
   ------------------------ --------------- 5.8/9.4 MB 10.1 MB

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Setup headless Chrome
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)

def extract_table_data():
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    rows = soup.select("table tbody tr")
    data = []

    for row in rows:
        tds = row.find_all('td')
        if len(tds) < 4:
            continue

        name_tag = tds[0].find('a')
        test_name = name_tag.text.strip() if name_tag else ""
        test_link = "https://www.shl.com" + name_tag['href'] if name_tag else ""
        remote = "Yes" if tds[1].find('span', class_='catalogue__circle -yes') else "No"
        adaptive = "Yes" if tds[2].find('span', class_='catalogue__circle -yes') else "No"
        test_types = [span.text.strip() for span in tds[3].find_all('span', class_='product-catalogue__key')]
        test_types = ", ".join(test_types)

        data.append({
            "Test Name": test_name,
            "Link": test_link,
            "Remote Testing": remote,
            "Adaptive/IRT": adaptive,
            "Test Types": test_types
        })
    return data

def process_tab(tab_text, total_pages):
    print(f"\nProcessing: {tab_text}")

    # Wait for the tab container to be loaded
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "tab") or contains(@class, "nav") or contains(@class, "tabs")]'))
    )

    # Find possible tabs
    tab_buttons = driver.find_elements(By.XPATH, '//div[contains(@class, "tab") or contains(@class, "nav") or contains(@class, "tabs")]//div | //button')

    print("\n-- Available tabs:")
    for i, btn in enumerate(tab_buttons):
        print(f"[{i}] '{btn.text}'")
    
    # Find and click the right one
    for btn in tab_buttons:
        if tab_text.lower() in btn.text.strip().lower():
            driver.execute_script("arguments[0].click();", btn)
            break
    else:
        raise Exception(f"Tab with text '{tab_text}' not found.")

    time.sleep(2)

    all_data = []

    for page in range(1, total_pages + 1):
        print(f"  Scraping page {page}/{total_pages}...")

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "custom__table-responsive"))
        )

        all_data.extend(extract_table_data())

        # Click next
        if page < total_pages:
            try:
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.LINK_TEXT, "Next"))
                )
                driver.execute_script("arguments[0].click();", next_btn)
                time.sleep(2)
            except Exception as e:
                print(f"Could not click 'Next' on page {page}: {e}")
                break

    return all_data

# Open the catalog URL
driver.get("https://www.shl.com/solutions/products/product-catalog/")

# Extract both tabs
prepackaged_data = process_tab("Pre-Packaged Job Solutions", total_pages=12)
individual_data = process_tab("Individual Test Solutions", total_pages=32)

driver.quit()

# Combine and save
all_data = prepackaged_data + individual_data
df = pd.DataFrame(all_data)
df.to_csv("shl_full_catalog.csv", index=False)
print("\nSaved all data to 'shl_full_catalog.csv'")



Processing: Pre-Packaged Job Solutions

-- Available tabs:
[0] ''
[1] ''
[2] ''
[3] ''
[4] ''
[5] ''
[6] ''
[7] ''
[8] ''
[9] 'Use necessary cookies only'
[10] ''
[11] 'Customize'
[12] 'Allow all cookies'
[13] ''
[14] ''
[15] ''
[16] ''
[17] 'Contact
Practice Tests
Support





Login
Buy Online'
[18] ''
[19] ''
[20] 'Solutions
HR Priorities
Resources
Careers
About
Book a Demo'
[21] 'Solutions
HR Priorities
Resources
Careers
About
Book a Demo'
[22] ''
[23] ''
[24] ''
[25] ''
[26] ''
[27] 'Solutions
HR Priorities
Resources
Careers
About
Book a Demo'
[28] ''
[29] ''
[30] ''
[31] ''
[32] ''
[33] ''
[34] ''
[35] ''
[36] ''
[37] ''
[38] ''
[39] ''
[40] ''
[41] ''
[42] ''
[43] ''
[44] ''
[45] ''
[46] ''
[47] ''
[48] ''
[49] ''
[50] ''
[51] ''
[52] ''
[53] ''
[54] ''
[55] ''
[56] ''
[57] ''
[58] ''
[59] ''
[60] ''
[61] ''
[62] ''
[63] ''
[64] ''
[65] ''
[66] ''
[67] ''
[68] ''
[69] ''
[70] ''
[71] ''
[72] ''
[73] ''
[74] ''
[75] ''
[76] ''
[77] ''
[78] ''
[79] ''
[80] ''
[81] ''
[82] ''
[83] 