## Dataset Acquisition

In [None]:
# --- Install dependencies ---
!apt-get update
!apt install -y chromium-chromedriver
!pip install selenium

# --- Imports ---
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,381 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,809 kB]
Get:13 https://r2u.stat

In [None]:
# --- Build a headless Chrome WebDriver ---
def init_driver():
    """Create and return a Chrome WebDriver started with headless flags.

    The command-line switches below are added to a fresh ``Options`` object,
    in the listed order, before the browser is launched.
    """
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    browser_options = Options()
    for flag in chrome_flags:
        browser_options.add_argument(flag)
    browser = webdriver.Chrome(options=browser_options)
    return browser


In [None]:
# --- Turn one catalogue table row into a flat record ---
def extract_row_data(row, table_name):
    """Build a dict describing a single catalogue row element.

    Args:
        row: Element exposing ``get_attribute``, ``find_element`` and
            ``find_elements`` (a table row located by the caller).
        table_name: Value stored under the "Table" key of the record.

    Returns:
        A dict with the keys "id", "Table", "Pre-packaged Job Solutions",
        "URL", "Remote Testing (y/n)", "Adaptive/IRT (y/n)" and "Test Type",
        or ``None`` when any lookup raises (the error is printed).
    """
    try:
        # Prefer the course id; fall back to the entity id when it is empty.
        identifier = row.get_attribute("data-course-id")
        if not identifier:
            identifier = row.get_attribute("data-entity-id")

        link = row.find_element(By.CSS_SELECTOR, "td.custom__table-heading__title a")
        record = {
            "id": identifier,
            "Table": table_name,
            "Pre-packaged Job Solutions": link.text.strip(),
            "URL": link.get_attribute("href"),
        }

        # A "-yes" circle in the 2nd / 3rd cell marks the flag as set.
        remote_hits = row.find_elements(By.CSS_SELECTOR, "td:nth-child(2) .catalogue__circle.-yes")
        record["Remote Testing (y/n)"] = "Yes" if remote_hits else "No"

        adaptive_hits = row.find_elements(By.CSS_SELECTOR, "td:nth-child(3) .catalogue__circle.-yes")
        record["Adaptive/IRT (y/n)"] = "Yes" if adaptive_hits else "No"

        key_labels = []
        for key_elem in row.find_elements(By.CSS_SELECTOR, ".product-catalogue__key"):
            key_labels.append(key_elem.text)
        record["Test Type"] = " ".join(key_labels)

        return record
    except Exception as e:
        print(f"Error parsing row: {e}")
        return None

In [None]:

# --- Scrape a single page for a table type ---
def scrape_page(driver, url, table_name, timeout=10):
    """Load one catalogue page and return the parsed rows.

    Args:
        driver: WebDriver used to load the page and locate rows.
        url: Page address to open.
        table_name: Table label; "Pre-packaged Job Solutions" selects rows by
            ``data-course-id``, any other value selects by ``data-entity-id``.
        timeout: Seconds to wait for the first matching row to be present.

    Returns:
        A list of row records. Rows that ``extract_row_data`` could not parse
        are left out. An empty list is returned (and the error printed) when
        waiting for or locating the rows raises.
    """
    print(f"Loading {table_name} page: {url}")
    driver.get(url)

    wait = WebDriverWait(driver, timeout)
    row_selector = "tr[data-course-id]" if table_name == "Pre-packaged Job Solutions" else "tr[data-entity-id]"

    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, row_selector)))
        rows = driver.find_elements(By.CSS_SELECTOR, row_selector)
        print(f"Found {len(rows)} rows.")
        # Parse each row exactly once: every parse costs several browser
        # round-trips and prints its own error message on failure.
        parsed = (extract_row_data(row, table_name) for row in rows)
        return [record for record in parsed if record]
    except Exception as e:
        print(f"Error loading page {url}: {e}")
        return []


In [None]:
# --- Walk every page of one catalogue table ---
def scrape_table(driver, base_url, table_name, start_range, step, type_value, total_pages):
    """Collect the rows of one table by visiting its pages in order.

    Page offsets start at ``start_range`` and advance by ``step`` for
    ``total_pages`` pages; each page URL is ``base_url`` plus ``start`` and
    ``type`` query parameters. Returns the concatenated list of rows that
    ``scrape_page`` produced, printing a running total after every page.
    """
    stop_offset = start_range + step * total_pages
    collected = []
    for offset in range(start_range, stop_offset, step):
        page_rows = scrape_page(
            driver,
            f"{base_url}?start={offset}&type={type_value}",
            table_name,
        )
        collected += page_rows
        running_total = len(collected)
        print(f"Collected {running_total} rows so far from {table_name}")
    return collected

In [None]:
# --- Main function ---
def main():
    """Scrape both catalogue tables and write them to ``shl_catalog.csv``.

    The browser is always closed, even when scraping or saving raises.
    The "Table" key of each record is not written to the CSV. When nothing
    was scraped, a CSV containing only the header row is written.
    """
    output_columns = [
        "id",
        "Pre-packaged Job Solutions",
        "URL",
        "Remote Testing (y/n)",
        "Adaptive/IRT (y/n)",
        "Test Type",
    ]

    driver = init_driver()
    try:
        base_url = "https://www.shl.com/solutions/products/product-catalog/"

        # Table 1: Pre-packaged Job Solutions
        prepackaged_data = scrape_table(
            driver,
            base_url,
            table_name="Pre-packaged Job Solutions",
            start_range=0,
            step=12,
            type_value=2,
            total_pages=12
        )

        # Table 2: Individual Test Solutions
        individual_data = scrape_table(
            driver,
            base_url,
            table_name="Individual Test Solutions",
            start_range=0,
            step=12,
            type_value=1,
            total_pages=32
        )

        # Combine and save. Passing `columns` selects and orders the output
        # columns and also works when the combined list is empty.
        combined_data = prepackaged_data + individual_data
        df = pd.DataFrame(combined_data, columns=output_columns)
        df.to_csv("shl_catalog.csv", index=False)
        print("Saved to shl_catalog.csv")
    finally:
        # Release the browser process regardless of how scraping ended.
        driver.quit()

# --- Run ---
# Runs the whole scrape defined in main() when this cell is executed.
main()

Loading Pre-packaged Job Solutions page: https://www.shl.com/solutions/products/product-catalog/?start=0&type=2
Found 12 rows.
Collected 12 rows so far from Pre-packaged Job Solutions
Loading Pre-packaged Job Solutions page: https://www.shl.com/solutions/products/product-catalog/?start=12&type=2
Found 12 rows.
Collected 24 rows so far from Pre-packaged Job Solutions
Loading Pre-packaged Job Solutions page: https://www.shl.com/solutions/products/product-catalog/?start=24&type=2
Found 12 rows.
Collected 36 rows so far from Pre-packaged Job Solutions
Loading Pre-packaged Job Solutions page: https://www.shl.com/solutions/products/product-catalog/?start=36&type=2
Found 12 rows.
Collected 48 rows so far from Pre-packaged Job Solutions
Loading Pre-packaged Job Solutions page: https://www.shl.com/solutions/products/product-catalog/?start=48&type=2
Found 12 rows.
Collected 60 rows so far from Pre-packaged Job Solutions
Loading Pre-packaged Job Solutions page: https://www.shl.com/solutions/produ

## Enriching the dataset with per-product details

In [None]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Headless Chrome configuration for Colab or a similar environment:
# each switch is added to the options object in the order listed.
chrome_options = Options()
for _chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_chrome_flag)

# Start the browser session used by the cells below
driver = webdriver.Chrome(options=chrome_options)

In [None]:
# Read the catalogue CSV and attach four empty-string columns that act as
# placeholders for the detail fields.
data = pd.read_csv("shl_catalog.csv").assign(
    **{
        "Description": "",
        "Job Levels": "",
        "Languages": "",
        "Assessment Length": "",
    }
)

# Function to retrieve specific field content by heading name
def fetch_field(driver, label):
    """Return the text of the page section whose heading matches ``label``.

    Each ``div.product-catalogue-training-calendar__row.typ`` element on the
    current page is inspected; the heading is its ``h4`` text and the value
    is its ``p`` text. The heading comparison ignores case and surrounding
    whitespace.

    Args:
        driver: WebDriver already positioned on the page to read.
        label: Heading text to look for.

    Returns:
        The stripped paragraph text of the first matching section, or ``""``
        when no section matches or the sections cannot be located.
    """
    try:
        sections = driver.find_elements(By.CSS_SELECTOR, "div.product-catalogue-training-calendar__row.typ")
    except Exception:
        # Best effort: a page that cannot be queried yields an empty field.
        return ""

    for section in sections:
        try:
            title = section.find_element(By.TAG_NAME, "h4").text.strip().lower()
            if title == label.lower():
                return section.find_element(By.TAG_NAME, "p").text.strip()
        except Exception:
            # Sections without the expected h4/p are skipped. Catching
            # Exception (not a bare except) lets KeyboardInterrupt through,
            # so a long scrape can still be interrupted.
            continue
    return ""

In [None]:
# Go through each entry in the dataset
# The browser is closed in the `finally` clause once the loop ends (or is
# interrupted), so `driver` must be re-created before running this cell again.
try:
    for idx, entry in data.iterrows():
        page_url = entry["URL"]
        print(f"Processing entry {idx+1} of {len(data)}: {page_url}")

        try:
            driver.get(page_url)
            time.sleep(2.5)  # fixed pause after navigation before reading the page

            data.at[idx, "Description"] = fetch_field(driver, "Description")
            data.at[idx, "Job Levels"] = fetch_field(driver, "Job levels")
            data.at[idx, "Languages"] = fetch_field(driver, "Languages")

            # When the text contains "=", keep only the part after the last "=".
            length = fetch_field(driver, "Assessment length")
            if "=" in length:
                data.at[idx, "Assessment Length"] = length.split("=")[-1].strip()
            else:
                data.at[idx, "Assessment Length"] = length

        except Exception as err:
            # A failing page is reported and skipped; its placeholder values stay.
            print(f"Error at index {idx} for URL: {page_url}")
            print(err)
finally:
    # Release the browser process whether or not the loop completed.
    driver.quit()

# Save the updated dataset
data.to_csv("shl_catalog_detailed.csv", index=False)
print("Completed! File saved as shl_catalog_detailed.csv")

Processing entry 1 of 518: https://www.shl.com/solutions/products/product-catalog/view/account-manager-solution/
Processing entry 2 of 518: https://www.shl.com/solutions/products/product-catalog/view/administrative-professional-short-form/
Processing entry 3 of 518: https://www.shl.com/solutions/products/product-catalog/view/agency-manager-solution/
Processing entry 4 of 518: https://www.shl.com/solutions/products/product-catalog/view/apprentice-8-0-job-focused-assessment-4261/
Processing entry 5 of 518: https://www.shl.com/solutions/products/product-catalog/view/apprentice-8-0-job-focused-assessment/
Processing entry 6 of 518: https://www.shl.com/solutions/products/product-catalog/view/bank-administrative-assistant-short-form/
Processing entry 7 of 518: https://www.shl.com/solutions/products/product-catalog/view/bank-collections-agent-short-form/
Processing entry 8 of 518: https://www.shl.com/solutions/products/product-catalog/view/bank-operations-supervisor-short-form/
Processing ent