In [None]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup



In [None]:
# Catalogue landing page; collect_assessments appends "?type=<query_type>" to it
# to pick the section to scrape.
CATALOG_URL = "https://www.shl.com/solutions/products/product-catalog/"
# Headers passed to requests.get(); only a browser-style User-Agent is set.
# NOTE(review): presumably needed because the site rejects the default
# python-requests agent - confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124 Safari/537.36"
}

In [None]:

def parse_assessment_table(table):
    """
    Extract one record per assessment row from a catalogue ``<table>`` element.

    The first ``<tr>`` is treated as the header and skipped. Rows with fewer
    than four ``<td>`` cells are ignored. The four cells are read as:
    name/link, remote-testing marker, adaptive/IRT marker, test-type keys.

    Args:
        table: A BeautifulSoup tag (or any object offering the same
            ``select`` / ``find`` / ``select_one`` API) for the table.

    Returns:
        A list of dicts with the keys "Assessment Name", "URL",
        "Remote Testing", "Adaptive/IRT", "Test Type" and "Duration".
        "Duration" is always "N/A" because the table does not carry it.
        "URL" is "" when the row has no link or the link has no href.
    """
    site_root = "https://www.shl.com"
    results = []

    for row in table.select("tr")[1:]:  # Skip the header row
        columns = row.select("td")
        if len(columns) < 4:
            continue

        # Name and URL. urljoin keeps already-absolute hrefs intact and inserts
        # the missing "/" for path-relative ones; plain concatenation did neither.
        title_tag = columns[0].find("a")
        if title_tag:
            title = title_tag.text.strip()
            href = (title_tag.get("href", "") or "").strip()
            link = urljoin(site_root, href) if href else ""
        else:
            title = "Untitled"
            link = ""

        # A "yes" circle inside the cell marks the feature as supported.
        is_remote = "Yes" if columns[1].select_one(".catalogue__circle.-yes") else "No"
        is_adaptive = "Yes" if columns[2].select_one(".catalogue__circle.-yes") else "No"

        # Test types are rendered as one key element per type.
        type_tags = columns[3].select(".product-catalogue__key")
        test_types = ", ".join(tag.text.strip() for tag in type_tags) if type_tags else "N/A"

        results.append({
            "Assessment Name": title,
            "URL": link,
            "Remote Testing": is_remote,
            "Adaptive/IRT": is_adaptive,
            "Test Type": test_types,
            "Duration": "N/A"  # Duration not available from table
        })

    return results


In [None]:

def collect_assessments(base_url, label, query_type, timeout=20):
    """
    Scrape every page of one catalogue section and return its rows.

    Starts at ``{base_url}?type={query_type}`` and keeps following the page's
    "Next" link. On each page, the first table whose
    ``th.custom__table-heading__title`` text contains ``label`` is handed to
    ``parse_assessment_table``.

    Args:
        base_url: Catalogue URL without a query string.
        label: Text expected inside the heading cell of the wanted table.
        query_type: Value for the ``type`` query parameter.
        timeout: Seconds to wait for each HTTP request (default 20).

    Returns:
        A list of row dicts in page order. Scraping is best-effort: on a
        request error, a non-200 response, a missing table or a pagination
        loop, a message is printed and the rows collected so far are returned.
    """
    current_url = f"{base_url}?type={query_type}"
    full_results = []
    seen_urls = set()  # guards against a "Next" link that points backwards

    while current_url:
        seen_urls.add(current_url)

        # A timeout keeps a stalled connection from hanging the notebook forever.
        try:
            response = requests.get(current_url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error fetching {current_url} - {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching {current_url} - Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        # <title> may be missing, or contain no single string (.string is None).
        page_title = (soup.title.string or "").strip() if soup.title else "Untitled"
        print(f"📄 Processing page: {page_title}")

        relevant_table = None
        for tbl in soup.find_all("table"):
            title_cell = tbl.find("th", class_="custom__table-heading__title")
            if title_cell and label in title_cell.text:
                relevant_table = tbl
                break

        if relevant_table is None:
            print(f"⚠️ Couldn't find the expected table: {label}")
            break

        full_results.extend(parse_assessment_table(relevant_table))

        # `string=` is the supported spelling; `text=` is deprecated in bs4.
        next_page = soup.find("a", string="Next")
        next_href = next_page.get("href") if next_page else None
        if not next_href:
            print("✅ No more pages to scrape.")
            break

        # Resolve against the current page so absolute, root-relative and
        # query-only hrefs all produce a valid URL.
        next_url = urljoin(current_url, next_href)
        if next_url in seen_urls:
            print(f"⚠️ Pagination loops back to {next_url}; stopping.")
            break

        current_url = next_url
        print("➡️ Navigating to next page...\n")
        time.sleep(1)  # be polite to the server between pages

    return full_results



In [None]:
def scrape_shl_data():
    """
    Scrape both catalogue sections and combine them into one DataFrame.

    "Pre-packaged Job Solutions" (type "2") is collected first, then
    "Individual Test Solutions" (type "1"); rows keep that order.

    Returns:
        pandas.DataFrame built from the combined list of row dicts.
    """
    print("🚀 Starting SHL assessment scraping...")

    # (table label, value of the ``type`` query parameter), in scrape order.
    sections = (
        ("Pre-packaged Job Solutions", "2"),
        ("Individual Test Solutions", "1"),
    )

    records = []
    for section_label, type_code in sections:
        records.extend(collect_assessments(CATALOG_URL, section_label, type_code))

    catalog = pd.DataFrame(records)
    print(f"✅ Scraped {len(catalog)} total assessments.")
    return catalog




In [None]:
def export_results(df, filename="shl_assesment_results.csv"):
    """
    Write the scraped rows to a CSV file, omitting the index column.

    Nothing is written when ``df`` is empty; a warning is printed instead.

    Args:
        df: DataFrame to export.
        filename: Destination path of the CSV file.
    """
    if not df.empty:
        df.to_csv(filename, index=False)
        print(f"📁 Data saved to {filename}")
        return

    print("⚠️ No data found to export.")

# Entry point: scrape both catalogue sections, then write them to the default
# CSV file name used by export_results.
if __name__ == "__main__":
    data_frame = scrape_shl_data()
    export_results(data_frame)

🚀 Starting SHL assessment scraping...
📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...



  next_page = soup.find("a", text="Next")


📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
➡️ Navigating to next page...

📄 Processing page: Talent Assessments Catalog | SHL
✅ No more pages to scrape.
📄 Processing page: Talent Assessments Catalog | SHL
⚠️ Couldn't find the expected table: In

In [None]:
# prompt: load shl_catalog_result csv as dataframe
# (The file actually read below is shl_assesment_results.csv, the default name
# written by export_results.)

import pandas as pd

# Absolute path under /content; adjust it if the CSV was written elsewhere.
# NOTE(review): presumably /content is the working directory of the scrape cell - confirm.
df = pd.read_csv('/content/shl_assesment_results.csv')
print(df.head()) # Display first few rows to verify


                            Assessment Name  \
0                  Account Manager Solution   
1  Administrative Professional - Short Form   
2                   Agency Manager Solution   
3   Apprentice + 8.0 Job Focused Assessment   
4     Apprentice 8.0 Job Focused Assessment   

                                                 URL Remote Testing  \
0  https://www.shl.com/solutions/products/product...            Yes   
1  https://www.shl.com/solutions/products/product...            Yes   
2  https://www.shl.com/solutions/products/product...            Yes   
3  https://www.shl.com/solutions/products/product...            Yes   
4  https://www.shl.com/solutions/products/product...            Yes   

  Adaptive/IRT   Test Type  Duration  
0          Yes  C, P, A, B       NaN  
1          Yes     A, K, P       NaN  
2          Yes  A, B, P, S       NaN  
3           No        B, P       NaN  
4           No        B, P       NaN  


In [None]:


# Keep every row that has an exact twin across all columns (keep=False marks
# all members of a duplicate group), ordered so that twins sit next to each other.
duplicate_rows = (
    df[df.duplicated(keep=False)]
    .sort_values(by=list(df.columns))
)

duplicate_rows


Unnamed: 0,Assessment Name,URL,Remote Testing,Adaptive/IRT,Test Type,Duration


In [None]:
# Show the column labels of the loaded results frame.
df.columns


Index(['Assessment Name', 'URL', 'Remote Testing', 'Adaptive/IRT', 'Test Type',
       'Duration'],
      dtype='object')

In [None]:

import time

# Rebinds the HEADERS global assigned in an earlier cell; this User-Agent
# additionally carries an "AppleWebKit/537.36" token. Any function that reads
# HEADERS after this cell has run (get_detail_data below, and
# collect_assessments if re-run) sends this value.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36"
}

def get_detail_data(url):
    """
    Fetch an assessment's detail page and read four text sections from it.

    The page is requested with the global HEADERS and a 20-second timeout.
    Each section is searched among the
    ``div.product-catalogue-training-calendar__row`` elements: the first row
    whose ``<h4>`` text (stripped) equals the heading supplies the text of its
    first ``<p>``.

    Args:
        url: Address of the detail page.

    Returns:
        A 4-tuple ``(description, job_levels, languages, assessment_length)``.
        A section is "N/A" when no heading matches or the matching row has no
        ``<p>``. If anything raises, the error is printed and all four values
        are "N/A".
    """
    section_headings = ("Description", "Job levels", "Languages", "Assessment length")

    try:
        reply = requests.get(url, headers=HEADERS, timeout=20)
        reply.raise_for_status()
        page = BeautifulSoup(reply.content, "html.parser")

        def section_text(wanted):
            # Walk the rows in document order; the first heading match decides.
            for block in page.find_all("div", class_="product-catalogue-training-calendar__row"):
                header = block.find("h4")
                if not header or header.text.strip() != wanted:
                    continue
                paragraph = block.find("p")
                if paragraph:
                    return paragraph.get_text(strip=True)
                return "N/A"
            return "N/A"

        return tuple(section_text(heading) for heading in section_headings)

    # Deliberately broad: one bad page must not abort a long scraping run.
    except Exception as e:
        print(f"⚠️ Error fetching {url}: {e}")
        return "N/A", "N/A", "N/A", "N/A"


def enrich_with_details(csv_path, output_path="/content/shl_assesment_results_final.csv"):
    """
    Add detail-page fields to a previously scraped catalogue CSV.

    Reads ``csv_path``, calls ``get_detail_data`` on each row's "URL" value and
    stores the result in the columns "description", "job_levels", "languages"
    and "assessment_length". Progress is printed per row and the loop sleeps
    one second between requests. The enriched frame is written to
    ``output_path`` without the index.

    Args:
        csv_path: CSV file containing at least "Assessment Name" and "URL".
        output_path: Destination of the enriched CSV.
    """
    detail_columns = ("description", "job_levels", "languages", "assessment_length")

    df = pd.read_csv(csv_path)

    # Start every detail column out as empty strings.
    for column in detail_columns:
        df[column] = ""

    total = len(df)
    for idx, name, url in zip(df.index, df["Assessment Name"], df["URL"]):
        print(f"🔎 Scraping {idx+1}/{total}: {name}")
        desc, levels, langs, length = get_detail_data(url)
        for column, value in zip(detail_columns, (desc, levels, langs, length)):
            df.at[idx, column] = value

        # Sleep a bit to avoid hammering the server
        time.sleep(1)

    df.to_csv(output_path, index=False)
    print(f"✅ Enriched data saved to {output_path}")

# Entry point: enrich the CSV at the path below with detail-page fields; the
# result goes to enrich_with_details' default output path.
if __name__ == "__main__":
    # Adjust filename to your CSV file
    enrich_with_details("/content/shl_assesment_results.csv")


🔎 Scraping 1/141: Account Manager Solution
🔎 Scraping 2/141: Administrative Professional - Short Form
🔎 Scraping 3/141: Agency Manager Solution
🔎 Scraping 4/141: Apprentice + 8.0 Job Focused Assessment
🔎 Scraping 5/141: Apprentice 8.0 Job Focused Assessment
🔎 Scraping 6/141: Bank Administrative Assistant - Short Form
🔎 Scraping 7/141: Bank Collections Agent - Short Form
🔎 Scraping 8/141: Bank Operations Supervisor - Short Form
🔎 Scraping 9/141: Bilingual Spanish Reservation Agent Solution
🔎 Scraping 10/141: Bookkeeping, Accounting, Auditing Clerk Short Form
🔎 Scraping 11/141: Branch Manager - Short Form
🔎 Scraping 12/141: Cashier Solution
🔎 Scraping 13/141: Claims/Operations Supervisor Solution
🔎 Scraping 14/141: Contact Center Customer Service + 8.0
🔎 Scraping 15/141: Contact Center Customer Service 8.0
🔎 Scraping 16/141: Contact Center Manager - Short Form
🔎 Scraping 17/141: Contact Center Sales & Service + 8.0
🔎 Scraping 18/141: Contact Center Sales & Service 8.0
🔎 Scraping 19/141: 