In [32]:
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

# Enrich the SHL catalogue CSV with a description and an assessment duration
# scraped from each product page (the URL is read from the "Link" column).
# The result is written to a new CSV; the input file is left untouched.

# --- Settings ---
INPUT_CSV = "shl_full_catalog.csv"
OUTPUT_CSV = "shl_enriched_catalog.csv"
DETAIL_ROW_CLASS = "product-catalogue-training-calendar__row"
PAGE_LOAD_WAIT_SECONDS = 2.5  # fixed pause after navigation so content can load
_FIRST_NUMBER = re.compile(r"\d+")  # compiled once, reused for every page


def _normalise_duration(value):
    """Return "<N> minutes" for the first integer in *value*, else *value* as is."""
    match = _FIRST_NUMBER.search(value)
    return f"{match.group()} minutes" if match else value


def _extract_details(html):
    """Return ``(description, duration)`` parsed from a product page's HTML.

    Every detail row is scanned: the row's <h4> text selects the field and the
    row's first <p> supplies the value. A field that is not found is returned
    as "". When several rows match the same field, the last one wins.
    """
    soup = BeautifulSoup(html, 'html.parser')
    description = ""
    duration = ""

    for block in soup.find_all("div", class_=DETAIL_ROW_CLASS):
        title_tag = block.find("h4")
        if not title_tag:
            continue  # a row without a heading cannot be classified
        title = title_tag.get_text(strip=True).lower()

        value_tag = block.find("p")
        value = value_tag.get_text(strip=True) if value_tag else ""

        if "description" in title:
            description = value
        elif "assessment length" in title:
            duration = _normalise_duration(value)

    return description, duration


# --- Load CSV ---
# Read the input before launching Chrome so a bad path fails without leaving
# a browser process behind.
df = pd.read_csv(INPUT_CSV)

# Add empty columns
df["description"] = ""
df["duration"] = ""

# --- Setup Selenium in headless mode ---
options = Options()
options.add_argument("--headless=new")  # better compatibility with Chrome 115+
driver = webdriver.Chrome(options=options)

try:
    # --- Loop through all rows ---
    for idx, url in tqdm(df["Link"].items(), total=len(df)):
        try:
            driver.get(url)
            time.sleep(PAGE_LOAD_WAIT_SECONDS)  # wait for content to load
            description, duration = _extract_details(driver.page_source)
        except Exception as e:
            # Best effort: one broken page must not abort the whole crawl.
            print(f"❌ Failed to process {url}: {e}")
            description, duration = "", ""

        # Save to DataFrame
        df.at[idx, "description"] = description
        df.at[idx, "duration"] = duration or "Unknown"

    # --- Save updated CSV ---
    df.to_csv(OUTPUT_CSV, index=False)
finally:
    # Always shut the browser down, including on Ctrl-C or a failed save;
    # otherwise the headless Chrome process is orphaned.
    driver.quit()

print("✅ Saved: shl_enriched_catalog.csv")


100%|████████████████████████████████████████████████████████████████████████████████| 441/441 [29:36<00:00,  4.03s/it]


✅ Saved: shl_enriched_catalog.csv
