In [10]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [1]:
# Site origin; the scraping functions prepend it to the hrefs they collect.
BASE_URL = "https://www.bls.gov"
# The /ooh/ page under that origin, which the group scraper loads first.
OOH_URL = BASE_URL + "/ooh/"

def get_occupation_groups():
    """Scrape the occupation-group links from the OOH landing page.

    Starts its own Chrome session, loads ``OOH_URL``, and reads every anchor
    matched by the group-list CSS selector.

    Returns:
        list[tuple[str, str]]: one ``(name, url)`` pair per anchor, where
        ``name`` is the stripped link text and ``url`` is ``BASE_URL``
        concatenated with the stripped ``href`` attribute.

    Note:
        The ``href`` is concatenated as-is. A later cell removes the leading
        prefix from every stored URL, which suggests the ``href`` values come
        back already absolute -- the concatenation is kept unchanged here so
        that clean-up cell keeps working.
    """
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(OOH_URL)
        time.sleep(3)  # Allow time for the page to load

        group_elements = driver.find_elements(
            By.CSS_SELECTOR, "#ooh-occupation-list ul.ooh-groups-col li a"
        )
        # Element text/attributes must be read while the session is still open,
        # so the list is fully built before the ``finally`` clause runs.
        return [
            (group.text.strip(), BASE_URL + group.get_attribute("href").strip())
            for group in group_elements
        ]
    finally:
        # Always close the browser, even if loading or parsing the page failed;
        # otherwise every failed run leaves a Chrome process behind.
        driver.quit()

# Run the group scraper once; `groups` keeps the raw (name, url) tuples.
groups = get_occupation_groups()

# Tabulate the tuples: one row per occupation group.
df_groups = pd.DataFrame(
    data=groups,
    columns=["OCC_GROUP", "OCC_GROUP_URL"],
)


In [2]:
# The group scraper stores BASE_URL + href, so a URL can carry the site prefix
# twice ("https://www.bls.govhttps://www.bls.gov/..."). Drop the extra prefix
# only when it is really there: unlike a blind fixed-width slice, this makes the
# cell safe to re-run and leaves already-correct URLs untouched.
new_urls = [
    url[len(BASE_URL):]
    if url.startswith(BASE_URL)
    and url[len(BASE_URL):].startswith(("http://", "https://"))
    else url
    for url in df_groups['OCC_GROUP_URL']
]
df_groups['OCC_GROUP_URL'] = new_urls

In [None]:
# Initialize WebDriver
def init_driver():
    """Start and return a new Chrome WebDriver session.

    The driver binary path comes from ``ChromeDriverManager().install()`` and is
    handed to Chrome through a ``Service`` object. The caller is responsible for
    calling ``quit()`` on the returned driver.
    """
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path))

def get_occupation_profiles(driver, group_url):
    """Collect the occupation links listed on one occupation-group page.

    Args:
        driver: an open WebDriver session; it is navigated to ``group_url``
            and left open for the caller to reuse.
        group_url: address of the occupation-group page to load.

    Returns:
        list[tuple[str, str]]: one ``(title, url)`` pair per matched anchor.
        ``url`` is ``BASE_URL`` concatenated with the anchor's stripped
        ``href``; the notebook output shows this yields a doubled
        "https://www.bls.govhttps://www.bls.gov/..." prefix, which a later
        cell removes.
    """
    driver.get(group_url)
    time.sleep(5)  # Allow time for the page to load

    anchors = driver.find_elements(By.CSS_SELECTOR, "#landing-page-table td.sorting_1 a")
    return [
        (anchor.text.strip(), BASE_URL + anchor.get_attribute("href").strip())
        for anchor in anchors
    ]

driver = init_driver()

# Scraping Occupation Profiles
all_profiles = []
try:
    # Only the URL column is needed, so iterate it directly instead of
    # materialising a Series per row with iterrows().
    for group_url in df_groups["OCC_GROUP_URL"]:
        all_profiles.extend(get_occupation_profiles(driver, group_url))
finally:
    # Close the browser even if one group page fails mid-scrape; otherwise an
    # aborted run leaves a Chrome process behind.
    driver.quit()

# Create DataFrame
df_profiles = pd.DataFrame(all_profiles, columns=["OCC_TITLE", "OCC_TITLE_URL"])

# Display DataFrame
df_profiles.head()

Unnamed: 0,OCC_TITLE,OCC_TITLE_URL
0,Aerospace Engineering and Operations Technolog...,https://www.bls.govhttps://www.bls.gov/ooh/arc...
1,Aerospace Engineers,https://www.bls.govhttps://www.bls.gov/ooh/arc...
2,Agricultural Engineers,https://www.bls.govhttps://www.bls.gov/ooh/arc...
3,Architects,https://www.bls.govhttps://www.bls.gov/ooh/arc...
4,Bioengineers and Biomedical Engineers,https://www.bls.govhttps://www.bls.gov/ooh/arc...


In [4]:
# The profile scraper stores BASE_URL + href, and the displayed frame shows the
# result carries the site prefix twice ("https://www.bls.govhttps://www.bls.gov/...").
# Drop the extra prefix only when it is really there: unlike a blind fixed-width
# slice, this makes the cell safe to re-run and leaves already-correct URLs untouched.
new_urls = [
    url[len(BASE_URL):]
    if url.startswith(BASE_URL)
    and url[len(BASE_URL):].startswith(("http://", "https://"))
    else url
    for url in df_profiles['OCC_TITLE_URL']
]
df_profiles['OCC_TITLE_URL'] = new_urls

In [None]:
# Initialize WebDriver
# NOTE(review): this redefines init_driver, which an earlier cell already
# defines with the same behaviour; the duplicate could be dropped.
def init_driver():
    """Start and return a new Chrome WebDriver session.

    The driver binary path comes from ``ChromeDriverManager().install()`` and is
    handed to Chrome through a ``Service`` object. The caller is responsible for
    calling ``quit()`` on the returned driver.
    """
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path))

# Header fragments of the quick-facts rows (checked in this order) and the
# result slot each one fills. "Median Pay" is handled separately because it
# fills two slots.
_QUICKFACT_FIELDS = (
    ("Job Outlook", "job_outlook"),
    ("Typical Entry-Level Education", "education"),
    ("Work Experience", "experience"),
    ("On-the-job Training", "training"),
    ("Number of Jobs", "jobs_2023"),
    ("Employment Change", "employment_change"),
)


def _split_median_pay(value):
    """Return ``(per_year, per_hour)`` strings parsed from the 'Median Pay' cell text."""
    pay_values = value.split("\n")
    if len(pay_values) > 1:
        # Two-line form: first line goes to the yearly slot, second to the hourly slot.
        return pay_values[0].strip(), pay_values[1].strip()
    # Single-line form: previously dropped entirely. Classify it by its suffix
    # so the figure is kept; anything unrecognised stays None.
    single = value.strip()
    lowered = single.lower()
    if "per hour" in lowered:
        return None, single
    if "per year" in lowered:
        return single, None
    return None, None


def _parse_quickfacts(rows):
    """Collect the quick-facts values from the table rows into a dict keyed by slot name."""
    facts = {"median_pay_year": None, "median_pay_hour": None}
    facts.update({key: None for _, key in _QUICKFACT_FIELDS})
    for row in rows:
        try:
            header = row.find_element(By.TAG_NAME, "th").text.strip()
            value = row.find_element(By.TAG_NAME, "td").text.strip()
        except Exception:
            # A row without both a header and a value cell carries no fact.
            continue
        if "Median Pay" in header:
            facts["median_pay_year"], facts["median_pay_hour"] = _split_median_pay(value)
            continue
        for fragment, key in _QUICKFACT_FIELDS:
            if fragment in header:
                facts[key] = value
                break
    return facts


def _open_tab(driver, tab_id):
    """Click the link pointing at ``#tab_id`` and return the element with that id."""
    tab_button = driver.find_element(By.CSS_SELECTOR, f"a[href='#{tab_id}']")
    # Clicked through JavaScript rather than WebElement.click().
    driver.execute_script("arguments[0].click();", tab_button)
    time.sleep(1)
    return driver.find_element(By.ID, tab_id)


def get_profile_data(driver, profile_url):
    """Scrape one occupation profile page.

    Args:
        driver: an open WebDriver session; it is navigated to ``profile_url``
            and left open for the caller to reuse.
        profile_url: address of the occupation profile page.

    Returns:
        A 13-item list
        ``[profile_url, median_pay_year, median_pay_hour, job_outlook,
        education, experience, training, jobs_2023, employment_change,
        duties, education_text, qualities, certifications]``.
        Quick-facts entries that are not found are ``None``; the tab-derived
        text fields default to ``""``. Returns ``None`` when the element with
        id ``quickfacts`` cannot be read or another unexpected error occurs.

    Only ``Exception`` subclasses are suppressed, so ``KeyboardInterrupt`` and
    ``SystemExit`` propagate and a long scrape can still be interrupted.
    Errors raised by ``driver.get`` itself are not caught.
    """
    driver.get(profile_url)
    time.sleep(2)  # Allow time for the page to load

    try:
        quickfacts_table = driver.find_element(By.ID, "quickfacts")
        facts = _parse_quickfacts(quickfacts_table.find_elements(By.TAG_NAME, "tr"))

        # Extract job duties (best effort: stays "" if the tab cannot be read)
        duties = ""
        try:
            duties_section = _open_tab(driver, "tab-2")
            duty_texts = (
                duty.text.strip()
                for duty in duties_section.find_elements(By.TAG_NAME, "li")
            )
            duties = ", ".join(text for text in duty_texts if text)
        except Exception:
            pass

        # Extract education, qualities, and certifications (best effort; fields
        # assigned before a failure keep their values, the rest stay "")
        education_text, qualities, certifications = "", "", ""
        try:
            become_one_section = _open_tab(driver, "tab-4")

            paragraph_texts = (
                p.text.strip()
                for p in become_one_section.find_elements(By.TAG_NAME, "p")
            )
            education_text = " ".join(text for text in paragraph_texts if text)

            qualities = ", ".join(
                q.text.strip()
                for q in become_one_section.find_elements(By.TAG_NAME, "strong")
            )

            certifications = ", ".join(
                c.text.strip()
                for c in become_one_section.find_elements(By.TAG_NAME, "a")
                if "Certification" in c.text
            )
        except Exception:
            pass

        return [
            profile_url,
            facts["median_pay_year"],
            facts["median_pay_hour"],
            facts["job_outlook"],
            facts["education"],
            facts["experience"],
            facts["training"],
            facts["jobs_2023"],
            facts["employment_change"],
            duties,
            education_text,
            qualities,
            certifications,
        ]
    except Exception:
        # Callers treat None as "skip this profile".
        return None

driver = init_driver()

# Scraping profile data
profile_data = []
try:
    # Iterate the two needed columns directly instead of building a Series per
    # row with iterrows().
    for occ_title, occ_url in zip(df_profiles["OCC_TITLE"], df_profiles["OCC_TITLE_URL"]):
        data = get_profile_data(driver, occ_url)
        if data:  # None means the profile page could not be parsed; skip it
            profile_data.append([occ_title] + data)
finally:
    # Close the browser even if the scrape is interrupted or a page load fails;
    # otherwise an aborted run leaves a Chrome process behind.
    driver.quit()

# Create DataFrame
df_full_profiles = pd.DataFrame(profile_data, 
                                columns=["OCC_TITLE", "OCC_TITLE_URL", "MEDIAN_PAY_YEAR", "MEDIAN_PAY_HOUR", "JOB_OUTLOOK", 
                                         "EDUCATION", "EXPERIENCE", "TRAINING", "JOBS_2023", "EMPLOYMENT_CHANGE", 
                                         "DUTIES", "EDUCATION_DETAILS", "QUALITIES", "CERTIFICATIONS"])

df_full_profiles.to_csv('bls_occupations_profiles.csv', index=False)

In [9]:
# Preview the first five scraped profiles.
df_full_profiles.head(n=5)

Unnamed: 0,OCC_TITLE,OCC_TITLE_URL,MEDIAN_PAY_YEAR,MEDIAN_PAY_HOUR,JOB_OUTLOOK,EDUCATION,EXPERIENCE,TRAINING,JOBS_2023,EMPLOYMENT_CHANGE,DUTIES,EDUCATION_DETAILS,QUALITIES,CERTIFICATIONS
0,Aerospace Engineering and Operations Technolog...,https://www.bls.gov/ooh/architecture-and-engin...,"$77,830 per year",$37.42 per hour,8% (Faster than average),Associate's degree,,,11000,900,Meet with aerospace engineers to discuss detai...,Aerospace engineering and operations technolog...,"Communication skills., Detail oriented., Inter...",Certified Aerospace Technician Core Certification
1,Aerospace Engineers,https://www.bls.gov/ooh/architecture-and-engin...,"$130,720 per year",$62.85 per hour,6% (Faster than average),Bachelor's degree,,,68900,4100,"Coordinate and direct the design, manufacture,...",Aerospace engineers typically need a bachelor’...,"Analytical skills., Business skills., Communic...",
2,Agricultural Engineers,https://www.bls.gov/ooh/architecture-and-engin...,"$88,750 per year",$42.67 per hour,8% (Faster than average),Bachelor's degree,,,1900,100,"Use computer software to design equipment, sys...",Agricultural engineers typically need a bachel...,"Analytical skills., Communication skills., Mat...",
3,Architects,https://www.bls.gov/ooh/architecture-and-engin...,"$93,310 per year",$44.86 per hour,8% (Faster than average),Bachelor's degree,,Internship/residency,127300,9900,Meet with clients to determine objectives and ...,There are typically three main steps to becomi...,"Analytical skills., Communication skills., Cre...",
4,Bioengineers and Biomedical Engineers,https://www.bls.gov/ooh/architecture-and-engin...,"$100,730 per year",$48.43 per hour,7% (Faster than average),Bachelor's degree,,,19700,1500,"Design equipment and devices, such as artifici...",Bioengineers and biomedical engineers typicall...,"Analytical skills., Communication skills., Cre...",
