### Step 1: Install Required Libraries

In [None]:
pip install selenium webdriver-manager pandas beautifulsoup4

### Step 2: Import Libraries

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import re  # For cleaning text using regex
from bs4 import BeautifulSoup

import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

### Step 3: Set Up Selenium WebDriver

In [7]:
# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--window-size=1920,1080")  # Set window size

# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

### Step 4: Load the Web Page
Load the target URL and wait for the initial cards to load.

In [9]:
# URL of the page to scrape
url = "https://www.coursera.org/search?isPartOfCourseraPlus=true&sortBy=BEST_MATCH&trk_ref=globalnav"

# Load the page
driver.get(url)

# Wait for the page to fully load
try:
    WebDriverWait(driver, 20).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
except Exception as e:
    print(f"Error waiting for page to load: {e}")
    driver.quit()
    exit()

### Step 5: Click the "View All" Button
Locate and click the "View all (31 more)" button to load additional cards.

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless=new")  # Use the new headless mode
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--window-size=1920,1080")  # Set window size

try:
    # Initialize the WebDriver
    print("Initializing WebDriver...")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    print("WebDriver initialized successfully.")

    # URL of the page to scrape
    url = "https://www.coursera.org/search?isPartOfCourseraPlus=true&sortBy=BEST_MATCH&trk_ref=globalnav"

    # Load the page
    print(f"Loading URL: {url}")
    driver.get(url)

    # Wait for the page to fully load
    print("Waiting for page to load...")
    WebDriverWait(driver, 20).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )

    # Function to extract course titles
    def extract_course_titles():
        # Find all course title elements
        try:
            course_titles = driver.find_elements(By.CSS_SELECTOR, "a.cds-119 cds-113 cds-115 cds-CommonCard-titleLink css-vflzcf cds-142")
            if not course_titles:
                print("No course titles found with the current selector. Please check the CSS selector.")
            return [title.text for title in course_titles]
        except Exception as e:
            print(f"Error extracting course titles: {e}")
            return []

    # Extract initial course titles
    print("Extracting initial course titles...")
    initial_titles = extract_course_titles()
    print(f"Initial number of course cards: {len(initial_titles)}")
    for title in initial_titles:
        print(title)

    # Function to scroll to the bottom of the page
    def scroll_to_bottom():
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to the bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for new content to load

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break  # Stop scrolling if no new content is loaded
            last_height = new_height

    # Scroll to load additional course cards
    print("Scrolling to load additional course cards...")
    scroll_to_bottom()

    # Extract all course titles after scrolling
    print("Extracting all course titles after scrolling...")
    all_titles = extract_course_titles()
    print(f"Total number of course cards: {len(all_titles)}")
    for title in all_titles:
        print(title)

except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the browser
    print("Closing WebDriver...")
    driver.quit()

Initializing WebDriver...
WebDriver initialized successfully.
Loading URL: https://www.coursera.org/search?isPartOfCourseraPlus=true&sortBy=BEST_MATCH&trk_ref=globalnav
Waiting for page to load...
Extracting initial course titles...
No course titles found with the current selector. Please check the CSS selector.
Initial number of course cards: 0
Scrolling to load additional course cards...
Extracting all course titles after scrolling...
No course titles found with the current selector. Please check the CSS selector.
Total number of course cards: 0
Closing WebDriver...


In [None]:
# Click the "View all (31 more)" button to load additional cards
try:
    view_all_button = driver.find_element(By.CSS_SELECTOR, 'button[data-track-component="view_all_career_cards"]')
    view_all_button.click()
    
    # Wait for the additional cards to load
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "cds-9.css-b1mnpw.cds-11.cds-grid-item"))
    )
except Exception as e:
    print(f"Error clicking the 'View all' button or waiting for additional cards: {e}")
    driver.quit()
    exit()

### Step 6: Parse the Page with BeautifulSoup
Use BeautifulSoup to parse the page source and extract all cards.

In [None]:
# Get the page source and parse it with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Find all the cards with the specified class
cards = soup.find_all('li', class_='cds-9 css-b1mnpw cds-11 cds-grid-item')

print(f"Number of cards found: {len(cards)}")

### Step 7: Extract Data from Each Card
Loop through each card and extract the title, description, median salary, and jobs available. Use regex to clean the salary and jobs available text.

In [None]:
# Initialize a list to store the data
data = []

# Loop through each card and extract the details
for card in cards:
    try:
        # Extract the title
        title_element = card.find('h2', class_='cds-119 cds-Typography-base css-bbd009 cds-121')
        
        if title_element:
            title = title_element.text.strip()
        else:
            title = "Title not found"
            print("Title not found in card:", card.prettify())  # Debugging: Print the card HTML
        
        # Extract the description
        description = card.find('p', class_='css-4s48ix').text.strip()
        
        # Extract the salary and jobs available
        salary_jobs = card.find('div', class_='css-hr97go').text.strip()
        
        # Use regex to extract only the numbers
        median_salary = re.search(r'\$\d{1,3}(?:,\d{3})*', salary_jobs).group()  # Extract $90,500
        jobs_available = re.search(r'\d{1,3}(?:,\d{3})*', salary_jobs.split('jobs')[0]).group()  # Extract 82,489
        
        # Append the extracted data to the list
        data.append({
            'Title': title,
            'Description': description,
            'Median Salary': median_salary,
            'Jobs Available': jobs_available
        })
    except AttributeError as e:
        print(f"Error extracting data from a card: {e}")
        continue

### Step 8: Convert Data to a DataFrame
Convert the list of dictionaries into a pandas DataFrame.

In [None]:
# Convert the list to a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame as a table
print(df)

### Step 9: Save the DataFrame as a CSV File
Save the DataFrame to a CSV file for further analysis or sharing.

In [None]:
# Save the DataFrame to a CSV file
df.to_csv('career_academy_data.csv', index=False)

print("Data saved to 'career_academy_data.csv'")

### Step 10: Close the Browser
Close the Selenium WebDriver to free up resources.

In [None]:
# Close the browser
driver.quit()

In [None]:
df

In [None]:
# Convert 'Median Salary' to numeric for sorting
df['Salary'] = df['Median Salary'].replace('[\$,]', '', regex=True).astype(float)

# Sort by 'Median Salary' in descending order
df_sorted = df.sort_values(by='Salary', ascending=False)

# Convert 'Median Salary' back to formatted string
df_sorted['Salary'] = df_sorted['Salary'].apply(lambda x: f"${x:,.0f}")

# Display the sorted DataFrame
print(df_sorted[['Title', 'Salary']])

In [None]:
df