## Scraping 224 URLs from the CFA Institute's Webpage

### Importing required packages

In [14]:
import time
from pathlib import Path

from bs4 import BeautifulSoup
from selenium import webdriver

### Scrape function that extracts the links carried by anchor tags with the `CoveoResultLink` class on the given webpage and saves them to a .txt file

- The strategy uses the Edge browser WebDriver to make the Beautiful Soup scraper wait for a fixed amount of time before scraping, because the links to be extracted are loaded dynamically onto the page only after a particular JavaScript snippet has executed successfully.

In [15]:
def scrape_coveo_links(url, output_path="../sample_output/224_links.txt", wait_seconds=5):
    """Load ``url`` in Edge and append every ``CoveoResultLink`` href to a text file.

    The page is opened in a Selenium-driven Edge browser, given ``wait_seconds``
    to finish loading, and the resulting page source is parsed with
    BeautifulSoup. The ``href`` of each ``<a class="CoveoResultLink">`` element
    is appended to ``output_path``, one per line; anchors without an ``href``
    are skipped.

    Args:
        url: Address of the page to load.
        output_path: Text file the links are appended to. The file is opened in
            append mode, so existing content is kept; its parent directory must
            already exist.
        wait_seconds: Fixed pause, in seconds, between requesting the page and
            reading its source. May need adjusting if the page loads slowly.

    Returns:
        None. A short summary is printed instead.

    Raises:
        OSError: If ``output_path`` cannot be opened for appending.
        Exception: Any error raised by Selenium propagates after the browser
            has been closed.
    """
    # Edge options; the command-line flag is carried over from the original setup.
    edge_options = webdriver.EdgeOptions()
    edge_options.add_argument('--enable-chrome-browser-cloud-management')

    driver = webdriver.Edge(options=edge_options)

    try:
        driver.get(url)

        # Fixed pause so the page can finish loading before the source is read.
        time.sleep(wait_seconds)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Anchors carrying the 'CoveoResultLink' class.
        coveo_links = soup.find_all('a', class_='CoveoResultLink')
        hrefs = [link.get('href') for link in coveo_links]

        # Explicit UTF-8 so the append does not depend on the platform's locale
        # encoding (which may be unable to encode some characters).
        with open(output_path, "a", encoding="utf-8") as file:
            file.writelines(f"{href}\n" for href in hrefs if href)

        # The count covers every matching anchor, including any without an href.
        print(f"Total number of Coveo class links: {len(coveo_links)}")
        print(f"Coveo class links saved to '{Path(output_path).name}'")

    finally:
        # Always close the browser, even if loading, parsing or writing failed.
        driver.quit()

### Looping to scrape links page by page taking `pagination` into account

In [16]:
# Pagination settings. The listing is addressed through the URL fragment:
# `first` is the offset of the first result and `numberOfResults` the page size.
RESULTS_PER_PAGE = 100
LAST_PAGE_INDEX = 2  # pages 2, 1, 0 -> offsets 200, 100, 0

# Must be the same file scrape_coveo_links appends to.
file_path = '../sample_output/224_links.txt'

# Start from an empty file, because scrape_coveo_links opens it in append mode.
# Opening in write mode fails with FileNotFoundError only when the parent
# directory is missing, so create it first and let any remaining error
# propagate instead of scraping into a file that could not be reset.
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
with open(file_path, 'w', encoding='utf-8'):
    pass  # opening in write mode is what truncates the file
print(f"The file '{file_path}' has been emptied.")

# Walk the pages from the last one down to the first.
for count in range(LAST_PAGE_INDEX, -1, -1):
    url = (
        "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings"
        f"#first={count * RESULTS_PER_PAGE}"
        "&sort=%40refreadingcurriculumyear%20descending"
        f"&numberOfResults={RESULTS_PER_PAGE}"
    )
    scrape_coveo_links(url)

The file '../sample_output/224_links.txt' has been emptied.
Total number of Coveo class links: 24
Coveo class links saved to '224_links.txt'
Total number of Coveo class links: 100
Coveo class links saved to '224_links.txt'
Total number of Coveo class links: 100
Coveo class links saved to '224_links.txt'
