**This notebook explains how to scrape the CFA Institute website using Selenium and Beautiful Soup**

**Introduction** :-

What is Beautiful Soup?

Beautiful Soup is a Python library for scraping information from websites. It is very simple to use and extracts information by parsing the HTML structure of a page.


**Importing Libraries**

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time


**Initializing the driver in headless mode**

What is headless mode?

Headless mode is a Chrome mode in which the browser runs without any visible UI.

In [None]:
def initialize_driver():
    """Start a headless Chrome WebDriver and maximize its window.

    Returns:
        The newly created ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    # Headless: Chrome runs without any visible UI.
    options.add_argument("--headless")
    browser = webdriver.Chrome(options=options)
    browser.maximize_window()
    return browser


**Function to close any Privacy Warnings :-**

In [None]:
def close_privacy_warning(driver):
    """Click the privacy-warning close button if it shows up.

    Waits up to 10 seconds for the element with id ``closePrivacyWarning``
    to become clickable. If it never does, a message is printed and the
    function returns normally.

    Args:
        driver: The Selenium WebDriver currently showing the page.
    """
    close_locator = (By.ID, "closePrivacyWarning")
    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(close_locator)
        ).click()
    except (NoSuchElementException, TimeoutException):
        print("Privacy warning not found.")


**Function to click the Next button**

For now the sleep is set to 5 seconds; if timeouts occur, we can adjust it accordingly.

In [None]:
def click_next_button(driver):
    """Advance the results pager by one page.

    Waits up to 10 seconds for the ``coveo-pager-next`` element to become
    clickable, clicks it, then sleeps 5 seconds before returning.

    Args:
        driver: The Selenium WebDriver currently showing the listing page.

    Returns:
        True if the button was clicked, False if it could not be found or
        did not become clickable in time.
    """
    next_locator = (By.CLASS_NAME, "coveo-pager-next")
    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_locator)
        ).click()
        time.sleep(5)  # fixed pause after the click; tune if pages load slower
    except (NoSuchElementException, TimeoutException):
        return False
    return True


**Function to scrape**

Beautiful Soup parses the HTML structure of the page; for now we only scrape the title and link of each reading and append them to our list.

In [None]:
def scrape(driver, refresher_readings_list):
    """Collect the title and link of every reading on the current page.

    Waits up to 10 seconds for at least one ``coveo-title`` element to be
    present, parses the page source with Beautiful Soup, and appends one
    ``[title_text, href]`` pair per ``<h4 class="coveo-title">`` heading.

    Headings that contain no ``<a>`` tag, or whose anchor has no (or an
    empty) ``href``, are skipped instead of raising, so one malformed entry
    cannot abort the whole crawl.

    Args:
        driver: The Selenium WebDriver currently showing the listing page.
        refresher_readings_list: List that is extended in place.

    Raises:
        TimeoutException: If no ``coveo-title`` element appears in time.
    """
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'coveo-title'))
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for title in soup.find_all('h4', class_='coveo-title'):
        anchor = title.find('a')
        # .get() returns None for a missing attribute instead of raising.
        link = anchor.get('href') if anchor is not None else None
        if not link:
            continue
        refresher_readings_list.append([title.text.strip(), link])


**Function to get Introduction and Learning Outcomes**

After inspecting the website, we found that the Introduction can be located through the `article-section` class, but the Learning Outcomes are nested inside another section, so we had to adjust the logic a little.

In [None]:
def get_reading_detail_data(driver, reading):
    """Load one reading's page and extract its details.

    Args:
        driver: The Selenium WebDriver used to load the page.
        reading: A ``[title, url]`` pair; ``reading[1]`` is opened and
            ``reading[0]`` is used only in the timeout message.

    Returns:
        A 4-tuple ``(data, year, level, pdf_link)`` where ``data`` is a dict
        with the keys ``"introduction"``, ``"learning_outcomes"`` and
        ``"summary"`` (``"summary"`` is never filled in here). If the section
        headers do not load within 10 seconds, the placeholders
        ``({...empty strings...}, "Unknown", "Unknown", "No PDF link found")``
        are returned instead.
    """
    data = {"introduction": "", "learning_outcomes": "", "summary": ""}

    driver.get(reading[1])
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "section h2.article-section")
            )
        )
    except TimeoutException:
        print("Timeout waiting for section headers to load for reading:", reading[0])
        return data, "Unknown", "Unknown", "No PDF link found"

    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # Introduction: paragraphs of any <section> whose article-section header
    # mentions "Introduction". If several match, the last one wins.
    for section in soup.find_all('section'):
        header = section.find('h2', class_='article-section')
        if header and 'Introduction' in header.text.strip():
            data["introduction"] = ' '.join(p.text.strip() for p in section.find_all('p'))

    # Learning outcomes: the <ul> lives in the <section> that follows the
    # "Learning Outcomes" heading as a sibling, not in the heading's parent.
    learning_outcomes_section = soup.find('h2', string='Learning Outcomes')
    if learning_outcomes_section:
        next_section = learning_outcomes_section.find_next_sibling('section')
        if next_section:
            outcomes_list = next_section.find('ul')
            if outcomes_list:
                data["learning_outcomes"] = ' '.join(
                    li.text.strip() for li in outcomes_list.find_all('li')
                )

    year, level = extract_year_and_level(html_content)
    pdf_link = get_pdf_link(soup)
    return data, year, level, pdf_link


**Function to get Year and Level**

After inspecting the website, we found that both the year and the level are located in the `content-utility` class.

In [None]:
def extract_year_and_level(html_content):
    """Pull the curriculum year and the level out of a reading page.

    Both values are read from ``<span>`` tags inside the
    ``<div class="content-utility">`` element: the year from the
    ``content-utility-curriculum`` span and the level from the
    ``content-utility-topic`` span.

    Args:
        html_content: Raw HTML of the reading page.

    Returns:
        A ``(year, level)`` tuple of stripped strings; each falls back to
        ``"Unknown"`` when the container or the span is missing.
    """
    container = BeautifulSoup(html_content, 'html.parser').find(
        'div', class_='content-utility'
    )

    def _span_text(css_class):
        # Look the span up only when the container exists.
        span = container.find('span', class_=css_class) if container else None
        if span:
            return span.text.strip()
        return "Unknown"

    return (
        _span_text('content-utility-curriculum'),
        _span_text('content-utility-topic'),
    )


**Function to get Pdf link**

After inspecting the website, we found that the PDF link is located in the `locked-content` class.

In [None]:
def get_pdf_link(soup):
    """Return the PDF URL from a parsed reading page.

    Looks for the first ``<a>`` tag carrying the ``locked-content`` class.

    Args:
        soup: Parsed page exposing a Beautiful Soup style ``find`` method.

    Returns:
        The anchor's ``href`` value, or ``"No PDF link found"`` when there is
        no such anchor or it has no ``href`` attribute.
    """
    anchor = soup.find('a', class_='locked-content')
    # Guard clause: bail out unless a usable anchor was found.
    if not anchor or 'href' not in anchor.attrs:
        return "No PDF link found"
    return anchor['href']


**Function to scrape reading detail and get it all structured**

In [None]:
def scrape_reading_detail(refresher_readings_list):
    """Visit every collected reading and build a table of its details.

    A dedicated headless driver is started for the detail pages and is
    always shut down, even if one of the pages raises, so no Chrome process
    is left behind.

    Args:
        refresher_readings_list: Iterable of ``[title, url]`` pairs.

    Returns:
        A ``pandas.DataFrame`` with one row per reading and the columns
        ``Title``, ``Year``, ``Level``, ``Introduction Summary``,
        ``Learning Outcomes``, ``Link to Summary Page`` and
        ``Link to PDF File``.
    """
    data_list = []
    driver = initialize_driver()
    try:
        for reading in refresher_readings_list:
            reading_detail, year, level, pdf_link = get_reading_detail_data(driver, reading)
            data_list.append({
                'Title': reading[0],
                'Year': year,
                'Level': level,
                'Introduction Summary': reading_detail['introduction'],
                'Learning Outcomes': reading_detail['learning_outcomes'],
                'Link to Summary Page': reading[1],
                'Link to PDF File': pdf_link,
            })
    finally:
        # Release the browser whether or not scraping succeeded.
        driver.quit()
    return pd.DataFrame(data_list)


**Main Function :-**

In [None]:
def main():
    """Crawl the refresher-readings listing and export the details to CSV.

    Steps:
      1. Open the listing page and dismiss the privacy warning.
      2. Scrape each results page, clicking "next" until it is unavailable
         or the page limit is reached.
      3. Visit every collected reading, print the resulting DataFrame and
         write it to ``refresher_readings.csv``.

    The listing driver is closed in a ``finally`` block before the detail
    pass starts, so it is released on errors and two browsers are never
    open at the same time.
    """
    max_pages = 23  # upper bound on the number of listing pages to scrape
    url = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=10&sort=%40refreadingcurriculumyear%20descending"

    refresher_readings_list = []
    driver = initialize_driver()
    try:
        driver.get(url)
        close_privacy_warning(driver)
        for _ in range(max_pages):
            scrape(driver, refresher_readings_list)
            # Stop as soon as there is no further page to move to.
            if not click_next_button(driver):
                break
    finally:
        driver.quit()

    df = scrape_reading_detail(refresher_readings_list)
    print(df)
    df.to_csv('refresher_readings.csv', index=False)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
