In [None]:
# Install the required packages using pip
%pip install selenium
%pip install webdriver-manager
%pip install beautifulsoup4

In [None]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import csv

In [None]:
def setup_driver():
    """Create and return a Chrome WebDriver.

    The chromedriver binary is resolved through ``ChromeDriverManager`` and
    the browser is started with a default ``ChromeOptions`` object.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def get_reading_links(driver, url, wait_seconds=2):
    """Load a listing page and return the hrefs of its result links.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the listing page.
        wait_seconds: Fixed pause after navigation so the page can render
            before its source is read. Defaults to 2.

    Returns:
        A list with the ``href`` of every ``a.CoveoResultLink`` anchor, in
        document order. Anchors without an ``href`` attribute are skipped
        rather than raising ``KeyError``.
    """
    driver.get(url)
    time.sleep(wait_seconds)  # Wait for the page to load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return [
        anchor['href']
        for anchor in soup.select('a.CoveoResultLink')
        if anchor.has_attr('href')
    ]

def _extract_text_by_header(soup, header_texts):
    """Return the text that follows the first matching section header.

    Headers are ``h2.article-section`` elements whose stripped text equals
    one of ``header_texts`` (case-insensitive). The text of the ``p``,
    ``div`` and ``ol`` siblings up to the next ``h2`` is joined with
    newlines. An empty string is returned when nothing matches.
    """
    wanted = {text.lower() for text in header_texts}
    for header in soup.find_all('h2', class_='article-section'):
        if header.text.strip().lower() not in wanted:
            continue
        parts = []
        sibling = header.find_next_sibling()
        while sibling is not None and sibling.name != 'h2':
            if sibling.name in ('p', 'div', 'ol'):
                parts.append(' '.join(sibling.stripped_strings))
            sibling = sibling.find_next_sibling()
        if parts:  # Stop at the first header that is followed by content
            return '\n'.join(parts).strip()
    return ""


def _extract_learning_outcomes(soup):
    """Return the text of the ``section`` following the "Learning Outcomes" header.

    Returns an empty string when the header or the section is missing.
    """
    header = soup.find(
        'h2',
        class_='article-section',
        # The filter may be called with None for a header that has no single
        # string, so guard before using ``in``.
        string=lambda s: s is not None and "Learning Outcomes" in s,
    )
    if header is None:
        return ""
    section = header.find_next_sibling('section')
    if section is None:
        return ""
    return ' '.join(section.stripped_strings).strip()


def _text_or_default(tag, default):
    """Return the stripped text of ``tag``, or ``default`` when ``tag`` is None."""
    return tag.text.strip() if tag is not None else default


def scrape_reading_content(driver, links, wait_seconds=2):
    """Visit each reading page and collect its details.

    Args:
        driver: Selenium WebDriver used to load the pages.
        links: Iterable of page URLs to visit.
        wait_seconds: Fixed pause after each navigation so the page can
            render before its source is read. Defaults to 2.

    Returns:
        A list with one dictionary per link, holding the title, year, level,
        introduction, learning outcomes, summary, the page link and the PDF
        link. Missing elements are replaced by a "... Not Found" placeholder
        (or an empty string for the free-text sections).
    """
    readings = []
    for link in links:
        driver.get(link)
        time.sleep(wait_seconds)  # Wait for the page to load
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        title = _text_or_default(soup.find('h1'), 'Title Not Found')

        # Header matching is case-insensitive, so 'INTRODUCTION' is covered
        # by 'Introduction'; the list is kept as originally written.
        introduction = _extract_text_by_header(soup, ['Introduction', 'Overview', 'INTRODUCTION'])
        learning_outcomes = _extract_learning_outcomes(soup)
        summary = _extract_text_by_header(soup, ['Summary'])

        year = _text_or_default(
            soup.find("span", class_="content-utility-curriculum"), "Year Not Found"
        )
        level = _text_or_default(
            soup.find("span", class_="content-utility-level"), "Level Not Found"
        )

        # Use .get so an anchor without an href yields the placeholder
        # instead of raising KeyError.
        pdf_anchor = soup.find("a", class_="locked-content")
        pdf_href = pdf_anchor.get("href") if pdf_anchor is not None else None
        link_to_full_pdf = pdf_href.strip() if pdf_href is not None else "Link Not Found"

        readings.append({
            'Name of the topic': title,
            'Year': year,
            'Level' : level,
            'Introduction': introduction,
            'Learning Outcomes': learning_outcomes,
            'Summary': summary,
            'Link to the Summary Page': link,
            'Link to the PDF File': link_to_full_pdf
        })

    return readings

def save_to_csv(readings, filename="Team05.csv"):
    """Write the scraped readings to a UTF-8 CSV file.

    Args:
        readings: Sequence of dictionaries, one per row.
        filename: Path of the CSV file to create or overwrite.

    The header is the union of all row keys in first-seen order, so a row
    with extra keys no longer makes ``DictWriter`` raise ``ValueError``;
    rows missing a key get an empty cell. When ``readings`` is empty nothing
    is written and the function returns without touching ``filename``
    (previously this raised ``IndexError``).
    """
    if not readings:
        return

    # A dict is used as an ordered set to keep column order stable.
    fieldnames = {}
    for row in readings:
        fieldnames.update(dict.fromkeys(row))

    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=list(fieldnames))
        writer.writeheader()  # Writes the column headers.
        writer.writerows(readings)  # Writes the rows of data.


# Imports for handling specific exceptions from Selenium.
from selenium.common.exceptions import NoSuchElementException, JavascriptException

def setup_driver():
    """Start Chrome with default options and return the WebDriver.

    NOTE: a function with this same name is defined earlier in this file;
    this later definition is the one in effect once the file has run.
    """
    return webdriver.Chrome(
        options=webdriver.ChromeOptions(),
        service=Service(ChromeDriverManager().install()),
    )

def main():
    """Scrape every listing page and save the collected readings to CSV.

    The WebDriver is always closed, even when scraping fails. If no readings
    were collected, the CSV is not written and a message is printed instead
    of calling ``save_to_csv`` with an empty list.
    """
    driver = setup_driver()  # Initializes the WebDriver.
    # Listing pages to scrape. They differ only in the `first` value of the
    # URL fragment (absent, 100, 200) -- presumably the paging offset, with
    # `numberOfResults=100` results per page.
    urls = ['https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#sort=%40refreadingcurriculumyear%20descending&numberOfResults=100',
            'https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=100&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100',
            'https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=200&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100',]
    all_readings = []

    try:
        for url in urls:
            links = get_reading_links(driver, url)
            all_readings.extend(scrape_reading_content(driver, links))
        if all_readings:
            save_to_csv(all_readings)
        else:
            print("No readings were scraped; CSV file not written.")
    finally:
        driver.quit()

# Run the scraper only when this file is executed as the main module,
# not when it is imported.
if __name__ == '__main__':
    main()
