In [None]:
%pip install selenium
%pip install webdriver-manager
%pip install beautifulsoup4

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import csv

In [11]:
def setup_driver():
    """Create and return a Chrome WebDriver instance.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``
    and wrapped in a ``Service``; the browser is started with default
    ``ChromeOptions``.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def get_reading_links(driver, url, wait_seconds=2):
    """Load a listing page and return the hrefs of its result links.

    Args:
        driver: WebDriver used to load the page.
        url: Address of the listing page.
        wait_seconds: Seconds to sleep after ``driver.get`` before the page
            source is read. Defaults to 2, the delay originally hard-coded.

    Returns:
        A list with the ``href`` value of every ``a.CoveoResultLink`` anchor
        found in the page source, in document order. Anchors that have no
        ``href`` attribute (or an empty one) are skipped rather than raising
        ``KeyError`` or yielding a value ``driver.get`` cannot open.
    """
    driver.get(url)
    # Fixed pause before snapshotting the DOM -- presumably to let the page's
    # scripts finish rendering the result list.
    time.sleep(wait_seconds)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return [
        anchor['href']
        for anchor in soup.select('a.CoveoResultLink')
        if anchor.get('href')
    ]

def _extract_text_by_header(soup, header_texts):
    """Return the text that follows the first matching section header.

    Looks at every ``<h2 class="article-section">`` whose stripped text equals
    one of ``header_texts`` (case-insensitive). For such a header, the text of
    the following sibling ``p``/``div``/``ol`` elements is collected until the
    next ``h2`` sibling. The first header that yields any content wins; an
    empty string is returned when none does.
    """
    wanted = {text.lower() for text in header_texts}
    for header in soup.find_all('h2', class_='article-section'):
        if header.text.strip().lower() not in wanted:
            continue
        parts = []
        sibling = header.find_next_sibling()
        while sibling is not None and sibling.name != 'h2':
            if sibling.name in ('p', 'div', 'ol'):
                parts.append(' '.join(sibling.stripped_strings))
            sibling = sibling.find_next_sibling()
        if parts:  # stop at the first header that produced content
            return '\n'.join(parts).strip()
    return ""


def _extract_learning_outcomes(soup):
    """Return the text of the ``<section>`` following the Learning Outcomes header.

    Returns an empty string when the header or the section is missing.
    """
    header = soup.find(
        'h2',
        class_='article-section',
        # The callable receives the tag's ``.string``, which is None when the
        # tag does not contain exactly one string; guard so that case does not
        # raise TypeError.
        string=lambda s: s is not None and "Learning Outcomes" in s,
    )
    if header is None:
        return ""
    section = header.find_next_sibling('section')
    if section is None:
        return ""
    return ' '.join(section.stripped_strings).strip()


def _text_or_default(tag, default):
    """Return the stripped text of ``tag``, or ``default`` when ``tag`` is None."""
    return tag.text.strip() if tag is not None else default


def scrape_reading_content(driver, links, wait_seconds=2):
    """Visit each reading page and extract its fields.

    Args:
        driver: WebDriver used to load each page.
        links: Iterable of page URLs to visit.
        wait_seconds: Seconds to sleep after each ``driver.get`` before the
            page source is parsed. Defaults to 2, the delay originally
            hard-coded.

    Returns:
        A list with one dict per link, holding the keys
        ``'Name of the topic'``, ``'Year'``, ``'Level'``, ``'Introduction'``,
        ``'Learning Outcomes'``, ``'Summary'``, ``'Link to the Summary Page'``
        and ``'Link to the PDF File'``. Missing title/year/level/PDF link are
        replaced by their ``"... Not Found"`` placeholders; missing text
        sections are empty strings.
    """
    readings = []
    for link in links:
        driver.get(link)
        time.sleep(wait_seconds)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        title = _text_or_default(soup.find('h1'), 'Title Not Found')

        introduction = _extract_text_by_header(soup, ['Introduction', 'Overview', 'INTRODUCTION'])
        learning_outcomes = _extract_learning_outcomes(soup)
        summary = _extract_text_by_header(soup, ['Summary'])

        year = _text_or_default(
            soup.find("span", class_="content-utility-curriculum"), "Year Not Found"
        )
        level = _text_or_default(
            soup.find("span", class_="content-utility-level"), "Level Not Found"
        )

        # An anchor without an href falls back to the placeholder instead of
        # raising KeyError.
        pdf_anchor = soup.find("a", class_="locked-content")
        pdf_href = pdf_anchor.get("href") if pdf_anchor is not None else None
        link_to_full_pdf = pdf_href.strip() if pdf_href is not None else "Link Not Found"

        readings.append({
            'Name of the topic': title,
            'Year': year,
            'Level' : level,
            'Introduction': introduction,
            'Learning Outcomes': learning_outcomes,
            'Summary': summary,
            'Link to the Summary Page': link,
            'Link to the PDF File': link_to_full_pdf
        })

    return readings


def save_to_csv(readings, filename="Team05.csv"):
    """Write a list of row dicts to a UTF-8 CSV file with a header row.

    Args:
        readings: List of dicts, one per row.
        filename: Destination path. Defaults to ``"Team05.csv"``.

    The header is the union of the keys of all rows, in order of first
    appearance; a row lacking a column gets an empty cell. When ``readings``
    is empty there is no header to derive, so nothing is written and any
    existing file at ``filename`` is left untouched.
    """
    if not readings:
        return

    # dict.fromkeys keeps first-seen order while dropping duplicate keys.
    fieldnames = list(dict.fromkeys(key for row in readings for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(readings)


from selenium.common.exceptions import NoSuchElementException, JavascriptException

def setup_driver():
    """Start Chrome with default options and return the WebDriver.

    The chromedriver path comes from ``ChromeDriverManager().install()``.
    """
    # NOTE: this redefines the setup_driver declared earlier in this file with
    # the same behaviour; being the later definition, it is the one in effect.
    default_options = webdriver.ChromeOptions()
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=default_options,
    )

def main():
    """Scrape every listing page and save the collected readings to CSV.

    Opens a browser, collects the reading links from each listing URL,
    scrapes every linked page, and closes the browser (also on error).
    The results are written with ``save_to_csv`` only when at least one
    reading was collected; otherwise a message is printed, because there
    is no first row to take the CSV header from.
    """
    urls = ['https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#sort=%40refreadingcurriculumyear%20descending&numberOfResults=100',
            'https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=100&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100',
            'https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=200&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100',]
    all_readings = []

    driver = setup_driver()
    try:
        for url in urls:
            links = get_reading_links(driver, url)
            all_readings.extend(scrape_reading_content(driver, links))
    finally:
        # Always release the browser, even when scraping raised.
        driver.quit()

    if all_readings:
        save_to_csv(all_readings)
    else:
        print("No readings were scraped; no CSV file was written.")

# Entry point: run the scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


