In [30]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

In [69]:
_CFA_SITE_ROOT = "https://www.cfainstitute.org"
_PDF_LINK_TEXT = "Download the full reading (PDF)"


def _collapse_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


def _join_fragments(nodes):
    """Return the cleaned text of ``nodes`` joined by single spaces, skipping empty pieces."""
    # get_text() is called WITHOUT strip=True: that flag strips every nested
    # string separately and glues them together with no separator, which
    # merges words around inline tags ("foo <a>bar</a> baz" -> "foobarbaz").
    fragments = (_collapse_whitespace(node.get_text()) for node in nodes)
    # Whitespace-only children yield empty fragments; dropping them keeps the
    # joined text free of doubled or leading spaces.
    return " ".join(fragment for fragment in fragments if fragment)


def _section_text(label, section, paragraphs_only=False):
    """Best-effort text extraction for one article section.

    Returns None when ``section`` is None or when extraction fails; failures
    are reported on stdout using ``label`` and never propagate.
    """
    if section is None:
        return None
    try:
        nodes = section.find_all('p') if paragraphs_only else section.children
        return _join_fragments(nodes)
    except Exception as e:
        print(f"Exception occurred during {label} extraction: ", e)
        return None


def _find_pdf_link(soup):
    """Return the absolute URL of the full-reading PDF, or None when the page has none."""
    try:
        pdf_section = soup.find('section', class_="primary-asset login-required")
        if pdf_section is None:
            print("Exception occurred during PDF extraction:", "No PDF section available")
            return None

        pdf_link = None
        for anchor in pdf_section.find_all('a'):
            href = anchor.get('href')
            if href and _collapse_whitespace(anchor.get_text()) == _PDF_LINK_TEXT:
                # urljoin handles site-relative, document-relative and already
                # absolute hrefs; plain concatenation only handled the first.
                pdf_link = urljoin(_CFA_SITE_ROOT, href)
        return pdf_link
    except Exception as e:
        print("Exception occurred during PDF extraction:", e)
        return None


def scrape_internal_reading(driver, url_extension):
    """Scrape one reading's summary page and return its details.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the page.
    url_extension : str
        URL of the reading's summary page; passed to ``driver.get`` unchanged.

    Returns
    -------
    dict
        Keys: 'topic_name', 'year', 'level', 'introduction',
        'learning_outcome', 'summary', 'summary_page_link', 'pdf_file_Link'.
        Every field except 'topic_name' and 'summary_page_link' is None when
        it cannot be extracted; such failures are printed, not raised.

    Raises
    ------
    AttributeError
        If the page has no ``<h1 class="article-title">`` element.
    """
    # Load summary page and pause before reading the source
    # (presumably to let dynamic content render).
    driver.get(url_extension)
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Extract topic name. Same exception type as before, but the message now
    # says which page was missing its title.
    title_tag = soup.find('h1', class_='article-title')
    if title_tag is None:
        raise AttributeError(f"No 'article-title' heading found on {url_extension}")
    topic_name = title_tag.get_text()

    # Extract year: the first four characters of the curriculum label.
    try:
        year = soup.find('span', class_="content-utility-curriculum").get_text().strip()[:4]
    except Exception as e:
        print("Exception occurred during year extraction:", e)
        year = None

    # Extract level. str.strip("Level") removes a *set of characters* from both
    # ends rather than the prefix, which left " II" (leading space) for
    # "Level II"; remove the literal prefix and following whitespace instead.
    try:
        level_text = soup.find('span', class_="content-utility-topic").get_text().strip()
        level = re.sub(r'^Level\s*', '', level_text)
    except Exception as e:
        print("Exception occurred during level extraction:", e)
        level = None

    # Extract PDF link
    pdf_link = _find_pdf_link(soup)

    introduction = None
    learning_outcome = None
    summary = None

    # Walk the section headings and pull the text that belongs to each one.
    for heading in soup.find_all('h2', class_="article-section"):
        # Trim so headings rendered with surrounding whitespace still match.
        heading_text = heading.get_text().strip()

        if heading_text in ('Introduction', 'Overview'):
            # The introduction's paragraphs live in the section enclosing the heading.
            introduction = _section_text(
                'introduction', heading.find_parent('section'), paragraphs_only=True)
        elif heading_text == "Learning Outcomes":
            learning_outcome = _section_text(
                'learning_outcome', heading.find_next('section'))
        elif heading_text == 'Summary':
            summary = _section_text('summary', heading.find_next('div'))

    return {'topic_name': topic_name, 'year': year, 'level': level, 'introduction': introduction,
            'learning_outcome': learning_outcome, 'summary': summary,
            'summary_page_link': url_extension, 'pdf_file_Link': pdf_link}

In [70]:

def scrape_front_page_readings(base_url, pages):
    """Scrape every reading linked from the paginated listing at ``base_url``.

    Parameters
    ----------
    base_url : str
        URL of the listing page; a ``#first=...`` fragment is appended to
        select each result page.
    pages : int
        Number of result pages to visit, starting from page 0.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped reading, with the columns
        'topic_name', 'year', 'level', 'introduction', 'learning_outcome',
        'summary', 'summary_page_link' and 'pdf_file_Link'. Readings whose
        scrape raises are reported on stdout and skipped.
    """
    columns = ['topic_name', 'year', 'level', 'introduction', 'learning_outcome',
               'summary', 'summary_page_link', 'pdf_file_Link']

    # Collect plain dicts and build the DataFrame once at the end instead of
    # enlarging it one row at a time.
    records = []

    front_page_driver = None
    internal_driver = None
    try:
        # Set up the Selenium webdrivers: one walks the listing pages, the
        # other loads each reading's summary page.
        front_page_driver = webdriver.Chrome()
        internal_driver = webdriver.Chrome()

        # Iterate through pages
        for i in range(pages):

            print(f"Scraping page {i}")

            # Construct URL for pagination. Appending "0" to the page index
            # produces the result offset (page 3 -> first=30).
            url = f"{base_url}#first={i}0&sort=%40refreadingcurriculumyear%20descending"

            # Load page
            front_page_driver.get(url)

            # Wait for dynamic content to load
            time.sleep(2)

            # Parse the rendered page
            parsed_content = BeautifulSoup(front_page_driver.page_source, 'html.parser')

            # Iterate through each div containing reading details
            for div in parsed_content.find_all('div', attrs={"class": "coveo-list-layout CoveoResult"}):
                try:
                    # Extract summary page URL
                    url_extension = div.find('a', class_='CoveoResultLink')["href"]

                    # Scrape reading details
                    records.append(scrape_internal_reading(internal_driver, url_extension))

                except Exception as e:
                    # Best effort: one bad reading must not abort the whole crawl.
                    print("Exception occurred during scraping:", e)
    finally:
        # Always close the browsers, even when the crawl is interrupted or a
        # page load fails; otherwise the Chrome processes are left running.
        for driver in (internal_driver, front_page_driver):
            if driver is not None:
                driver.quit()

    return pd.DataFrame(records, columns=columns)

In [72]:
# Crawl configuration: the listing page to start from and the number of
# result pages to walk through.
base_url = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings"
pages = 23

# Run the crawl and keep the resulting table for the cells below.
cfa_readings_df = scrape_front_page_readings(base_url=base_url, pages=pages)


Scraping page 0
Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Exception occurred during PDF extraction: No PDF section available
Scraping page 11
Exception occurred during PDF extraction: No PDF section available
Scraping page 12
Scraping page 13
Scraping page 14
Exception occurred during PDF extraction: No PDF section available
Exception occurred during year extraction: 'NoneType' object has no attribute 'get_text'
Exception occurred during PDF extraction: No PDF section available
Exception occurred during year extraction: 'NoneType' object has no attribute 'get_text'
Exception occurred during PDF extraction: No PDF section available
Exception occurred during year extraction: 'NoneType' object has no attribute 'get_text'
Exception occurred during PDF extraction: No PDF section available
Exception occurred during year extraction: 'NoneType' object has no attribute 'get_tex

Unnamed: 0,topic_name,year,level,introduction,learning_outcome,summary,summary_page_link,pdf_file_Link
0,Time-Series Analysis,2024,II,"As financial analysts, we often use time-seri...",The member should be able to: calculate and...,The predicted trend value of a time series in...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
1,Credit Analysis Models,2024,II,Credit analysis plays an important role in th...,The member should be able to: explain expec...,This reading has covered several important top...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
2,Introduction to Alternative Investments,2023,I,"In this section, we explain what alternative ...",The member should be able to: describe type...,This reading provides a comprehensive introduc...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
3,Credit Default Swaps,2024,II,Derivative instruments in which the underlyin...,The member should be able to: describe cred...,A credit default swap (CDS) is a contract betw...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...
4,Valuation of Contingent Claims,2024,II,A contingent claim is a derivative instrument...,The member should be able to: describe and ...,This reading on the valuation of contingent cl...,https://www.cfainstitute.org/membership/profes...,https://www.cfainstitute.org/-/media/documents...


In [76]:
# Persist the scraped table to disk as tab-delimited text, leaving out the
# row index.
cfa_readings_df.to_csv(
    'scraped_data.csv',
    sep='\t',
    index=False,
)

In [73]:
# Size of the scraped table as (rows, columns).
cfa_readings_df.shape

(224, 8)

In [74]:
# Number of missing values in each column of the scraped table.
cfa_readings_df.isna().sum(axis=0)

topic_name            0
year                  5
level                 0
introduction          6
learning_outcome     36
summary              63
summary_page_link     0
pdf_file_Link        21
dtype: int64