Questions - 
1. what is year? - 2024 Curriculum or just 2024?
2. Introduction Summary? - is it the entire summary section, or do we need to use some library to summarize the paragraph?
3. Learning Outcome? - do we need it with the formatting, or just the text?
4. Link to the Summary Page? - which link is this? I could not find it on the site.
5. What if the page does not have the necessary contents?

In [1]:
from datetime import datetime
import logging
import os
from typing import List, Tuple

from bs4 import BeautifulSoup as bs
from IPython.display import display, Markdown
import pandas as pd
import requests
from selenium import webdriver

In [2]:
# Configure logging
# Raw string: "\l" and "\w" are not valid escape sequences, so the non-raw
# literal only produced this path by accident and warns on newer Pythons.
file_name = r".\logs\webscrapping.log"

# FileHandler does not create missing parent directories, so make sure the
# log folder exists before the handler opens the file.
log_dir = os.path.dirname(file_name)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


# display log message in-line
class NotebookHandler(logging.Handler):
    """Logging handler that shows each record as Markdown in the cell output."""

    def emit(self, record):
        display(Markdown(self.format(record)))


# Re-running this cell must not stack handlers, otherwise every message is
# written / displayed once per run of the cell.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    handler = logging.FileHandler(file_name)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# The NotebookHandler class object is re-created each time the cell runs, so
# an isinstance() check would miss instances from earlier runs; match by name.
if not any(type(h).__name__ == "NotebookHandler" for h in logger.handlers):
    logger.addHandler(NotebookHandler())

# set the file to empty at start
with open(file_name, 'w'):
    pass

In [3]:
# test the URL connection
def testConnection():
    """Check that the refresher-readings listing page can be fetched.

    Returns:
        bool: True when the GET request completes with a non-error HTTP
        status, False otherwise (the failure is logged).
    """
    logger.info('Testing URL connection for Web Scrapping')
    url_test = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first={first}&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100".format(first=0)
    print("URL: ", url_test)
    try:
        # Bound the wait so an unresponsive server cannot hang the notebook.
        response_test = requests.get(url_test, timeout=30)
        print(response_test)
        # requests does not raise on 4xx/5xx by itself; without this check an
        # error page would still be reported as a successful connection.
        response_test.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Error in loading the URL: %s", exc)
        return False
    logger.info('Successfully connected to the site')
    return True

In [4]:
# function to return individual page Links
def getPageURLs() -> list:
    """Collect the (link text, URL) pair of every reading on the listing pages.

    Each results page is loaded in Chrome through Selenium (the listing has
    dynamic content) and the resulting HTML is parsed with BeautifulSoup.
    Pages are requested 100 results at a time until a page no longer
    contains a "next" pager button.

    Returns:
        list: (link text, href) tuples in listing order. If loading or
        parsing a page fails, the error is logged and the entries gathered
        so far are returned.
    """
    # base URL
    base_url = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings"

    # additional section for the URL to specify the first row and the number of results
    additional_section = "#first={first}&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100"

    individual_topics_urls = []  # store individual page URLs (sub pages to collect data)
    first_entry = 0  # start of the entries on web page
    has_more_pages = True  # flag to keep scraping
    count = 0
    logger.info("------- Starting URL Extraction -------")
    while has_more_pages:
        count += 1
        driver = None
        try:
            # A fresh browser is started for every results page.
            # NOTE(review): the page URLs differ only in the "#..." fragment,
            # so presumably a reused session would not reload the listing -
            # confirm before sharing one driver across pages.
            driver = webdriver.Chrome()

            # Construct URL to scrape and load it with Selenium
            url = base_url + additional_section.format(first=first_entry)
            driver.get(url)

            # parse the rendered HTML
            parsed_content = bs(driver.page_source, 'html.parser')

            # every listed entry contributes its first link
            for parse in parsed_content.find_all("div", {"class": "coveo-list-layout"}):
                a_tag = parse.find('a')
                if a_tag is None:
                    # One malformed result should not abort the whole run.
                    logger.warning("Skipping a result without a link on page {count}".format(count=count))
                    continue
                individual_topics_urls.append((a_tag.text, a_tag.get('href')))

            # if a "next" button is present go to the next page, otherwise stop
            next_button = parsed_content.find_all("li", {"class": "coveo-pager-next"})
            if next_button:
                first_entry += 100
            else:
                has_more_pages = False

            logger.info("Total topics found after iteration {count} iterations are {length}".format(count=count, length=len(individual_topics_urls)))

        except Exception as e:
            # The exception must go through a %s placeholder; passing it as a
            # bare extra argument makes the logging call itself fail to format.
            logger.error("Exception: %s", e)
            break
        finally:
            # Always close the browser, including when the page failed.
            if driver is not None:
                driver.quit()

    logger.info("------- Ending URL Extraction -------")
    return individual_topics_urls

In [5]:
# helper: pull the collected fields out of one parsed topic page
def _parseTopicPage(parsed_content, counter: int, topic: str) -> dict:
    """Extract title, year, level, learning outcomes and PDF link from a page.

    Every field that cannot be located is logged as an error and stored as
    None, so one missing element never discards the rest of the row.

    Args:
        parsed_content: BeautifulSoup document of the topic page.
        counter: 1-based position of the topic, used only in log messages.
        topic: listing title of the topic, used only in log messages.

    Returns:
        dict with the keys 'NameOfTheTopic', 'Year', 'Level',
        'LearningOutcomes' and 'PDFFileLink'.
    """
    def log_missing(what):
        logger.error("Iteration: {} topic: {} -> {}".format(counter, topic, what))

    # extract title
    title_tag = parsed_content.find("h1", {"class": "article-title"})
    if title_tag is None:
        log_missing("title not found")
        title = None
    else:
        # zero-width spaces are stripped from the ends of the title
        title = title_tag.text.strip().strip("\u200b")

    # extract Year and Level
    year = None
    level = None
    content_utility = parsed_content.find("div", {"class": "content-utility"})
    if content_utility is None:
        log_missing("content_utility (Year and Level) not found")
    else:
        year_tag = content_utility.find('span', {"class": "content-utility-curriculum"})
        if year_tag is None:
            log_missing("content-utility-curriculum (Year) not found")
        else:
            # only the first line of the span text is kept
            year = year_tag.text.strip().split('\n')[0]

        level_tag = content_utility.find("span", {"class": "content-utility-topic"})
        if level_tag is None:
            log_missing("content-utility-topic (level) not found")
        else:
            level = level_tag.text.strip()

    # extract Learning Outcomes
    try:
        h2_elements = parsed_content.find_all("h2", {"class": "article-section"})
        # h2_elements[0] ---> for introduction (not extracted here);
        # h2_elements[1] is taken to be the learning-outcomes heading.
        outcome_items = h2_elements[1].find_next_sibling().find_all("li")
    except (AttributeError, IndexError):
        log_missing("Learning Outcomes not found")
        learnings = None
    else:
        # Concatenate the item texts, then drop line breaks and the
        # whitespace around them. Empty items simply contribute nothing.
        joined = ''.join(item.text for item in outcome_items)
        learnings = ''.join(part.strip() for part in joined.split("\n"))

    # extract PDF File Link
    pdf_link = None
    lock_content = parsed_content.find("section", {"class": "primary-asset login-required"})
    if lock_content is not None:
        for anchor in lock_content.find_all('a', {"class": "locked-content"}):
            href = anchor.get("href")
            if href and anchor.text.strip() == 'Download the full reading (PDF)':
                # the href is treated as site-relative and given the site prefix
                pdf_link = 'https://www.cfainstitute.org' + href
                break
    if pdf_link is None:
        log_missing("PDF File Link not found")

    return {
        'NameOfTheTopic': title,
        'Year': year,
        'Level': level,
        'LearningOutcomes': learnings,
        'PDFFileLink': pdf_link,
    }


# function to scrape Data from CFA site (Raw Data)
def extractData(individual_topics_urls: List[Tuple[str, str]]) -> pd.DataFrame:
    """Visit every topic page and collect its raw fields into a DataFrame.

    Args:
        individual_topics_urls: (topic title, page URL) pairs; the title is
            used only in log messages.

    Returns:
        DataFrame with one row per page that could be loaded and the columns
        'NameOfTheTopic', 'Year', 'Level', 'IntroductionSummary',
        'LearningOutcomes', 'SummaryPageLink' and 'PDFFileLink'.
        'IntroductionSummary' and 'SummaryPageLink' are not populated by
        this function. Pages whose load or parse raises are logged and
        skipped.
    """
    logger.info("------- Starting Data Extraction -------")

    columns = ['NameOfTheTopic', 'Year', 'Level', 'IntroductionSummary', 'LearningOutcomes', 'SummaryPageLink', 'PDFFileLink']
    # Rows are gathered in a list and turned into a frame once at the end;
    # concatenating a frame per iteration copies all earlier rows each time.
    rows = []

    # Set up Selenium WebDriver to scrape dynamic data
    driver = webdriver.Chrome()
    try:
        # counter is 1-based and tied to the position in the input list, so
        # log lines keep pointing at the right entry after a failed page.
        for counter, (topic, topic_url) in enumerate(individual_topics_urls, start=1):
            try:
                # load the web page with Selenium and parse the rendered HTML
                driver.get(topic_url)
                parsed_content = bs(driver.page_source, 'html.parser')
                rows.append(_parseTopicPage(parsed_content, counter, topic))
            except Exception as e:
                logger.error("Iteration: {} topic: {} -> Unknown Error: {}".format(counter, topic, e))
    finally:
        # Close the Selenium WebDriver even if the loop is interrupted
        driver.quit()

    raw_df = pd.DataFrame(rows, columns=columns)

    logger.info("------- Ending Data Extraction -------")

    return raw_df
    

In [6]:
if __name__ == "__main__":
    # Step 1: gather the (title, URL) pair of every listed reading.
    individual_topics_urls = getPageURLs()
    # Step 2: visit each URL and build the raw DataFrame. Both names are
    # bound at module level so the cells below can inspect them.
    df = extractData(individual_topics_urls)

------- Starting URL Extraction -------

Total topics found after iteration 1 iterations are 100

Total topics found after iteration 2 iterations are 200

Total topics found after iteration 3 iterations are 224

------- Ending URL Extraction -------

------- Starting Data Extraction -------

Error: Iteration 110 topic Refresher Reading 02-18-2021  -> Learning Outcomes not found

Error: Iteration 111 topic Refresher Reading 11-13-2020  -> Learning Outcomes not found

Error: Iteration 141 topic ICE RSS FEED Active Equity Investing: Strategies  -> Learning Outcomes not found

Error: Iteration 142 topic test  -> content-utility-curriculum (Year) not found

Error: Iteration 142 topic test  -> Learning Outcomes not found

Error: Iteration 143 topic TEST RR  -> content-utility-curriculum (Year) not found

Error: Iteration 143 topic TEST RR  -> Learning Outcomes not found

Error: Iteration 144 topic QA Test RR 1  -> content-utility-curriculum (Year) not found

Error: Iteration 144 topic QA Test RR 1  -> Learning Outcomes not found

Error: Iteration 145 topic Branded Image Link Added to Refresher Reading  -> content-utility-curriculum (Year) not found

Error: Iteration 145 topic Branded Image Link Added to Refresher Reading  -> Learning Outcomes not found

Error: Iteration 146 topic Big Data Projects  -> Learning Outcomes not found

Error: Iteration 147 topic Big Data Projects  -> Learning Outcomes not found

Error: Iteration 148 topic Test 20 Cost of Capital  -> Learning Outcomes not found

Error: Iteration 151 topic Big Data Projects  -> Learning Outcomes not found

Error: Iteration 152 topic Industry and Company Analysis (2018)  -> Learning Outcomes not found

Error: Iteration 153 topic Discounted Dividend Valuation (2018)   -> Learning Outcomes not found

Error: Iteration 154 topic The Portfolio Management Process and the Investment Policy Statement (2018)  -> Learning Outcomes not found

Error: Iteration 155 topic Employee Compensation: Post-Employment and Share-Based (2018)   -> Learning Outcomes not found

Error: Iteration 156 topic Integration of Financial Statement Analysis Techniques (2018)  -> Learning Outcomes not found

Error: Iteration 157 topic Free Cash Flow Valuation (2018)  -> Learning Outcomes not found

Error: Iteration 158 topic Market-Based Valuation: Price and Enterprise Value Multiples (2018)   -> Learning Outcomes not found

Error: Iteration 159 topic Residual Income Valuation (2018)   -> Learning Outcomes not found

Error: Iteration 160 topic Yield-Based Bond Convexity and Portfolio Properties  -> Learning Outcomes not found

Error: Iteration 165 topic Interest Rate Risk and Return   -> Learning Outcomes not found

Error: Iteration 166 topic Yield-Based Bond Duration Measures and Properties  -> Learning Outcomes not found

Error: Iteration 168 topic Yield and Yield Spread Measures for Floating-Rate Instruments  -> Learning Outcomes not found

Error: Iteration 170 topic Capital Investments and Capital Allocation  -> Learning Outcomes not found

Error: Iteration 172 topic Sampling and Estimation  -> content-utility-curriculum (Year) not found

Error: Iteration 172 topic Sampling and Estimation  -> Learning Outcomes not found

Error: Iteration 180 topic Fixed-Income Markets for Government Issuers  -> Learning Outcomes not found

Error: Iteration 183 topic Case Study in Risk Management: Institutional  -> Learning Outcomes not found

Error: Iteration 184 topic Hedge Funds  -> Learning Outcomes not found

Error: Iteration 188 topic Credit Analysis for Government Issuers  -> Learning Outcomes not found

Error: Iteration 189 topic Asset-Backed Security (ABS) Instrument and Market Features  -> Learning Outcomes not found

Error: Iteration 190 topic Fixed-Income Bond Valuation: Prices and Yields  -> Learning Outcomes not found

Error: Iteration 191 topic The Term Structure of Interest Rates: Spot, Par, and Forward Curves  -> Learning Outcomes not found

Error: Iteration 192 topic Alternative Investment Performance and Returns  -> Learning Outcomes not found

Error: Iteration 193 topic Investments in Private Capital: Equity & Debt  -> Learning Outcomes not found

Error: Iteration 194 topic Credit Risk  -> Learning Outcomes not found

Error: Iteration 195 topic Fixed-Income Securitization  -> Learning Outcomes not found

Error: Iteration 196 topic Investors and Other Stakeholders  -> Learning Outcomes not found

Error: Iteration 197 topic Yield and Yield Spread Measures for Fixed-Rate Bonds  -> Learning Outcomes not found

Error: Iteration 199 topic Alternative Investment Features, Methods, and Structures  -> Learning Outcomes not found

Error: Iteration 200 topic Curve-Based and Empirical Fixed-Income Risk Measures  -> Learning Outcomes not found

Error: Iteration 201 topic Organizational Forms, Corporate Issuer Features, and Ownership  -> Learning Outcomes not found

Error: Iteration 202 topic Hedge Funds  -> Learning Outcomes not found

Error: Iteration 203 topic Introduction to Digital Assets  -> Learning Outcomes not found

Error: Iteration 204 topic Fixed-Income Instrument Features  -> Learning Outcomes not found

Error: Iteration 205 topic Multinational Operations (2018)  -> Learning Outcomes not found

Error: Iteration 206 topic Natural Resources  -> Learning Outcomes not found

Error: Iteration 207 topic Real Estate & Infrastructure  -> Learning Outcomes not found

Error: Iteration 208 topic Credit Analysis for Corporate Issuers  -> Learning Outcomes not found

Error: Iteration 209 topic Mortgage-Backed Security (MBS) Instrument and Market Features  -> Learning Outcomes not found

Error: Iteration 210 topic Fixed-Income Issuance and Trading  -> Learning Outcomes not found

Error: Iteration 211 topic Industry and Competitive Analysis  -> Learning Outcomes not found

Error: Iteration 212 topic Company Analysis: Forecasting  -> Learning Outcomes not found

Error: Iteration 216 topic Company Analysis: Past and Present  -> Learning Outcomes not found

Error: Iteration 217 topic Business Models  -> Learning Outcomes not found

Error: Iteration 218 topic Fixed-Income Markets for Corporate Issuers  -> Learning Outcomes not found

Error: Iteration 219 topic Corporate Governance: Conflicts, Mechanisms, Risks, and Benefits  -> Learning Outcomes not found

Error: Iteration 220 topic Fixed-Income Cash Flows and Types  -> Learning Outcomes not found

------- Ending Data Extraction -------

In [7]:
# Index 109 is the entry logged as iteration 110 above, the first one
# reported with "Learning Outcomes not found".
individual_topics_urls[109]

('Refresher Reading 02-18-2021 ',
 'https://www.cfainstitute.org/membership/professional-development/refresher-readings/2020/Refresher-Reading-02-18-2021')

In [10]:
# Number of rows in the scraped DataFrame.
len(df)

224