In [1]:
from bs4 import BeautifulSoup as bs
from datetime import datetime
import logging
from IPython.display import display, Markdown
import pandas as pd
import requests
from selenium import webdriver
from typing import List, Tuple
import time

In [2]:
# Configure logging: every record is written to a log file (with timestamp and
# level) and is also rendered in-line in the notebook output.
# Raw string so the Windows-style separators are not read as escape sequences.
file_name = r"..\..\logs\scrape-log\webscrapping.log"
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


# display log message in-line
class NotebookHandler(logging.Handler):
    """Logging handler that renders each record as Markdown in the notebook."""

    def emit(self, record):
        try:
            display(Markdown(self.format(record)))
        except Exception:
            # Standard logging convention: a failing handler must not break the caller.
            self.handleError(record)


# Re-running this cell must not stack handlers (each extra handler would repeat
# every message), so detach and close whatever a previous run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# mode='w' sets the file to empty at start of each run
handler = logging.FileHandler(file_name, mode='w')
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

logger.addHandler(NotebookHandler())

In [3]:
# test the URL connection
def testConnection():
    """Check that the refresher-readings listing page can be fetched.

    Prints the URL and the response object, and logs the outcome.

    Note: everything after '#' in the URL is a fragment, which HTTP clients do
    not transmit, so this effectively checks the base listing page.

    Returns:
        bool: True when the request completes with a non-error HTTP status,
        False on a network failure, timeout, or 4xx/5xx response.
    """
    logger.info('Testing URL connection for Web Scrapping')
    url_test = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first={first}&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100".format(first=0)
    print("URL: ",url_test)
    try:
        # A timeout keeps the notebook from hanging on an unresponsive server.
        response_test = requests.get(url_test, timeout=30)
        print(response_test)
        # requests does not raise on 4xx/5xx by itself; without this check an
        # error page would be reported as a successful connection.
        response_test.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Error in loading the URL: %s", exc)
        return False
    logger.info('Successfully connected to the site')
    return True

In [4]:
# function to return individual page Links
def getPageURLs() -> list:
    """Collect the links of all individual reading pages from the listing.

    Opens a Chrome WebDriver, walks the paginated listing 100 results at a
    time, and stops when the rendered page no longer contains a
    "coveo-pager-next" element or when a page fails to load/parse.

    Returns:
        list: (link text, href) tuples, one per result tile found. Whatever was
        collected before a failure is still returned.
    """
    # base URL
    base_url = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings"

    # additional section for the URL to specify the first row and the number of results
    additional_section = "#first={first}&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100"

    # step between pages; must stay equal to numberOfResults in additional_section
    page_size = 100

    individual_topics_urls = [] # store individual page URLs (sub pages to collect data)
    first_entry = 0 # start of the entries on web page
    has_more_pages = True # flag to keep scrapping
    count = 0

    # Set up Selenium WebDriver to scrape dynamic data
    driver = webdriver.Chrome()

    try:
        logger.info("------- Starting URL Extraction -------")
        while has_more_pages:
            try:
                # increment counter
                count += 1

                # Construct URL to Scrape Data
                url = base_url + additional_section.format(first=first_entry)

                # load the Web Page that has Dynamic content with help of selenium
                driver.get(url)

                # Sleep for 5 seconds for page to load
                time.sleep(5)

                # use BeautifulSoup to parse the rendered HTML
                parsed_content = bs(driver.page_source, 'html.parser')

                # every result tile contributes its first <a> (text and href)
                for parse in parsed_content.find_all("div", {"class": "coveo-list-layout"}):
                    a_tag = parse.find('a') # get the <a> tags
                    if a_tag is None:
                        # A tile without a link must not abort the whole pagination.
                        logger.warning("Result without a link skipped on iteration {count}".format(count=count))
                        continue
                    individual_topics_urls.append((a_tag.text, a_tag.get('href')))

                # Find the Next page button on the page
                next_button = parsed_content.find_all("li", {"class": "coveo-pager-next"})

                # if next button is present go to next page
                if next_button:
                    first_entry += page_size
                else:
                    has_more_pages = False

                logger.info("Total topics found after iteration {count} iterations are {length}".format(count=count, length=len(individual_topics_urls)))

            except Exception as e:
                # The exception needs a %s placeholder: passing it as a bare extra
                # argument makes the logging module fail to format the record.
                logger.error("Exception: %s", e)
                break
    finally:
        # Close the Selenium WebDriver even if something unexpected escapes the loop
        driver.quit()

    logger.info("------- Ending URL Extraction -------")
    return individual_topics_urls

In [5]:
# Column order of the raw DataFrame returned by extractData
_RAW_COLUMNS = ['NameOfTheTopic', 'Year', 'Level', 'IntroductionSummary', 'LearningOutcomes', 'SummaryPageLink', 'PDFFileLink']


def _extract_title(parsed_content, counter, topic):
    """Return the text of the "article-title" <h1>, or None (logged) if absent."""
    try:
        return parsed_content.find("h1", {"class": "article-title"}).text.strip().strip("\u200b")
    except Exception:
        logger.error("Iteration: {} topic: {} -> title not found".format(counter, topic))
        return None


def _extract_year_and_level(parsed_content, counter, topic):
    """Return (year, level) read from the "content-utility" block; missing parts are None."""
    year = None
    level = None
    try:
        content_utility = parsed_content.find("div", {"class": "content-utility"})
        try:
            year = content_utility.find('span', {"class": "content-utility-curriculum"}).text.strip().split('\n')[0]
        except Exception:
            logger.error("Iteration: {} topic: {} -> content-utility-curriculum (Year) not found".format(counter, topic))
            year = None
        try:
            level = content_utility.find("span", {"class": "content-utility-topic"}).text.strip()
            # Remove the literal "Level " prefix. str.strip("Level ") would instead
            # strip any of the characters L/e/v/l/space from both ends of the text.
            if level.startswith("Level "):
                level = level[len("Level "):].strip()
        except Exception:
            logger.error("Iteration: {} topic: {} -> content-utility-topic (level) not found".format(counter, topic))
            level = None
    except Exception:
        logger.error("Iteration: {} topic: {} -> content_utility (Year and Level) not found".format(counter, topic))
        year = None
        level = None
    return year, level


def _extract_intro_and_outcomes(parsed_content, counter, topic):
    """Return (introduction, learning outcomes) taken from the "article-section" headings.

    The first heading's parent supplies the introduction (<p> texts followed by
    <li> texts); the element following the second heading supplies the learning
    outcomes. Either value is None (logged) when it cannot be read.
    """
    intro = None
    learnings = None
    try:
        h2_elements = parsed_content.find_all("h2", {"class": "article-section"})

        # extract Introduction
        try:
            intro_container = h2_elements[0].parent
            intro = "".join(p.text.strip() for p in intro_container.find_all("p"))
            intro_points = "".join(li.text for li in intro_container.find_all("li"))
            intro = intro + " " + intro_points
        except Exception:
            logger.error("Iteration: {} topic: {} -> Introduction not found".format(counter, topic))
            intro = None

        # extract Learning Outcomes
        try:
            outcomes_container = h2_elements[1].find_next_sibling()
            learnings = ''
            for learning_outcomes_para in outcomes_container.find_all("p"):
                if learning_outcomes_para.text:
                    learnings += " " + learning_outcomes_para.text.strip()
            for learning_outcome in outcomes_container.find_all("li"):
                if learning_outcome.text:
                    learnings += learning_outcome.text
            # drop line breaks and the whitespace around them
            learnings = ''.join(s.strip() for s in learnings.split("\n"))
        except Exception:
            logger.error("Iteration: {} topic: {} -> Learning Outcomes not found".format(counter, topic))
            learnings = None
    except Exception:
        logger.error("Iteration: {} topic: {} -> Introduction and Learning Outcomes not found".format(counter, topic))
        learnings = None
        intro = None
    return intro, learnings


def _extract_pdf_link(parsed_content, counter, topic):
    """Return the absolute URL of the "Download the full reading (PDF)" link, or None (logged)."""
    try:
        lock_content = parsed_content.find("section", {"class": "primary-asset login-required"})
        pdf_link = [a for a in lock_content.find_all('a', {"class": "locked-content"}) if a.text.strip()=='Download the full reading (PDF)'][0].get("href")
        return 'https://www.cfainstitute.org' + pdf_link
    except Exception:
        logger.error("Iteration: {} topic: {} -> PDF Link not found".format(counter, topic))
        return None


# function to scrape Data from CFA site (Raw Data)
def extractData(individual_topics_urls: List[Tuple[str, str]]) -> pd.DataFrame:
    """Scrape every topic page and return the raw fields as a DataFrame.

    Args:
        individual_topics_urls: (topic text, page URL) pairs; the topic text is
            only used in log messages.

    Returns:
        DataFrame with the columns NameOfTheTopic, Year, Level,
        IntroductionSummary, LearningOutcomes, SummaryPageLink, PDFFileLink.
        One row per page that could be loaded; a field that could not be found
        on a page is None and the miss is logged. Pages that fail to load are
        logged and skipped.
    """
    logger.info("------- Starting Data Extraction -------")

    # Set up Selenium WebDriver to scrape dynamic data
    driver = webdriver.Chrome()

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating onto a DataFrame inside the loop copies it on every iteration.
    records = []

    try:
        # counter is the 1-based position in the input, so log lines about load
        # failures also point at the right entry
        for counter, (topic, topic_url) in enumerate(individual_topics_urls, start=1):
            try:
                # load the Web Page that has Dynamic content with help of selenium
                driver.get(topic_url)

                # Sleep for 2 seconds for page to load
                time.sleep(2)

                # use BeautifulSoup to parse the rendered HTML
                parsed_content = bs(driver.page_source, 'html.parser')

                title = _extract_title(parsed_content, counter, topic)
                year, level = _extract_year_and_level(parsed_content, counter, topic)
                intro, learnings = _extract_intro_and_outcomes(parsed_content, counter, topic)
                pdf_link = _extract_pdf_link(parsed_content, counter, topic)

                records.append({
                    'NameOfTheTopic': title,
                    'Year': year,
                    'Level': level,
                    'IntroductionSummary': intro,
                    'LearningOutcomes': learnings,
                    'PDFFileLink': pdf_link,
                    'SummaryPageLink': topic_url,
                })

            except Exception:
                # exc_info attaches the traceback so the failure can be diagnosed
                logger.error("Iteration: {} topic: {} -> Unknown Error".format(counter, topic), exc_info=True)
    finally:
        # Close the Selenium WebDriver even if the loop is interrupted
        driver.quit()

    logger.info("------- Ending Data Extraction -------")

    return pd.DataFrame(records, columns=_RAW_COLUMNS)
    

In [6]:
# function to store DF locally
def convertDFtoCSV(df, csv_location=r"..\..\data\scrape-data\cfa-data.csv"):
    """Write the DataFrame to a CSV file without the index column.

    Failures are logged (with the underlying error) and not re-raised.

    Args:
        df: object exposing a pandas-style ``to_csv(path, index=...)`` method.
        csv_location: destination path. The default is the original hard-coded
            location, written as a raw string so its backslashes are not read
            as escape sequences.
    """
    try:
        logger.info("-------Starting Writing to CSV -------")
        df.to_csv(csv_location, index=False)
        logger.info("------- Ending Writing to CSV -------")
    except Exception as exc:
        logger.error("Error in writing to CSV: %s", exc)

In [7]:
if __name__ == "__main__":
    # test connection first: when the site cannot be reached, scraping would
    # only start browsers for nothing and overwrite the CSV with an empty table
    if testConnection():
        # get topic URL list
        individual_topics_urls = getPageURLs()
        # extract data in form of DF
        df = extractData(individual_topics_urls)
        # convert DF to CSV file
        convertDFtoCSV(df)
    else:
        logger.error("Connection test failed; scraping skipped")

Testing URL connection for Web Scrapping

URL:  https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=0&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100
<Response [200]>


Successfully connected to the site

------- Starting URL Extraction -------

Total topics found after iteration 1 iterations are 100

Total topics found after iteration 2 iterations are 200

Total topics found after iteration 3 iterations are 224

------- Ending URL Extraction -------

------- Starting Data Extraction -------

Iteration: 110 topic: Refresher Reading 02-18-2021  -> PDF Link not found

Iteration: 111 topic: Refresher Reading 11-13-2020  -> PDF Link not found

Iteration: 141 topic: ICE RSS FEED Active Equity Investing: Strategies  -> PDF Link not found

Iteration: 142 topic: test  -> content-utility-curriculum (Year) not found

Iteration: 142 topic: test  -> PDF Link not found

Iteration: 143 topic: TEST RR  -> content-utility-curriculum (Year) not found

Iteration: 143 topic: TEST RR  -> PDF Link not found

Iteration: 144 topic: QA Test RR 1  -> content-utility-curriculum (Year) not found

Iteration: 144 topic: QA Test RR 1  -> PDF Link not found

Iteration: 145 topic: Branded Image Link Added to Refresher Reading  -> content-utility-curriculum (Year) not found

Iteration: 145 topic: Branded Image Link Added to Refresher Reading  -> PDF Link not found

Iteration: 146 topic: Big Data Projects  -> PDF Link not found

Iteration: 147 topic: Big Data Projects  -> PDF Link not found

Iteration: 148 topic: Test 20 Cost of Capital  -> PDF Link not found

Iteration: 151 topic: Big Data Projects  -> PDF Link not found

Iteration: 152 topic: Industry and Company Analysis (2018)  -> PDF Link not found

Iteration: 153 topic: Discounted Dividend Valuation (2018)   -> PDF Link not found

Iteration: 154 topic: The Portfolio Management Process and the Investment Policy Statement (2018)  -> PDF Link not found

Iteration: 155 topic: Employee Compensation: Post-Employment and Share-Based (2018)   -> PDF Link not found

Iteration: 156 topic: Integration of Financial Statement Analysis Techniques (2018)  -> PDF Link not found

Iteration: 157 topic: Free Cash Flow Valuation (2018)  -> PDF Link not found

Iteration: 158 topic: Market-Based Valuation: Price and Enterprise Value Multiples (2018)   -> PDF Link not found

Iteration: 159 topic: Residual Income Valuation (2018)   -> PDF Link not found

Iteration: 160 topic: Yield-Based Bond Convexity and Portfolio Properties  -> Learning Outcomes not found

Iteration: 165 topic: Interest Rate Risk and Return   -> Learning Outcomes not found

Iteration: 166 topic: Yield-Based Bond Duration Measures and Properties  -> Learning Outcomes not found

Iteration: 168 topic: Yield and Yield Spread Measures for Floating-Rate Instruments  -> Learning Outcomes not found

Iteration: 170 topic: Capital Investments and Capital Allocation  -> Learning Outcomes not found

Iteration: 172 topic: Sampling and Estimation  -> content-utility-curriculum (Year) not found

Iteration: 172 topic: Sampling and Estimation  -> PDF Link not found

Iteration: 180 topic: Fixed-Income Markets for Government Issuers  -> Learning Outcomes not found

Iteration: 183 topic: Case Study in Risk Management: Institutional  -> Learning Outcomes not found

Iteration: 184 topic: Hedge Funds  -> Learning Outcomes not found

Iteration: 188 topic: Credit Analysis for Government Issuers  -> Learning Outcomes not found

Iteration: 189 topic: Asset-Backed Security (ABS) Instrument and Market Features  -> Learning Outcomes not found

Iteration: 190 topic: Fixed-Income Bond Valuation: Prices and Yields  -> Learning Outcomes not found

Iteration: 191 topic: The Term Structure of Interest Rates: Spot, Par, and Forward Curves  -> Learning Outcomes not found

Iteration: 192 topic: Alternative Investment Performance and Returns  -> Learning Outcomes not found

Iteration: 193 topic: Investments in Private Capital: Equity & Debt  -> Learning Outcomes not found

Iteration: 194 topic: Credit Risk  -> Learning Outcomes not found

Iteration: 195 topic: Fixed-Income Securitization  -> Learning Outcomes not found

Iteration: 196 topic: Investors and Other Stakeholders  -> Learning Outcomes not found

Iteration: 197 topic: Yield and Yield Spread Measures for Fixed-Rate Bonds  -> Learning Outcomes not found

Iteration: 199 topic: Alternative Investment Features, Methods, and Structures  -> Learning Outcomes not found

Iteration: 200 topic: Curve-Based and Empirical Fixed-Income Risk Measures  -> Learning Outcomes not found

Iteration: 201 topic: Organizational Forms, Corporate Issuer Features, and Ownership  -> Learning Outcomes not found

Iteration: 202 topic: Hedge Funds  -> Learning Outcomes not found

Iteration: 203 topic: Introduction to Digital Assets  -> Learning Outcomes not found

Iteration: 204 topic: Fixed-Income Instrument Features  -> Learning Outcomes not found

Iteration: 205 topic: Multinational Operations (2018)  -> PDF Link not found

Iteration: 206 topic: Natural Resources  -> Learning Outcomes not found

Iteration: 207 topic: Real Estate & Infrastructure  -> Learning Outcomes not found

Iteration: 208 topic: Credit Analysis for Corporate Issuers  -> Learning Outcomes not found

Iteration: 209 topic: Mortgage-Backed Security (MBS) Instrument and Market Features  -> Learning Outcomes not found

Iteration: 210 topic: Fixed-Income Issuance and Trading  -> Learning Outcomes not found

Iteration: 211 topic: Industry and Competitive Analysis  -> Learning Outcomes not found

Iteration: 212 topic: Company Analysis: Forecasting  -> Learning Outcomes not found

Iteration: 216 topic: Company Analysis: Past and Present  -> Learning Outcomes not found

Iteration: 217 topic: Business Models  -> Learning Outcomes not found

Iteration: 218 topic: Fixed-Income Markets for Corporate Issuers  -> Learning Outcomes not found

Iteration: 219 topic: Corporate Governance: Conflicts, Mechanisms, Risks, and Benefits  -> Learning Outcomes not found

Iteration: 220 topic: Fixed-Income Cash Flows and Types  -> Learning Outcomes not found

------- Ending Data Extraction -------

-------Starting Writing to CSV -------

------- Ending Writing to CSV -------