In [10]:
from datetime import datetime
import logging
from pathlib import Path
from typing import List, Tuple

from bs4 import BeautifulSoup as bs
from IPython.display import display, Markdown
import pandas as pd
import requests
from selenium import webdriver

In [8]:
# Configure logging.
# The log path is built with pathlib so it is portable across operating systems
# and does not depend on backslash escape sequences inside a string literal.
log_path = Path("logs") / "webscrapping.log"
log_path.parent.mkdir(parents=True, exist_ok=True)  # FileHandler fails if the folder is missing
file_name = str(log_path)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


# display log message in-line
class NotebookHandler(logging.Handler):
    """Logging handler that renders every record in the notebook as Markdown."""

    def emit(self, record):
        display(Markdown(self.format(record)))


# Remove handlers left over from an earlier run of this cell. Without this,
# re-running the cell stacks another NotebookHandler on the logger and every
# message is shown multiple times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# mode="w" sets the file to empty at the start of every run
file_handler = logging.FileHandler(file_name, mode="w")
file_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)

logger.addHandler(NotebookHandler())

In [3]:
# test the URL connection
url_test = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first={first}&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100".format(first=0)
try:
    logger.info('Testing URL connection for Web Scrapping')
    print("URL: ", url_test)
    # a timeout keeps the cell from hanging forever on an unresponsive server
    response_test = requests.get(url_test, timeout=30)
    print(response_test)
    # turn 4xx/5xx answers into exceptions so they are not reported as success
    response_test.raise_for_status()
    logger.info('Successfully connected to the site')
except requests.RequestException as exc:
    # catch only request failures (not KeyboardInterrupt etc.) and keep the cause
    logger.error("Error in loading the URL: %s", exc)

Testing URL connection for Web Scrapping

URL:  https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=0&sort=%40refreadingcurriculumyear%20descending&numberOfResults=100
<Response [200]>


Successfully connected to the site

In [4]:
# function to return individual page Links
def getPageURLs() -> list:
    """Collect the title and link of every refresher reading listed on the CFA site.

    The listing is paginated. Each page is opened in a fresh Selenium Chrome
    session, its HTML is parsed with BeautifulSoup, and the first ``<a>`` of
    every ``div.coveo-list-layout`` element is recorded. Paging continues while
    a ``li.coveo-pager-next`` element is present on the page.

    Returns:
        list: ``(link_text, href)`` tuples in the order they were found. If an
        error occurs, the error is logged, scraping stops, and the entries
        gathered up to that point are returned.
    """
    # base URL
    base_url = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings"

    # number of results requested per page; also the step between page offsets
    results_per_page = 100

    # additional section for the URL to specify the first row and the number of results
    additional_section = "#first={first}&sort=%40refreadingcurriculumyear%20descending&numberOfResults={size}"

    individual_topics_urls = []  # store individual page URLs (sub pages to collect data)
    first_entry = 0  # start of the entries on web page
    has_more_pages = True  # flag to keep scraping
    count = 0
    logger.info("------- Starting URL Extraction -------")
    while has_more_pages:
        driver = None
        try:
            # increment counter
            count += 1

            # Set up Selenium WebDriver to scrape dynamic data
            driver = webdriver.Chrome()

            # Construct URL to scrape data
            url = base_url + additional_section.format(first=first_entry, size=results_per_page)

            # load the web page that has dynamic content with the help of Selenium
            driver.get(url)

            # use BeautifulSoup to parse the rendered HTML source
            parsed_content = bs(driver.page_source, 'html.parser')

            # find every result entry on the page and record its title and URL
            for parse in parsed_content.find_all("div", {"class": "coveo-list-layout"}):
                a_tag = parse.find('a')  # get the <a> tag
                if a_tag is None:
                    # an entry without a link must not abort the whole scrape
                    continue
                individual_topics_urls.append((a_tag.text, a_tag.get('href')))

            # Find the Next page button on the page
            next_button = parsed_content.find_all("li", {"class": "coveo-pager-next"})

            # if next button is present go to next page
            if next_button:
                first_entry += results_per_page
            else:
                has_more_pages = False

            logger.info("Total topics found after iteration {count} iterations are {length}".format(count=count, length=len(individual_topics_urls)))

        except Exception as e:
            # "%s" is required: logging formats the message lazily with its args
            logger.error("Exception: %s", e)
            break
        finally:
            # Always close the browser, even when the page failed to load or parse
            if driver is not None:
                driver.quit()

    logger.info("------- Ending URL Extraction -------")
    return individual_topics_urls

In [5]:
# Run the scraper and keep the collected (title, URL) tuples for the cells below
li = getPageURLs()

------- Starting URL Extraction -------

Total topics found after iteration 1 iterations are 100

Total topics found after iteration 2 iterations are 200

Total topics found after iteration 3 iterations are 224

------- Ending URL Extraction -------

In [15]:
def extractData(individual_topics_urls: List[Tuple[str, str]]):
    """Placeholder for the per-topic extraction step.

    Args:
        individual_topics_urls: list of string pairs, presumably the
            (title, URL) tuples produced by ``getPageURLs`` (TODO confirm).
            The argument is not used yet.

    Returns:
        None. The stub only prints ``1``.
    """
    # TODO: replace the placeholder output with the real extraction logic
    print(1)
    return None

In [None]:
# function to scrape Data from CFA site and return the HTML contents (Raw Data)
def extractHTML():
    """Fetch the list of topic pages by calling ``getPageURLs``.

    NOTE(review): the function is unfinished. It stores the result in a local
    variable and then returns None, so no HTML is produced yet even though the
    comment above the function says it should return the raw HTML contents.
    """
    individual_topics_urls = getPageURLs()

In [14]:
# NOTE(review): looks like a leftover scratch call that only emits an ERROR
# record through the configured handlers; consider removing this cell.
logger.error("Error in loading the URL")

In [9]:
# Show the scraped (title, URL) tuples returned by getPageURLs
li

[('Time-Series Analysis ',
  'https://www.cfainstitute.org/membership/professional-development/refresher-readings/time-series-analysis'),
 ('Credit Analysis Models ',
  'https://www.cfainstitute.org/membership/professional-development/refresher-readings/credit-analysis-models'),
 ('Introduction to Alternative Investments ',
  'https://www.cfainstitute.org/membership/professional-development/refresher-readings/introduction-alternative-investments'),
 ('Credit Default Swaps ',
  'https://www.cfainstitute.org/membership/professional-development/refresher-readings/credit-default-swaps'),
 ('Valuation of Contingent Claims ',
  'https://www.cfainstitute.org/membership/professional-development/refresher-readings/valuation-contingent-claims'),
 ('Introduction to Commodities and Commodity Derivatives ',
  'https://www.cfainstitute.org/membership/professional-development/refresher-readings/introduction-commodities-commodity-derivatives'),
 ('Understanding Income Statements ',
  'https://www.cfai