# DAMG 7245 : Assignment 2.1 
## Web Scraping and Dataset Creation

### Scraping 224 URLs from the Finance Institute's Webpage

### Importing required packages

In [1]:
import csv
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from tqdm import tqdm

#### Loading environment Variables 

In [2]:
# Load the settings stored in ../config/.env; the file names and output folder are read from the environment afterwards.
load_dotenv('../config/.env',override=True)

True

In [3]:
def loadenv():
    """Read the output file settings from environment variables.

    Returns:
        tuple: ``(csv_filename, folderpath, txt_filename)`` read from
        ``CSV_CFA_WEB``, ``DIR_CFA_WEB`` and ``TXT_CFA_LINKS`` respectively.
        A variable that is not set yields ``None`` in its position.
    """
    env_keys = ("CSV_CFA_WEB", "DIR_CFA_WEB", "TXT_CFA_LINKS")
    return tuple(os.getenv(key) for key in env_keys)

csv_filename, folderpath, txt_filename = loadenv()

In [4]:
# Display the three configuration values returned by loadenv().
csv_filename, folderpath, txt_filename

('FinanceHub.csv', '../sample_output/', '224_links.txt')

### Scrape function that extracts the links carrying the `CoveoResultLink` class on anchor tags in the given webpage and saves them to a .txt file

- The strategy uses the Edge browser WebDriver and waits a fixed amount of time before handing the page to the Beautiful Soup scraper, because the links to be extracted are loaded dynamically onto the page only after a particular JavaScript snippet has executed successfully.

In [5]:
def scrape_coveo_links(url, wait_seconds=5):
    """Collect the result links from one listing page and append them to the links file.

    The page is opened in an Edge WebDriver session because the result links
    are added by JavaScript after the initial load; the rendered page source
    is then parsed with BeautifulSoup.

    Args:
        url: Address of the listing page to open.
        wait_seconds: How long to sleep after navigation so the dynamic
            content can load. Defaults to 5, the previously hard-coded value.

    Side effects:
        Appends one href per line to ``folderpath + txt_filename`` and prints
        how many ``CoveoResultLink`` anchors were found.
    """
    output_path = folderpath + txt_filename

    # Set up the WebDriver with the command line flag for Edge
    edge_options = webdriver.EdgeOptions()
    edge_options.add_argument('--enable-chrome-browser-cloud-management')

    driver = webdriver.Edge(options=edge_options)

    try:
        driver.get(url)

        # Fixed delay: the links only exist once the page's JavaScript has run.
        time.sleep(wait_seconds)

        # Parse the DOM as rendered by the browser, not the raw HTTP response.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        coveo_links = soup.find_all('a', class_='CoveoResultLink')
        hrefs = [link.get('href') for link in coveo_links]

        # Append mode, because the caller scrapes several pages into one file.
        # utf-8 matches the encoding used where the file is emptied.
        with open(output_path, "a", encoding="utf-8") as file:
            file.writelines(href + '\n' for href in hrefs if href)

        print(f"Total number of Coveo class links: {len(coveo_links)}")
        # Report the configured file name instead of a hard-coded one.
        print(f"Coveo class links saved to '{txt_filename}'")

    finally:
        # Always close the browser, even if navigation or parsing failed.
        driver.quit()

### Looping to scrape links page by page taking `pagination` into account

In [6]:
# Pages are requested from the last one back to the first: offsets 200, 100, 0.
LAST_PAGE_INDEX = 2
RESULTS_PER_PAGE = 100

file_path = folderpath+txt_filename
# Empty the links file first, because scrape_coveo_links appends to it.
try:
    # Write mode cannot create missing directories, so make sure the folder exists.
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    with open(file_path, 'w', encoding='utf-8'):
        pass  # opening in write mode truncates the file

    print(f"The file '{file_path}' has been emptied.")
except Exception as e:
    print(f"An error occurred: {e}")

for count in range(LAST_PAGE_INDEX, -1, -1):
    url = f"https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first={count * RESULTS_PER_PAGE}&sort=%40refreadingcurriculumyear%20descending&numberOfResults={RESULTS_PER_PAGE}"
    scrape_coveo_links(url)

The file '../sample_output/224_links.txt' has been emptied.
Total number of Coveo class links: 24
Coveo class links saved to '224_links.txt'
Total number of Coveo class links: 100
Coveo class links saved to '224_links.txt'
Total number of Coveo class links: 100
Coveo class links saved to '224_links.txt'


### Reading all of the URLs saved in `../sample_output/224_links.txt`

In [7]:

# Read the saved links back, one URL per line.
# Blank lines are skipped so that no empty URL is requested later on.
with open(file_path, 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

urls[:3]


['https://www.cfainstitute.org/membership/professional-development/refresher-readings/Organizational-Forms-Corporate-Issuer-Features-and-Ownership',
 'https://www.cfainstitute.org/membership/professional-development/refresher-readings/Hedge-funds-L1',
 'https://www.cfainstitute.org/membership/professional-development/refresher-readings/Introduction-to-Digital-Assets']

### Defining utility functions for

* clearing extra spaces, tabs, and newlines
* identifying a heading tag
* extracting content from the `Introduction`, `Learning Outcomes`, and `Summary` heading tags


In [8]:

def clean_text(text):
    """Collapse every run of whitespace into a single space and trim both ends.

    Args:
        text: String that may contain repeated spaces, tabs or newlines.

    Returns:
        str: ``text`` with each whitespace run replaced by one space and
        without leading or trailing whitespace.
    """
    return re.sub(r'\s+', ' ', text).strip()

def print_section(soup,title):
    """Return the text that follows the ``<h2>`` heading named ``title``.

    Despite its name, this function prints nothing; it returns the text.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.
        title: Exact heading text to look for, e.g. ``'Summary'``.

    Returns:
        str: The cleaned text of each sibling element after the heading, up to
        the next ``<h2>``, joined with newlines. If the heading is absent or
        extraction fails, the placeholder ``"<title> section is missing."``.
    """
    missing = f"{title} section is missing."

    # `string=` is the current name of bs4's deprecated `text=` argument.
    section = soup.find('h2', string=title)
    if not section:
        return missing

    try:
        content = []
        next_node = section.find_next_sibling()
        # Walk forward through the siblings until the next section heading.
        while next_node and next_node.name != 'h2':
            cleaned_text = clean_text(next_node.get_text(" ", strip=True))
            if cleaned_text:  # Only add non-empty strings
                content.append(cleaned_text)
            next_node = next_node.find_next_sibling()
        return "\n".join(content)
    except Exception:
        # Best effort: a malformed section is reported like a missing one.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        return missing

    
def getContent(soup):
    """Extract the Introduction, Learning Outcomes and Summary sections of a page.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        tuple: ``(introduction, learning_outcomes, summary)`` as returned by
        ``print_section`` for each heading; three empty strings if extraction
        raises an exception.
    """
    titles = ('Introduction', 'Learning Outcomes', 'Summary')
    try:
        introduction, learning_outcomes, summary = (
            print_section(soup, title) for title in titles
        )
        return introduction, learning_outcomes, summary
    except Exception:
        # Best effort, but without swallowing KeyboardInterrupt/SystemExit.
        return "", "", ""

### Identifying other relevant information using following methods

* **Topic Name** : Retrieving the text in the `<title>` tag of the page fetched from the URL
* **Year, Level, Parent Topic** : Preprocessing the text from the `content-utility` class using the function `extract_info_year_level_topic`
* **Categories** : Exploring the `<p>` tag with `class='card-title'` and `text='Categories'`
* **Pdf_link** : Filtering for the anchor tag whose text says `Download the full reading (PDF)` and retrieving the href associated with it


In [9]:

def extract_information(url, timeout=30):
    """Download one reading page and extract the fields stored in the CSV.

    Args:
        url: Address of the reading page.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive page cannot stall the whole run.

    Returns:
        tuple: ``(pdf_link, parent_topic, year, level, introduction,
        learning_outcome, summary, categories, topic_name)``. ``categories``
        is a list of strings; fields that are not found on the page are
        ``None``. If the request fails or the status code is not 200, a tuple
        of nine empty strings is returned instead.
    """
    failure = ("",) * 9

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network errors like a non-200 answer instead of aborting the run.
        print(f"Request failed for {url}: {exc}")
        return failure
    if response.status_code != 200:
        return failure

    soup = BeautifulSoup(response.content, 'html.parser')

    # PDF link: first anchor with the download caption that actually has an href.
    pdf_link = None
    for a_tag in soup.find_all('a'):
        if 'Download the full reading (PDF)' in a_tag.text:
            href = a_tag.get('href')
            if href:
                # urljoin handles site-relative and already-absolute hrefs alike.
                pdf_link = urljoin("https://cfainstitute.org", href)
                break

    # Topic name comes from the page <title>.
    title_tag = soup.find('title')
    topic_name = title_tag.text if title_tag else None

    # Year, level and parent topic are parsed from the content-utility block.
    year = level = parent_topic = None
    content_utility = soup.find('div', class_='content-utility')
    if content_utility:
        utility_text = clean_text(content_utility.get_text())
        year, level, parent_topic = extract_info_year_level_topic(utility_text)

    # Categories: link texts in the <p> tags that follow the 'Categories' card title.
    # `string=` is the current name of bs4's deprecated `text=` argument.
    categories = []
    card_title = soup.find('p', class_='card-title', string='Categories')
    if card_title:
        for sibling in card_title.find_next_siblings('p'):
            a_tag = sibling.find('a')
            if a_tag and a_tag.text.strip():
                categories.append(a_tag.text.strip())

    Intro, LearningOutcome, Summary = getContent(soup)

    return pdf_link, parent_topic, year, level, Intro, LearningOutcome, Summary, categories, topic_name


def extract_info_year_level_topic(topic):
    """Parse curriculum year, CFA level and parent topic from a text line.

    The text is expected to contain
    ``"<year> Curriculum CFA Program Level <I|II|III> <topic text>"``.

    Args:
        topic: Whitespace-normalised text to parse.

    Returns:
        tuple: ``(year, level, parent_topic)`` as strings. ``parent_topic`` is
        the first entry of the known topic list contained in the topic text,
        or ``""`` when none is recognised (year and level are still returned).
        If the pattern is not found at all, three empty strings are returned.
    """
    parent_topics = (
        "Portfolio Management and Wealth Planning",
        "Fixed Income",
        "Corporate Finance",
        "Equity Investments",
        "Financial Reporting and Analysis",
        "Quantitative Methods",
        "Derivatives",
        "Alternative Investments",
        "Economics",
        "Ethical and Professional Standards",
    )
    # Regex pattern for year, level, and parent topic
    pattern = r'(?P<year>20[0-2]\d) Curriculum CFA Program (?P<level>Level [I]{1,3}) (?P<topic>.+)'
    # search() returns the leftmost match, so text that matched at position 0
    # still yields the same result, while leading extra text is tolerated.
    match = re.search(pattern, topic)
    if not match:
        return "", "", ""

    topic_text = match.group('topic')
    parent_topic = next(
        (candidate for candidate in parent_topics if candidate in topic_text),
        "",  # unknown topic: keep year and level instead of discarding them
    )
    return match.group('year'), match.group('level'), parent_topic


def write_to_csv(file_path, header, content):
    """Append one row to a CSV file, writing the header first when the file is new.

    Args:
        file_path: Path of the CSV file; it is created if it does not exist.
        header: Sequence of column names, written only when the file is
            missing or empty.
        content: Sequence of values forming the row to append.
    """
    # An existing but empty file still needs its header.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    # Append mode creates the file when needed. newline='' leaves line endings
    # to the csv module, and utf-8 keeps non-ASCII scraped text writable
    # regardless of the platform's default encoding.
    with open(file_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(header)
        writer.writerow(content)


# Define the path for the CSV file

def delCSV(csv_file_path):
    """Delete the CSV file at ``csv_file_path`` if it exists.

    Args:
        csv_file_path: Path of the file to remove.

    Returns:
        str: ``"CSV file removed successfully."`` when the file was deleted,
        ``"CSV file does not exist."`` when there was nothing to delete.
    """
    # Attempt the removal directly rather than checking first, so the file
    # cannot disappear between the check and the delete.
    try:
        os.remove(csv_file_path)
    except FileNotFoundError:
        return "CSV file does not exist."
    return "CSV file removed successfully."
    

### Calling function to iterate over all the web pages to extract the information

* Looking for existing file
* deleting an existing file
* creating a new CSV 


In [10]:
%%time
from tqdm import tqdm

# deleting old files 
delCSV(folderpath+csv_filename)

# write to CSV 
for url in tqdm(urls[:]):
    write_to_csv(folderpath+csv_filename, ['pdf_link', 'Parent_topic', 'year', 'level', 'Introduction', 'LearningOutcome', 'Summary' , 'categories', 'topicName','url'],[*extract_information(url),url])

100%|█████████████████████████████████████████| 224/224 [01:44<00:00,  2.15it/s]

CPU times: user 14.9 s, sys: 603 ms, total: 15.5 s
Wall time: 1min 44s





In [11]:
# Show the first three URLs from the list read from the links file.
urls[:3]

['https://www.cfainstitute.org/membership/professional-development/refresher-readings/Organizational-Forms-Corporate-Issuer-Features-and-Ownership',
 'https://www.cfainstitute.org/membership/professional-development/refresher-readings/Hedge-funds-L1',
 'https://www.cfainstitute.org/membership/professional-development/refresher-readings/Introduction-to-Digital-Assets']