In [12]:
!pip install requests_html



In [13]:
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()



In [14]:
# Pages to scrape, one refresher reading per entry.
urls = [
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/time-series-analysis",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/credit-analysis-models",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/introduction-alternative-investments",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/credit-default-swaps",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/valuation-contingent-claims",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/introduction-commodities-commodity-derivatives",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/understanding-income-statements",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/pricing-and-valuation-of-forward-commitments",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/private-equity-investments",
    "https://www.cfainstitute.org/membership/professional-development/refresher-readings/valuation-analysis-bonds-embedded-options",
]

In [15]:
import requests
from bs4 import BeautifulSoup
import re

# Scrape the Introduction, Learning Outcomes and Summary sections of one page
def getContent(url, timeout=30):
    """Download a page and return its Introduction, Learning Outcomes and Summary text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``.

    Returns:
        A 3-tuple ``(introduction, learning_outcomes, summary)``. Each item is
        the text found between the matching ``<h2>`` heading and the next
        ``<h2>`` sibling, one sibling element per line, or
        ``"<title> section not found."`` when the heading is absent.
        When the response status is not 200, a message is printed and
        ``(None, None, None)`` is returned so callers can always unpack
        three values.
    """
    # Without a timeout a stalled server would block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the webpage")
        return None, None, None

    # Parse the content of the page with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    def clean_text(text):
        """Collapse every run of whitespace into one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def print_section(title):
        """Return the text that follows the ``<h2>`` whose string equals ``title``."""
        # ``string=`` is the current spelling of the deprecated ``text=`` filter.
        section = soup.find('h2', string=title)
        if not section:
            return f"{title} section not found."

        content = []
        # Walk the following siblings until the next heading (or the end).
        next_node = section.find_next_sibling()
        while next_node and next_node.name != 'h2':
            cleaned_text = clean_text(next_node.get_text(" ", strip=True))
            if cleaned_text:  # Only add non-empty strings
                content.append(cleaned_text)
            next_node = next_node.find_next_sibling()
        return "\n".join(content)

    # Titles of the sections to extract, in the order they are returned
    titles = ['Introduction', 'Learning Outcomes', 'Summary']
    introduction, learning_outcomes, summary = (print_section(title) for title in titles)
    return introduction, learning_outcomes, summary


In [16]:

def extract_information(url, timeout=30):
    """Scrape one page and collect the fields written to the CSV.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for this function's own request, forwarded
            to ``requests.get``.

    Returns:
        An 8-tuple ``(pdf_link, topic_name, year, level, Intro,
        LearningOutcome, Summary, categories)``. Fields that cannot be found
        are ``None`` and ``categories`` is a (possibly empty) list. When the
        response status is not 200, a message is printed and every field is
        ``None`` with an empty ``categories`` list.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Previously execution continued and failed on the undefined `soup`.
        print("error getting webpage")
        return None, None, None, None, None, None, None, []

    # Parse the content of the page with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    def clean_text(text):
        """Collapse every run of whitespace into one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    # PDF link: first anchor with the download label that actually has an href.
    pdf_link = None
    for a_tag in soup.find_all('a'):
        if 'Download the full reading (PDF)' in a_tag.text:
            href = a_tag.get('href')
            if href:
                # urljoin handles root-relative, relative and absolute hrefs.
                pdf_link = urljoin("https://www.cfainstitute.org", href)
                break

    # Topic name is taken from the page <title>.
    title_tag = soup.find('title')
    topic_name = title_tag.text if title_tag else None

    # Year, level and parent topic come from the content utility section.
    year = level = parent_topic = None
    content_utility = soup.find('div', class_='content-utility')
    if content_utility:
        info = extract_info(clean_text(content_utility.get_text()))
        if info:  # extract_info may return None when the text does not match
            year, level, parent_topic = info
    # NOTE(review): parent_topic is extracted but not returned, while the CSV
    # header used by the driver loop labels the second column 'Parent_topic'.
    # Confirm whether topic_name or parent_topic is intended there.

    # Categories: anchors inside the <p> siblings after the 'Categories' title.
    categories = []
    card_title = soup.find('p', class_='card-title', string='Categories')
    if card_title:
        for sibling in card_title.find_next_siblings('p'):
            a_tag = sibling.find('a')
            if a_tag and a_tag.text.strip():
                categories.append(a_tag.text.strip())

    # getContent downloads the same URL again and may return None on failure.
    sections = getContent(url)
    if sections is None:
        sections = (None, None, None)
    Intro, LearningOutcome, Summary = sections

    return pdf_link, topic_name, year, level, Intro, LearningOutcome, Summary, categories

In [17]:
import re


def extract_info(topic):
    """Parse the curriculum year, CFA level and parent topic from a header string.

    Args:
        topic: Text expected to start with
            ``"<year> Curriculum CFA Program Level <I|II|III> <topic text>"``.

    Returns:
        A 3-tuple ``(year, level, parent_topic)`` of strings. All three are
        ``None`` when the text does not match the pattern; ``parent_topic``
        alone is ``None`` when the text matches but names none of the known
        parent topics. A tuple is always returned so callers can unpack it.
    """
    parent_topics = [
        "Portfolio Management and Wealth Planning",
        "Fixed Income",
        "Corporate Finance",
        "Equity Investments",
        "Financial Reporting and Analysis",
        "Quantitative Methods",
        "Derivatives",
        "Alternative Investments",
        "Economics",
        "Ethical and Professional Standards"
    ]
    # Regex pattern for year, level, and parent topic
    pattern = r'(?P<year>20[0-2]\d) Curriculum CFA Program (?P<level>Level [I]{1,3}) (?P<topic>.+)'
    match = re.match(pattern, topic)
    if not match:
        return None, None, None

    year = match.group('year')
    level = match.group('level')
    topic_text = match.group('topic')

    # First predefined parent topic contained in the trailing text, if any.
    parent_topic = next(
        (candidate for candidate in parent_topics if candidate in topic_text),
        None,
    )
    return year, level, parent_topic
import csv
import os

def write_to_csv(file_path, header, content):
    """Append one row to a CSV file, writing the header first when needed.

    Args:
        file_path: Path of the CSV file; it is created when missing.
        header: Sequence of column names, written only when the file is
            missing or empty.
        content: Sequence of values forming the row to append.
    """
    # A missing or zero-byte file has no header yet.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    # Append mode creates the file when absent. An explicit UTF-8 encoding
    # keeps non-ASCII text from failing under a narrower platform default.
    with open(file_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(header)
        writer.writerow(content)

# Example usage
# file_path = 'example.csv'
# header = ['Column1', 'Column2', 'Column3']  # Define your header columns
# content = ['Data1', 'Data2', 'Data3']  # This should be a list representing a row of data

# write_to_csv(file_path, header, content)


## Main driver: the cells below derive a topic name per URL and append one CSV row per page

In [18]:
def extract_topic_name(link):
    """Derive a title-cased topic name from the last path segment of a URL.

    Trailing slashes, query strings and fragments are ignored, so
    ``".../time-series-analysis/?x=1"`` yields ``"Time Series Analysis"``.

    Args:
        link: URL (or bare slug) whose final path segment names the topic.

    Returns:
        The last path segment with hyphens replaced by spaces and converted
        to title case, or ``"Topic name not found"`` when the path has no
        segment.
    """
    from urllib.parse import urlparse

    # Only the path matters; rstrip('/') lets a trailing slash still resolve.
    path = urlparse(link.strip()).path.rstrip('/')
    slug = path.rsplit('/', 1)[-1]
    if not slug:
        return "Topic name not found"
    # Replace hyphens with spaces and convert to title case
    return slug.replace('-', ' ').title()

In [19]:
# Scrape every page and append one row per URL to FinanceHub.csv.
csv_header = [
    'pdf_link', 'Parent_topic', 'year', 'level', 'Introduction',
    'LearningOutcome', 'Summary', 'categories', 'url', "topicName",
]
for url in urls:
    # Scraped fields first, then the source URL and the name derived from it.
    csv_row = [*extract_information(url), url, extract_topic_name(url)]
    write_to_csv('FinanceHub.csv', csv_header, csv_row)

In [20]:
# Preview the topic name derived from each URL, one per line.
for reading_url in urls:
    topic_label = extract_topic_name(reading_url)
    print(topic_label)

Time Series Analysis
Credit Analysis Models
Introduction Alternative Investments
Credit Default Swaps
Valuation Contingent Claims
Introduction Commodities Commodity Derivatives
Understanding Income Statements
Pricing And Valuation Of Forward Commitments
Private Equity Investments
Valuation Analysis Bonds Embedded Options
