## URLClass

In [141]:
import pytest
from pydantic import BaseModel, HttpUrl, ValidationError, constr, field_validator
from typing import Optional

class URLClass(BaseModel):
    """Validated record for one scraped refresher reading.

    Field constraints:
      * ``title`` / ``topic``: 1-200 characters and not whitespace-only.
      * ``published_year``: integer in the inclusive range 2018-2024.
      * ``level``: exactly "Level I", "Level II" or "Level III".
      * ``introduction``: may be None or empty; otherwise at least 50 characters.
      * ``learning_outcomes``: may be None; otherwise at least 10 words
        (an empty string is rejected).
      * ``summary`` / ``overview``: may be None or empty; otherwise at least
        10 words.
      * ``link``: an http(s) URL starting with "https://www.cfainstitute.org".
    """

    title: constr(min_length=1, max_length=200)
    topic: constr(min_length=1, max_length=200)
    published_year: int
    level: constr(pattern=r'^Level\s(I|II|III)$')
    introduction: Optional[str]
    learning_outcomes: Optional[str]
    summary: Optional[str]
    overview: Optional[str]
    link: HttpUrl

    @field_validator('published_year')
    @classmethod
    def validate_published_year(cls, value):
        """Accept only years in the inclusive range 2018-2024."""
        if not (2018 <= value <= 2024):
            raise ValueError('Published year must be between 2018 and 2024')
        return value

    @field_validator('learning_outcomes')
    @classmethod
    def validate_learning_outcomes(cls, value):
        """Require at least 10 whitespace-separated words when a string is given."""
        # The field is Optional, so None has to pass through untouched: calling
        # .split() on None raises AttributeError, which is not a ValueError and
        # therefore escapes instead of being reported as a validation error.
        # An empty string is still rejected here, unlike introduction/summary/
        # overview, whose validators skip falsy values.
        if value is not None and len(value.split()) < 10:
            raise ValueError('Learning outcomes must be at least 10 words long')
        return value

    @field_validator('title')
    @classmethod
    def validate_title(cls, value):
        """Reject titles that are only whitespace (min_length alone allows them)."""
        if not value.strip():
            raise ValueError('Title cannot be empty')
        return value

    @field_validator('topic')
    @classmethod
    def validate_topic(cls, value):
        """Reject topics that are only whitespace (min_length alone allows them)."""
        if not value.strip():
            raise ValueError('Topic cannot be empty')
        return value

    @field_validator('level')
    @classmethod
    def validate_level(cls, value):
        """Restrict the level to the three exact spellings."""
        # The constr pattern uses \s, which matches any whitespace character;
        # this membership test narrows it to a single plain space.
        if value not in ['Level I', 'Level II', 'Level III']:
            raise ValueError('Level must be one of: "Level I", "Level II", "Level III"')
        return value

    @field_validator('introduction')
    @classmethod
    def validate_introduction(cls, value):
        """Require at least 50 characters unless the value is None or empty."""
        if value and len(value) < 50:
            raise ValueError('Introduction must be at least 50 characters long')
        return value

    @field_validator('summary')
    @classmethod
    def validate_summary(cls, value):
        """Require at least 10 words unless the value is None or empty."""
        if value and len(value.split()) < 10:
            raise ValueError('Summary must be at least 10 words long')
        return value

    @field_validator('overview')
    @classmethod
    def validate_overview(cls, value):
        """Require at least 10 words unless the value is None or empty."""
        if value and len(value.split()) < 10:
            raise ValueError('Overview must be at least 10 words long')
        return value

    @field_validator('link')
    @classmethod
    def validate_link(cls, value):
        """Require the URL to start with the CFA Institute site prefix."""
        # value has already been parsed as HttpUrl; compare its string form.
        if not str(value).startswith('https://www.cfainstitute.org'):
            raise ValueError('Link must start with "https://www.cfainstitute.org"')
        return value

# Testing the validations
# Add separate test functions for each validation


In [154]:
import pytest
from pydantic import ValidationError

def test_valid_data(data):
    """Build a URLClass from ``data`` and report success.

    ``data`` is a mapping of URLClass field names to values. A ValidationError
    raised by the model is not caught here and propagates to the caller.
    """
    record = URLClass(**data)
    assert record
    print("Valid data")
    
@pytest.mark.parametrize("invalid_title", ["", None])
def test_invalid_title(invalid_title):
    """URLClass must raise ValidationError for the given title; the error is printed."""
    payload = dict(
        title=invalid_title,
        topic="Quantitative Methods",
        published_year=2024,
        level="Level II",
        introduction="Investment firms are increasingly using technology at every step of the investment management value chain—from improving their understanding of clients to uncovering new sources of alpha and executing trades more efficiently. Machine learning techniques, a central part of that technology, are the subject of this reading. These techniques first appeared in finance in the 1990s and have since flourished with the explosion of data and cheap computing power.This reading provides a high-level view of machine learning (ML).",
        learning_outcomes="The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        summary="Machine learning methods are gaining usage at many stages in the investment management value chain. Among the major points made are the following",
        overview="",
        link="https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning",
    )
    with pytest.raises(ValidationError) as raised:
        URLClass(**payload)
    # Show the message of the captured validation error.
    print(str(raised.value))

# Test function for invalid published_year
@pytest.mark.parametrize("invalid_published_year", [1999, 2051])
def test_invalid_published_year(invalid_published_year):
    """URLClass must raise ValidationError for the given year; the error is printed."""
    payload = dict(
        title="Machine Learning",
        topic="Quantitative Methods",
        published_year=invalid_published_year,
        level="Level II",
        introduction="Investment firms are increasingly using technology at every step of the investment management value chain—from improving their understanding of clients to uncovering new sources of alpha and executing trades more efficiently. Machine learning techniques, a central part of that technology, are the subject of this reading. These techniques first appeared in finance in the 1990s and have since flourished with the explosion of data and cheap computing power.This reading provides a high-level view of machine learning (ML).",
        learning_outcomes="The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        summary="Machine learning methods are gaining usage at many stages in the investment management value chain. Among the major points made are the following",
        overview="",
        link="https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning",
    )
    # Every other field is valid, so only the year can trigger the error.
    with pytest.raises(ValidationError) as raised:
        URLClass(**payload)
    print(str(raised.value))

@pytest.mark.parametrize("invalid_link", ["invalid_link", "ftp://example.com"])
def test_invalid_link(invalid_link):
    """URLClass must raise ValidationError for the given link; the error is printed."""
    payload = dict(
        title="Machine Learning",
        topic="Quantitative Methods",
        published_year=2024,
        level="Level II",
        introduction="Investment firms are increasingly using technology at every step of the investment management value chain—from improving their understanding of clients to uncovering new sources of alpha and executing trades more efficiently. Machine learning techniques, a central part of that technology, are the subject of this reading. These techniques first appeared in finance in the 1990s and have since flourished with the explosion of data and cheap computing power.This reading provides a high-level view of machine learning (ML).",
        learning_outcomes="The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        summary="Machine learning methods are gaining usage at many stages in the investment management value chain. Among the major points made are the following",
        overview="",
        link=invalid_link,  # the value under test
    )
    # Every other field is valid, so only the link can trigger the error.
    with pytest.raises(ValidationError) as raised:
        URLClass(**payload)
    print(str(raised.value))


### VALID DATA EXAMPLE 1

In [156]:
# Record expected to pass URLClass validation (Level I, 2024).
# NOTE(review): the title says "Time Value of Money in Finance", but the text
# fields and the link describe the hypothesis-testing reading — confirm which
# title is intended.
data1 = {
        "title": "Time Value of Money in Finance",
        "topic": "Quantitative Methods",
        "published_year": 2024,
        "level": "Level I",
        "introduction": "Faced with an overwhelming amount of data, analysts must deal with the task of wrangling those data into something that provides a clearer picture of what is going on. We use the concepts and tools of hypothesis testing to address these issues. Hypothesis testing is part of statistical inference, the process of making judgments about a larger group (a population) based on a smaller group of observations (that is, a sample). The concepts and tools of hypothesis testing provide an objective means to gauge whether the available evidence supports the hypothesis. After applying a statistical test of a hypothesis, we should have a clearer idea of the probability that a hypothesis is true or not, although our conclusion always stops short of certainty.",
        "learning_outcomes": "The member should be able to: define a hypothesis, describe the steps of hypothesis testing, and describe and interpret the choice of the null and alternative hypotheses; compare and contrast one-tailed and two-tailed tests of hypotheses; explain a test statistic, Type I and Type II errors, a significance level, how significance levels are used in hypothesis testing, and the power of a test; explain a decision rule and the relation between confidence intervals and hypothesis tests, and determine whether a statistically significant result is also economically meaningful.",
        "summary": "In this reading, we have presented the concepts and methods of statistical inference and hypothesis testing. A hypothesis is a statement about one or more populations. The steps in testing a hypothesis are as follows: State the hypotheses. Identify the appropriate test statistic and its probability distribution. Specify the significance level. State the decision rule. Collect the data and calculate the test statistic. Make a decision.",
        "overview": "",
        "link": "https://www.cfainstitute.org/membership/professional-development/refresher-readings/hypothesis-testing"
    }
test_valid_data(data1)

Valid data


### VALID DATA EXAMPLE 2

In [159]:
# Record expected to pass URLClass validation: the hypothesis-testing reading
# (Level I, 2024) with an empty overview.
data2 = {
        "title": "Hypothesis Testing",
        "topic": "Quantitative Methods",
        "published_year": 2024,
        "level": "Level I",
        "introduction": "Faced with an overwhelming amount of data, analysts must deal with the task of wrangling those data into something that provides a clearer picture of what is going on. We use the concepts and tools of hypothesis testing to address these issues. Hypothesis testing is part of statistical inference, the process of making judgments about a larger group (a population) based on a smaller group of observations (that is, a sample). The concepts and tools of hypothesis testing provide an objective means to gauge whether the available evidence supports the hypothesis. After applying a statistical test of a hypothesis, we should have a clearer idea of the probability that a hypothesis is true or not, although our conclusion always stops short of certainty.",
        "learning_outcomes": "The member should be able to: define a hypothesis, describe the steps of hypothesis testing, and describe and interpret the choice of the null and alternative hypotheses; compare and contrast one-tailed and two-tailed tests of hypotheses; explain a test statistic, Type I and Type II errors, a significance level, how significance levels are used in hypothesis testing, and the power of a test; explain a decision rule and the relation between confidence intervals and hypothesis tests, and determine whether a statistically significant result is also economically meaningful.",
        "summary": "In this reading, we have presented the concepts and methods of statistical inference and hypothesis testing. A hypothesis is a statement about one or more populations. The steps in testing a hypothesis are as follows: State the hypotheses. Identify the appropriate test statistic and its probability distribution. Specify the significance level. State the decision rule. Collect the data and calculate the test statistic. Make a decision.",
        "overview": "",
        "link": "https://www.cfainstitute.org/membership/professional-development/refresher-readings/hypothesis-testing"
    }
test_valid_data(data2)

Valid data


### VALID DATA EXAMPLE 3

In [157]:
# Record expected to pass URLClass validation: the machine-learning reading
# (Level II, 2024).
# NOTE(review): this rebinds the name data2, replacing the dict assigned in the
# previous example cell.
data2 = {
        "title": "Machine Learning",
        "topic": "Quantitative Methods",
        "published_year": 2024,
        "level": "Level II",
        "introduction": "Investment firms are increasingly using technology at every step of the investment management value chain—from improving their understanding of clients to uncovering new sources of alpha and executing trades more efficiently. Machine learning techniques, a central part of that technology, are the subject of this reading. These techniques first appeared in finance in the 1990s and have since flourished with the explosion of data and cheap computing power.This reading provides a high-level view of machine learning (ML).",
        "learning_outcomes": "The member should be able to: describe supervised machine learning, unsupervised machine learning, and deep learning;",
        "summary": "Machine learning methods are gaining usage at many stages in the investment management value chain. Among the major points made are the following",
        "overview": "",
        "link": "https://www.cfainstitute.org/membership/professional-development/refresher-readings/machine-learning"
    }
test_valid_data(data2)

Valid data


### INVALID DATA EXAMPLE 1

In [161]:
# 1992 is below the 2018 lower bound enforced by URLClass.validate_published_year.
test_invalid_published_year(1992)

1 validation error for URLClass
published_year
  Value error, Published year must be between 2018 and 2024 [type=value_error, input_value=1992, input_type=int]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error


### INVALID DATA EXAMPLE 2

In [192]:
# Record with an empty learning_outcomes string, which URLClass rejects
# (the 10-word minimum is applied even to an empty value).
data3 = {
        "title": "Interest Rate Risk and Return",
        "topic": "Fixed Income",
        "published_year": 2024,
        "level": "Level I",
        "introduction": "",
        "learning_outcomes": "",
        "summary": "",
        "overview": "Prior lessons on yield measures established that a fixed-income investor’s rate of return will equal a bond’s yield-to-maturity (YTM) under certain assumptions. In these lessons, we explore the sources of return for fixed-income investments and demonstrate investment returns in different scenarios, including the one embedded in the YTM calculations. Prior lessons also established interest rate risk. We show how investment horizon, in relation to a bond’s features, is a key determinant of interest rate risk for investors and how different investors in the same fixed-income invest- ment can have different returns and views on risk.",
        "link": "https://www.cfainstitute.org/membership/professional-development/refresher-readings/Interest-Rate-Risk-and-Return"
    }
# The failure is the point of this example: catch and display it so the cell
# does not abort a Restart & Run All with an uncaught traceback.
try:
    test_valid_data(data3)
except ValidationError as e:
    print(e)

ValidationError: 1 validation error for URLClass
learning_outcomes
  Value error, Learning outcomes must be at least 10 words long [type=value_error, input_value='', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error

### INVALID DATA EXAMPLE 3

In [163]:
# An empty title violates the min_length=1 constraint on URLClass.title.
test_invalid_title("")

1 validation error for URLClass
title
  String should have at least 1 character [type=string_too_short, input_value='', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/string_too_short


### INVALID DATA EXAMPLE 4

In [164]:
# Non-http(s) scheme: per the recorded output, rejected by the HttpUrl type itself.
test_invalid_link("ftp://example.com")

1 validation error for URLClass
link
  URL scheme should be 'http' or 'https' [type=url_scheme, input_value='ftp://example.com', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/url_scheme


### INVALID DATA EXAMPLE 5

In [165]:
# Well-formed https URL that fails URLClass.validate_link's
# "https://www.cfainstitute.org" prefix check.
test_invalid_link("https://example.com")

1 validation error for URLClass
link
  Value error, Link must start with "https://www.cfainstitute.org" [type=value_error, input_value='https://example.com', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error


## MetaDataPDFClass

In [196]:
import pytest
from pydantic import BaseModel, ValidationError, FilePath
from pathlib import Path

class MetaDataPDFClass(BaseModel):
    """Metadata record: a text value, its section title and a source file path.

    ``text`` and ``section_title`` must contain at least one non-whitespace
    character; ``file_path`` must point at something that exists on disk.
    ``para`` and ``pages`` are plain strings with no extra checks.
    """

    text: constr(min_length=1)
    section_title: constr(min_length=1)
    file_path: Path
    para: str
    pages: str

    @field_validator('text')
    @classmethod
    def validate_text(cls, value):
        """Return ``value`` unless it is blank after stripping whitespace."""
        if value.strip():
            return value
        raise ValueError('Text cannot be empty')

    @field_validator('section_title')
    @classmethod
    def validate_section_title(cls, value):
        """Return ``value`` unless it is blank after stripping whitespace."""
        if value.strip():
            return value
        raise ValueError('Section title cannot be empty')

    @field_validator('file_path')
    @classmethod
    def validate_file_path(cls, value):
        """Return ``value`` only when the path exists on the filesystem."""
        if value.exists():
            return value
        raise ValueError('File path does not exist')



# def test_invalid_text():
#     # Invalid text (empty)
#     data = {
#         "text": "",
#         "section_title": "",
#         "file_path": "/Users/sudarshan/Big_Data",
#         "para":"0",
#         "pages":"('1', '1')"
#     }
#     assert MetaDataPDFClass(**data)
    
# argvalues must be a list of cases: the bare string "" is an empty iterable,
# which gives pytest an empty parameter set and no executed case.
@pytest.mark.parametrize("invalid_text", [""])
def test_invalid_text(invalid_text):
    """MetaDataPDFClass must raise ValidationError for the given text; the error is printed."""
    data = {
        "text": invalid_text,
        "section_title": "Introduction",
        "file_path": "/Users/sudarshan/Big_Data/data.json",
        "para":"0",
        "pages":"('1', '1')"
    }
    # Ensure validation error is raised for invalid text
    with pytest.raises(ValidationError) as excinfo:
        MetaDataPDFClass(**data)
    print(str(excinfo.value))

def test_valid_meta_data():
    """A fully valid payload must construct a MetaDataPDFClass without error.

    The model requires ``file_path`` to exist, so the check creates its own
    temporary file instead of depending on a path that only exists on one
    machine.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        existing_file = Path(tmp_dir) / "data.json"
        existing_file.write_text("{}")
        # Valid data
        data = {
            "text": "Some text from PDF",
            "section_title": "Introduction",
            "file_path": str(existing_file),
            "para":"0",
            "pages":"('1', '1')"
        }
        # Ensure no validation error is raised (must run while the file exists)
        assert MetaDataPDFClass(**data)


# argvalues must be a list of cases: a bare string is iterated character by
# character, so the first generated case would be "/" — a path that exists.
@pytest.mark.parametrize("file_path", ["/path/to/nonexistent/file.pdf"])
def test_invalid_file_path(file_path):
    """MetaDataPDFClass must raise ValidationError for a path that does not exist."""
    # Invalid file path (file does not exist)
    data = {
        "text": "Some text from PDF",
        "section_title": "Introduction",
        "file_path": file_path,
        "para":"0",
        "pages":"('1', '1')"
    }
    # Ensure validation error is raised for the nonexistent file path
    with pytest.raises(ValidationError) as excinfo:
        MetaDataPDFClass(**data)
    print(str(excinfo.value))


In [197]:
# Run the positive MetaDataPDFClass check; if the model rejects the payload,
# print the ValidationError instead of letting the cell fail with a traceback.
try:
    test_valid_meta_data()
except ValidationError as e:
    print(e)
    


### INVALID DATA EXAMPLE 6

In [200]:
# test_invalid_text captures the ValidationError itself (pytest.raises) and
# prints it, so the except branch here only acts as a fallback.
try:
    test_invalid_text("")
except ValidationError as e:
    print(e)

1 validation error for MetaDataPDFClass
text
  String should have at least 1 character [type=string_too_short, input_value='', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/string_too_short


### INVALID DATA EXAMPLE 7

In [201]:
# test_invalid_file_path captures the ValidationError itself (pytest.raises)
# and prints it, so the except branch here only acts as a fallback.
try:
    test_invalid_file_path("/path/to/nonexistent/file.pdf")
except ValidationError as e:
    print(e)

1 validation error for MetaDataPDFClass
file_path
  Value error, File path does not exist [type=value_error, input_value='/path/to/nonexistent/file.pdf', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error


## ContentPDFClass

In [206]:
class ContentPDFClass(BaseModel):
    """Model pairing a file name with its extracted content."""
    # Only ASCII letters and whitespace are accepted, with at least one character.
    file_name: constr(pattern=r'^[A-Za-z\s]+$')
    # Free-form string; no additional constraints.
    extracted_content:  str


# argvalues must be a list of cases: the bare string "____" is iterated
# character by character, producing four identical "_" cases.
@pytest.mark.parametrize("file_name", ["____"])
def test_invalid_file_name(file_name):
    """ContentPDFClass must raise ValidationError for a name outside the letters/whitespace pattern."""
    # Invalid file name (characters other than letters and whitespace)
    data = {
        "file_name": file_name,
        "extracted_content": "example extracted content"
    }
    # Ensure validation error is raised for the invalid file name
    with pytest.raises(ValidationError) as excinfo:
        ContentPDFClass(**data)
    print(str(excinfo.value))
    



### INVALID DATA EXAMPLE 8

In [207]:
# Underscores are outside the letters/whitespace pattern of ContentPDFClass.file_name.
# test_invalid_file_name captures and prints the ValidationError itself, so the
# except branch here only acts as a fallback.
try:
    test_invalid_file_name("_____")
except ValidationError as e:
    print(e)

1 validation error for ContentPDFClass
file_name
  String should match pattern '^[A-Za-z\s]+$' [type=string_pattern_mismatch, input_value='_____', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/string_pattern_mismatch


## Applying the validation in the scraper

In [152]:
!pip install chromedriver-autoinstaller

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')


from selenium import webdriver
import chromedriver_autoinstaller


# Set up chromedriver through chromedriver_autoinstaller; the recorded output
# below shows the call returning the path of the chromedriver binary.
chromedriver_autoinstaller.install()





'/Users/sudarshan/anaconda3/lib/python3.11/site-packages/chromedriver_autoinstaller/122/chromedriver'

In [153]:
#!/usr/bin/env python
# coding: utf-8

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.common.exceptions import NoSuchElementException

def initialize_driver():
    """Start a Chrome WebDriver session with a maximized window and return it."""
    # Set up chromedriver before the browser is launched.
    chromedriver_autoinstaller.install()

    # Default Chrome options. The headless flags (--headless, --no-sandbox,
    # --disable-dev-shm-usage) are intentionally not applied here.
    options = webdriver.ChromeOptions()

    browser = webdriver.Chrome(options=options)
    browser.maximize_window()
    return browser

def close_privacy_warning(driver):
    """Dismiss the privacy banner if the page is showing one.

    Looks up the element with id ``closePrivacyWarning`` and clicks it. When
    the element is not present there is nothing to dismiss, so the function
    returns quietly instead of aborting the scrape.
    """
    try:
        close_button = driver.find_element(By.ID, "closePrivacyWarning")
    except NoSuchElementException:
        return
    close_button.click()

def click_next_button(driver):
    """Advance the result list by one page.

    Returns ``driver`` after clicking the element with class
    ``coveo-pager-next`` and pausing two seconds, or ``None`` when a
    NoSuchElementException is raised along the way.
    """
    try:
        driver.find_element(By.CLASS_NAME, "coveo-pager-next").click()
        time.sleep(2)
    except NoSuchElementException:
        return None
    return driver

def scrape(driver, refresher_readings_list):
    """Collect ``[title, link]`` pairs from the page currently loaded in ``driver``.

    Each ``h4.coveo-title`` element contributes its stripped text and the
    ``href`` of its ``a.CoveoResultLink`` child. Pairs are appended to
    ``refresher_readings_list`` in place; nothing is returned.
    """
    time.sleep(2)  # Wait for the page to load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for title in soup.find_all('h4', class_='coveo-title'):
        anchor = title.find('a', class_='CoveoResultLink')
        # A heading without a result link (or a link without href) cannot be
        # followed later; skip it instead of failing on None['href'].
        if anchor is None or anchor.get('href') is None:
            continue
        refresher_readings_list.append([title.text.strip(), anchor['href']])

def get_reading_detail_data(driver, reading):
    """Load one reading's detail page and extract its fields.

    Args:
        driver: WebDriver used to fetch the page.
        reading: sequence whose second item (``reading[1]``) is the page URL.

    Returns:
        dict with the keys ``title``, ``topic``, ``published_year``, ``level``,
        ``introduction``, ``learning_outcomes``, ``summary``, ``overview`` and
        ``link``. Values that cannot be found on the page stay ``""``;
        ``title`` and ``link`` are never filled in here.
    """
    driver.get(reading[1])
    time.sleep(5)  # fixed wait before reading the rendered page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    data = {
        "title": "",
        "topic": "",
        "published_year": "",
        "level": "",
        "introduction": "",
        "learning_outcomes": "",
        "summary": "",
        "overview": "",
        "link": ""
    }

    meta_data = soup.find('div', class_="content-utility")
    # A page without the metadata container is treated as "no metadata"
    # rather than failing on None.find_all().
    span_elements = (
        meta_data.find_all('span', class_=['content-utility-curriculum', 'content-utility-topic'])
        if meta_data is not None
        else []
    )

    # Spans are read positionally: year, level, topic. All three must exist.
    if len(span_elements) >= 3:
        year_tokens = span_elements[0].text.strip().split()
        if year_tokens:  # an empty span would otherwise raise IndexError
            data["published_year"] = year_tokens[0]
        data["level"] = span_elements[1].text.strip()
        data["topic"] = span_elements[2].text.strip()

    # Headings whose content is taken from the element that follows them.
    following_element_sections = {
        "Learning Outcomes": "learning_outcomes",
        "Summary": "summary",
        "Overview": "overview",
    }
    for section in soup.find_all('h2', class_="article-section"):
        heading = section.text
        if heading == "Introduction":
            # The introduction is the text of the heading's parent element.
            key, container = "introduction", section.find_parent()
        elif heading in following_element_sections:
            key, container = following_element_sections[heading], section.find_next()
        else:
            continue
        if container is not None:
            data[key] = container.text.strip()

    return data

        
def scrape_reading_detail(refresher_readings_list):
    """Scrape and validate the detail page of every listed reading.

    Args:
        refresher_readings_list: iterable of ``[title, link]`` pairs.

    Returns:
        pandas.DataFrame with one row per reading that passed URLClass
        validation. Rows hold the raw scraped dict, not the validated model.
        Readings that fail validation are reported on stdout and skipped.
    """
    data_list = []
    driver = initialize_driver()
    try:
        for reading in refresher_readings_list:
            reading_detail = get_reading_detail_data(driver, reading)

            reading_detail['title'] = reading[0]
            reading_detail['link'] = reading[1]
            try:
                # Constructed only for its validation side effect.
                URLClass(**reading_detail)
            except ValidationError as e:
                print(reading[0])
                # If validation fails, print the error message and continue to the next reading
                print(f"Validation error for reading: {str(e)}")
            else:
                data_list.append(reading_detail)
    finally:
        # Close the browser even when a page fails to load or parse.
        driver.quit()
    return pd.DataFrame(data_list)


# def main():
# Step 1: collect [title, link] pairs from the listing page(s).
refresher_readings_list = []
driver = initialize_driver()
url = "https://www.cfainstitute.org/en/membership/professional-development/refresher-readings#first=10&sort=%40refreadingcurriculumyear%20descending"
try:
    driver.get(url)
    close_privacy_warning(driver)
    for page_num in range(1):  # range(1): only the first results page is scraped
        scrape(driver, refresher_readings_list)
        # Keep the driver handle; click_next_button returns None when there is
        # no next page, and the browser must still be closed afterwards.
        if click_next_button(driver) is None:
            break
finally:
    driver.quit()

# Step 2: visit each reading, validate it and persist the accepted rows.
df = scrape_reading_detail(refresher_readings_list)
print(df)
df.to_csv('refresher_readings.csv', index=False)

# if __name__ == "__main__":
#     main()



                                               title  \
0                               Time-Series Analysis   
1                             Credit Analysis Models   
2            Introduction to Alternative Investments   
3                               Credit Default Swaps   
4                     Valuation of Contingent Claims   
5  Introduction to Commodities and Commodity Deri...   
6                    Understanding Income Statements   
7       Pricing and Valuation of Forward Commitments   
8                         Private Equity Investments   
9  Valuation and Analysis of Bonds with Embedded ...   

                              topic published_year     level  \
0              Quantitative Methods           2024  Level II   
1                      Fixed Income           2024  Level II   
2           Alternative Investments           2023   Level I   
3                      Fixed Income           2024  Level II   
4                       Derivatives           2024  Level II   

In [140]:
# Display the DataFrame currently bound to df in the kernel.
df

Unnamed: 0,title,topic,published_year,level,introduction,learning_outcomes,summary,overview,link,year
0,Time-Series Analysis,Quantitative Methods,,Level II,"Introduction\nAs financial analysts, we often ...",The member should be able to:\n\n\n\ncalculate...,The predicted trend value of a time series in ...,,https://www.cfainstitute.org/membership/profes...,2024
1,Credit Analysis Models,Fixed Income,,Level II,Introduction\nCredit analysis plays an importa...,The member should be able to:\n\nexplain expec...,This reading has covered several important top...,,https://www.cfainstitute.org/membership/profes...,2024
2,Introduction to Alternative Investments,Alternative Investments,,Level I,"Introduction\nIn this section, we explain what...",The member should be able to:\n\ndescribe type...,This reading provides a comprehensive introduc...,,https://www.cfainstitute.org/membership/profes...,2023
3,Credit Default Swaps,Fixed Income,,Level II,Introduction\nDerivative instruments in which ...,The member should be able to:\n\ndescribe cred...,A credit default swap (CDS) is a contract betw...,,https://www.cfainstitute.org/membership/profes...,2024
4,Valuation of Contingent Claims,Derivatives,,Level II,Introduction\nA contingent claim is a derivati...,The member should be able to:\n\n\ndescribe an...,This reading on the valuation of contingent cl...,,https://www.cfainstitute.org/membership/profes...,2024
5,Introduction to Commodities and Commodity Deri...,Alternative Investments,,Level II,"Introduction\nIn the upcoming sections, we pre...",The member should be able to:\n\n\ncompare cha...,Commodities are a diverse asset class comprisi...,,https://www.cfainstitute.org/membership/profes...,2024
6,Understanding Income Statements,Financial Reporting and Analysis,,Level I,Introduction\nThe income statement presents in...,The member should be able to:\n\n\ndescribe th...,This reading has presented the elements of inc...,,https://www.cfainstitute.org/membership/profes...,2023
7,Pricing and Valuation of Forward Commitments,Derivatives,,Level II,Introduction\nForward commitments include forw...,The member should be able to:\n\n\ndescribe th...,This reading on forward commitment pricing and...,,https://www.cfainstitute.org/membership/profes...,2024
8,Private Equity Investments,Alternative Investments,,Level II,Introduction\nPrivate equity’s shift from a ni...,The member should be able to:\n\n\nexplain sou...,Private equity funds seek to add value by vari...,,https://www.cfainstitute.org/membership/profes...,2023
9,Valuation and Analysis of Bonds with Embedded ...,Fixed Income,,Level II,"Introduction\nThe valuation of a fixed-rate, o...",The member should be able to:\n\ndescribe fixe...,An embedded option represents a right that can...,,https://www.cfainstitute.org/membership/profes...,2024


In [138]:
# Inspect the collected [title, link] pairs.
# NOTE(review): the recorded output is a NameError, i.e. this cell was run in a
# kernel where the listing cell had not been executed yet.
refresher_readings_list

NameError: name 'refresher_readings_list' is not defined

In [None]:
# Re-run the detail scrape; requires refresher_readings_list to already exist
# in the kernel (it is built by the listing cell).
df = scrape_reading_detail(refresher_readings_list)