## Download the title, image, summary, and file for each PDF 

In [1]:
import os
import csv
import json
import time
import requests
from selenium import webdriver
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from unidecode import unidecode

In [2]:
# Load environment variables from the .env file one directory above the
# current working directory. crawl() reads STAGE_1_FILENAME from the
# environment (presumably defined in that file -- verify).
load_dotenv("../.env")

True

In [3]:
def download_file(url, filepath, timeout=60):
    """Download a file from the CFA Institute RPC site and save it to disk.

    Args:
        url: Either a path relative to ``https://rpc.cfainstitute.org``
            (for example ``/-/media/report.pdf``) or an absolute URL, which
            is used unchanged.
        filepath: Destination path; the body is written in binary mode.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the server answered with status 200 and the body was written
        to ``filepath``; False otherwise (a message is printed and nothing is
        written).

    Raises:
        requests.RequestException: On connection errors or timeouts. These
            are left to propagate so the caller can treat them as a failure.
    """
    from urllib.parse import urljoin

    # urljoin keeps absolute URLs intact and resolves site-relative paths
    # against the host; plain concatenation would corrupt absolute URLs.
    url = urljoin("https://rpc.cfainstitute.org", url)

    # The context manager releases the streamed connection even when the
    # body is never read (non-200) or writing fails part-way.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download: {url}")
            return False

        with open(filepath, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return True

In [7]:
def download(title, url):
    """Scrape one publication page and save its PDF, cover image and metadata.

    The page is rendered in headless Chrome (it needs JavaScript), parsed
    with BeautifulSoup, and the results are stored in a directory derived
    from ``title`` (spaces become underscores, colons are dropped):
    the PDF, ``cover_image.jpg`` and ``metadata.json``.

    Args:
        title: Publication title; used for the directory and PDF file names.
        url: Address of the publication page to render.

    Returns:
        True if the page was scraped and the PDF, cover image request and
        metadata were processed without error; False if any step raised
        (the exception is printed, not propagated).
    """
    status = False

    # Bound before the try block so the finally clause can tell whether
    # Chrome was actually started; otherwise a failure during driver setup
    # would raise UnboundLocalError from the cleanup code.
    driver = None

    try:
        # We need Javascript rendering to see the content, so ask Selenium
        # to drive Google Chrome in headless mode (no UI).
        options = Options()
        options.add_argument("--headless=new")

        # The matching chromedriver binary is expected in a 'chromedriver'
        # folder next to the current working directory's parent. The binary
        # carries an .exe suffix only on Windows.
        binary_name = "chromedriver.exe" if os.name == "nt" else "chromedriver"
        chromedriver_path = os.path.join(
            os.path.dirname(os.getcwd()), "chromedriver", binary_name
        )
        service = Service(chromedriver_path)
        driver = webdriver.Chrome(options=options, service=service)

        driver.get(url)

        # Sleep for 10 seconds to ensure the page is completely loaded
        time.sleep(10)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract the PDF download url
        pdf_link = soup.find('a', class_="content-asset--primary")
        if pdf_link is None or not pdf_link.get("href"):
            raise ValueError(f"No PDF download link found on: {url}")
        download_url = pdf_link.get("href")

        # Extract the book cover image (query string dropped)
        cover_image = soup.find('img', class_="article-cover")
        if cover_image is None or not cover_image.get("src"):
            raise ValueError(f"No cover image found on: {url}")
        cover_image_url = cover_image.get("src").split('?')[0]

        # Extract the overview (used as summary): text of every <p> inside
        # the article paragraph containers, transliterated to ASCII.
        overview = "".join(
            unidecode(str(para.get_text()).strip().replace("\n", ""))
            for div in soup.find_all('div', class_='article__paragraph')
            for para in div.find_all('p')
        )

        # One sanitised name for both the directory and the PDF. A colon in
        # a file name is invalid on Windows (it addresses an NTFS alternate
        # data stream), so it must be stripped from the PDF name as well.
        safe_title = title.replace(" ", "_").replace(":", "")
        directory = safe_title
        os.makedirs(directory, exist_ok=True)

        # Download the PDF file and make sure it really landed on disk
        # before reporting success.
        pdf_filename = os.path.join(directory, f"{safe_title}.pdf")
        download_file(download_url, pdf_filename)
        if not os.path.isfile(pdf_filename):
            raise RuntimeError(f"PDF was not saved for: {title}")

        # Download the cover image
        cover_image_filename = os.path.join(directory, "cover_image.jpg")
        download_file(cover_image_url, cover_image_filename)

        # Create metadata.json and store relevant details
        metadata = {
            "title"             : title,
            "pdf_filename"      : os.path.basename(pdf_filename),
            "cover_image_url"   : cover_image_url,
            "pdf_download_url"  : download_url,
            "overview"          : overview
        }

        metadata_file = os.path.join(directory, "metadata.json")
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=4)

        print(f"Downloaded and saved content for: {title}")
        status = True

    except Exception as exception:
        print(exception)

    finally:
        # Stop the webdriver, if it was started at all
        if driver is not None:
            driver.quit()

    return status

In [8]:
def crawl():
    """Download every publication listed in the stage-1 CSV file.

    The CSV path is taken from the ``STAGE_1_FILENAME`` environment
    variable. Each row must hold exactly two fields, ``title`` and ``url``;
    any other row is reported and skipped. ``download`` is called once per
    valid row and its outcome is printed.

    Returns:
        None. Errors while opening or reading the CSV are printed rather
        than raised.
    """
    csv_file = os.getenv("STAGE_1_FILENAME")

    # Without this guard open(None) raises a TypeError whose message gives
    # no hint that the environment variable is the problem.
    if not csv_file:
        print("Error occurred: ", "STAGE_1_FILENAME environment variable is not set")
        return

    try:
        # newline="" lets the csv module handle line endings itself, as its
        # documentation requires (matters for quoted fields with newlines).
        with open(csv_file, 'r', newline="") as file:
            for row in csv.reader(file):
                if len(row) != 2:
                    print(f"Skipping invalid row: {row}")
                    continue

                title, url = row
                print(f"Downloading: {title}")

                # Call download for each title and URL
                if download(title, url):
                    print(f"Downloaded: {title}")
                else:
                    print(f"Failed to download: {title}")

    except Exception as exception:
        # Top-level boundary of the notebook: report and stop the crawl
        print("Error occurred: ", exception)

In [9]:
# Run the crawl over every (title, url) row of the CSV named by the
# STAGE_1_FILENAME environment variable.
# NOTE: there is a known issue with the PDF download logic.

crawl()

Downloading: Beyond Active and Passive Investing: The Customization of Finance
Downloaded and saved content for: Beyond Active and Passive Investing: The Customization of Finance
Downloaded: Beyond Active and Passive Investing: The Customization of Finance
Downloading: Investment Model Validation: A Guide for Practitioners
Downloaded and saved content for: Investment Model Validation: A Guide for Practitioners
Downloaded: Investment Model Validation: A Guide for Practitioners
Downloading: The Economics of Private Equity: A Critical Review
Downloaded and saved content for: The Economics of Private Equity: A Critical Review
Downloaded: The Economics of Private Equity: A Critical Review
Downloading: Investment Horizon, Serial Correlation, and Better (Retirement) Portfolios
Downloaded and saved content for: Investment Horizon, Serial Correlation, and Better (Retirement) Portfolios
Downloaded: Investment Horizon, Serial Correlation, and Better (Retirement) Portfolios
Downloading: Valuation 

KeyboardInterrupt: 