# Environment setup

In [1]:
import time
import datetime
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from config import Config
import utils

# Functions

In [2]:
def get_driver(webdriver_browser):
    """Create and return a Selenium WebDriver for the requested backend.

    Parameters
    ----------
    webdriver_browser : str
        Backend selector. Supported values:
        - 'chrome': start a local Chrome driver, using the chromedriver path
          returned by ``ChromeDriverManager().install()``.
        - 'selenium-api': connect to the remote Selenium endpoint whose URL
          is hard-coded below.

    Returns
    -------
    The driver object created by ``webdriver.Chrome`` or ``webdriver.Remote``.

    Raises
    ------
    ValueError
        If ``webdriver_browser`` is not one of the supported values.
        ``ValueError`` is a subclass of ``Exception``, so callers that catch
        ``Exception`` keep working.
    """
    # Start driver
    if webdriver_browser == 'chrome':
        chrome_options = Options()
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )
        # NOTE(review): the implicit wait is configured only for the local
        # Chrome driver; confirm whether the remote driver should get it too.
        driver.implicitly_wait(Config.IMPLICIT_WAIT_TIME)  # seconds to wait

    elif webdriver_browser == 'selenium-api':
        driver = webdriver.Remote(
            "http://selenium_standalone_chrome_financial_news_collectors:4444",
            options=webdriver.ChromeOptions(),
        )

    else:
        # Report the offending value so a misconfiguration is easy to spot.
        raise ValueError(
            'Specified webdriver not implemented. Please check: Config.WEBDRIVER_BROWSER'
            f' (got {webdriver_browser!r})'
        )

    return driver

# Main

In [3]:
# Visit every URL configured in Config.NEWS_SETTINGS and save its HTML.
# `urls` is initialised up front so the name exists even when NEWS_SETTINGS is empty.
urls = []
driver = get_driver(Config.WEBDRIVER_BROWSER)
try:
    for k in Config.NEWS_SETTINGS.keys():

        # Get download path: configured prefix + today's date as YYYYMMDD
        download_path = Config.NEWS_SETTINGS[k]['DOWNLOAD_PATH'] + datetime.datetime.now().strftime('%Y%m%d')

        # Get URLs to download
        urls = Config.NEWS_SETTINGS[k]['URLS']

        # Download URLs
        for i, url in enumerate(urls):
            print(f'Progress: {i+1}/{len(urls)}. URL: {url}')

            # Go to url
            driver.get(url)

            # Wait to load page
            time.sleep(Config.WAIT_PAUSE_TIME)

            # Get HTML content
            html_doc = driver.page_source

            # Save webpage
            ## Create filename: YYYYMMDD_HHMMSS_microseconds.
            ## strftime always emits the microsecond field, whereas str(datetime)
            ## omits it when it is exactly 0, which made filenames inconsistent.
            filename_datetime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
            filename = filename_datetime + '.json'

            ## Save webpage
            utils.save_webpage(url, html_doc, download_path, filename)
finally:
    # Always release the browser, even if a page load or save fails.
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the driver/remote session running.
    driver.quit()

Progress: 1/7. URL: https://www.bbc.com/news
Progress: 2/7. URL: https://www.bbc.com/news/science-environment-56837908
Progress: 3/7. URL: https://www.bbc.com/news/world
Progress: 4/7. URL: https://www.bbc.com/news/business
Progress: 5/7. URL: https://www.bbc.com/news/technology
Progress: 6/7. URL: https://www.bbc.com/news/science_and_environment
Progress: 7/7. URL: https://www.bbc.com/news/stories
