In [None]:
"""
/// @file   Morningstart.ipynb
/// @author Austin Vandegriffe
/// @date   2020-09-17
/// @brief  A webscraper for the Morningstar website. This is an exercise
/// ## in webscraping and the following code is not to be misused. 
/// @style  K&R, and "one true brace style" (OTBS), and '_' variable naming
"""

# Preliminaries

You will need to have Selenium installed. The following demo uses Google's ChromeDriver. You need to:
<ol>
    <li>Open Chrome</li>
    <li>Click on the vertical ellipses in the upper right corner</li>
    <li>Click on settings</li>
    <li>In the left margin menu click "About Chrome"</li>
    <li>In the first box is a version number</li>
    <li>Go to "https://chromedriver.chromium.org/"</li>
    <li>Download the ChromeDriver that matches your version of Chrome</li>
    <li>Place the download in the same directory as this program (I found this mitigates errors)</li>
</ol>
After you have done the above procedure you should be good to proceed.

# Imports

In [None]:
import os
import time

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select

# Current working directory when this cell runs; used below as the browser's download directory
BASE_DIR = os.getcwd()

# Webdriver Setup & Startup

In [None]:
# Directory the browser saves downloads (the exported chart data) into
path = BASE_DIR

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')

# Preferences handed to Chrome through the "prefs" experimental option
profile = {
    # Disable the built-in PDF viewer plugin
    "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
    # Save downloads straight into `path` without showing a prompt
    "download": {
        "prompt_for_download": False,
        "default_directory": path,
    },
    # NOTE(review): the three entries below look like HTTP request settings rather
    ## than Chrome preferences -- confirm whether Chrome honors them at all
    "headers": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    },
    "json": None,
    "method": "GET",
}

# Set the profile
options.add_experimental_option("prefs", profile)

# Create a browser instance
driver = webdriver.Chrome(options=options)

# The cells below expect a maximized window; in a small window some elements
## (such as the search bar) may be hidden by the page's JavaScript
driver.maximize_window()

# Element lookups keep retrying for up to 3 seconds before giving up
driver.implicitly_wait(3)

# Morningstar

For the following to work, the webdriver must be maximized. Some HTML elements may be hidden due to JavaScript adjustments (such as a suppressed search bar).

### Go to the Morningstar website

In [None]:
# Load the Morningstar landing page in the controlled browser
MORNINGSTAR_URL = "https://www.morningstar.com/"
driver.get(MORNINGSTAR_URL)

### Search for the ticker you want

In [None]:
# Click the site search widget first, then look up the search input below
search_widget = driver.find_element(By.CLASS_NAME, "mdc-site-search")
search_widget.click()

# Click on the search input so that it can receive keystrokes
search_input = driver.find_element(By.XPATH, '//*[@placeholder="Search Quotes and Site"]')
search_input.click()

# Type the ticker symbol and submit the search; "AAPL" is Apple Inc.'s ticker
ticker = "AAPL"
search_input.send_keys(ticker)
search_input.submit()

# Pause while the results page loads
time.sleep(3)

### Get the first search result

In [None]:
# There is a <div> header for the search results; however, it does not immediately have an <a>
## tag like the search result tags, so take the first div with an immediate <a> child, i.e.
## <section>/<div>/<a>
first_result = driver.find_element(By.XPATH, '//section[@class="search-all__section"]/div/a')
first_result.click()

### Close any popups

In [None]:
# Check for popups. The particular popup I was exposed to has a "Dismiss" in the corner, so
## click every <span> whose text contains "Dismiss". When there is no popup the lookup
## returns an empty list and nothing is clicked.
for dismiss_button in driver.find_elements(By.XPATH, '//span[contains(text(), "Dismiss")]'):
    dismiss_button.click()

### Open the chart widget

In [None]:
# Display the full chart
chart_link = driver.find_element(By.XPATH, '//*[@class="chart-iframe-full-chart-label"]')
chart_link.click()

# The chart takes a second to load, so sleep for a few seconds; this is also good practice
## as it slows the crawler down a bit, making it less suspicious.
time.sleep(5)

### Select start date

In [None]:
# Open the start date dropdown
start_date_menu = driver.find_element(By.XPATH, '//div[@data-menuid="mkts-cmpt-svgcht-start-date"]')
start_date_menu.click()

# Pause
time.sleep(0.5)

In [None]:
# Click the date picker label to display the month/year selection
picker_label = driver.find_element(By.XPATH, '//div[@class="mkts-cmpt-datepicker-label"]')
picker_label.click()

# Pause
time.sleep(0.5)

In [None]:
# Scroll year of interest into view
year_of_interest = 1997
year_panel_xpath = '//ul[@class="mkts-cmpt-datepicker-select mkts-cmpt-datepicker-select-year mkts-cmpt-datepicker-content-panel"]'

# Give up after this many seconds instead of looping forever when the year never shows up
scroll_timeout = 30
deadline = time.monotonic() + scroll_timeout

dropdown_years = set()
while str(year_of_interest) not in dropdown_years:
    if time.monotonic() > deadline:
        raise RuntimeError(
            f"Year {year_of_interest} did not appear in the date picker within {scroll_timeout} seconds"
        )
    # Scroll the year list to its top (presumably this makes the widget add earlier
    ## years -- TODO confirm), then record every year currently listed
    year_panel = driver.find_element(By.XPATH, year_panel_xpath)
    driver.execute_script('arguments[0].focus(); arguments[0].scrollTop = 0;', year_panel)
    year_items = driver.find_elements(By.XPATH, year_panel_xpath + '/li')
    dropdown_years.update(item.text for item in year_items)

In [None]:
# Select the year of interest to display the months
year_item = driver.find_element(By.XPATH, f'//ul[@class="mkts-cmpt-datepicker-select mkts-cmpt-datepicker-select-year mkts-cmpt-datepicker-content-panel"]/li[@data-year="{year_of_interest}"]')
year_item.click()

# Pause
time.sleep(1)

In [None]:
# Select the numeric month of interest, i.e. January = 1 and December = 12
month_of_interest = 9
# The month entries are looked up by a ZERO indexed data-month attribute, so take 1 away
## from month_of_interest to select the appropriate month
month_item = driver.find_element(By.XPATH, f'//ul[@class="mkts-cmpt-datepicker-select mkts-cmpt-datepicker-select-month"]/li[@data-month="{month_of_interest - 1}"]')
month_item.click()

# Pause
time.sleep(1)

In [None]:
# Select the day of interest
day_of_interest = 30
day_cells = driver.find_elements(By.CLASS_NAME, 'mkts-cmpt-datepicker-day')

# The FULL weeks are displayed, so if the first is on a Wednesday, the ending days of
## the previous month are displayed for Sunday-Tuesday. Skip those leading cells so that
## a day such as the 30th cannot be matched against the previous month.
first_of_month = 0
while int(day_cells[first_of_month].text) != 1:
    first_of_month += 1
day_cells = day_cells[first_of_month:]

# Likewise drop the trailing cells that belong to the next month: every month has at
## least 28 days, so a final cell with day/10 < 2 cannot be part of the selected month.
while int(day_cells[-1].text) / 10 < 2:
    day_cells.pop()

# Only cells of the selected month remain, so an impossible day (e.g. the 31st of a
## 30-day month) is reported instead of clicking a cell from another month
if not 1 <= day_of_interest <= len(day_cells):
    raise ValueError(
        f"Day {day_of_interest} is out of range; the selected month has {len(day_cells)} days"
    )

# The list is zero indexed, so subtract 1 from the day of interest
day_cells[day_of_interest - 1].click()

# Pause
time.sleep(3)

### Select end date

In [None]:
# Open the end date dropdown
end_date_menu = driver.find_element(By.XPATH, '//div[@data-menuid="mkts-cmpt-svgcht-end-date"]')
end_date_menu.click()

# Pause
time.sleep(0.5)

In [None]:
# Click the date picker label to display the month/year selection
picker_label = driver.find_element(By.XPATH, '//div[@class="mkts-cmpt-datepicker-label"]')
picker_label.click()

# Pause
time.sleep(0.5)

In [None]:
# Scroll year of interest into view
year_of_interest = 1998
year_panel_xpath = '//ul[@class="mkts-cmpt-datepicker-select mkts-cmpt-datepicker-select-year mkts-cmpt-datepicker-content-panel"]'

# Give up after this many seconds instead of looping forever when the year never shows up
scroll_timeout = 30
deadline = time.monotonic() + scroll_timeout

dropdown_years = set()
while str(year_of_interest) not in dropdown_years:
    if time.monotonic() > deadline:
        raise RuntimeError(
            f"Year {year_of_interest} did not appear in the date picker within {scroll_timeout} seconds"
        )
    # Scroll the year list to its top (presumably this makes the widget add earlier
    ## years -- TODO confirm), then record every year currently listed
    year_panel = driver.find_element(By.XPATH, year_panel_xpath)
    driver.execute_script('arguments[0].focus(); arguments[0].scrollTop = 0;', year_panel)
    year_items = driver.find_elements(By.XPATH, year_panel_xpath + '/li')
    dropdown_years.update(item.text for item in year_items)

In [None]:
# Select the year of interest to display the months
year_item = driver.find_element(By.XPATH, f'//ul[@class="mkts-cmpt-datepicker-select mkts-cmpt-datepicker-select-year mkts-cmpt-datepicker-content-panel"]/li[@data-year="{year_of_interest}"]')
year_item.click()

# Pause
time.sleep(1)

In [None]:
# Select the numeric month of interest, i.e. January = 1 and December = 12
month_of_interest = 9
# The month entries are looked up by a ZERO indexed data-month attribute, so take 1 away
## from month_of_interest to select the appropriate month
month_item = driver.find_element(By.XPATH, f'//ul[@class="mkts-cmpt-datepicker-select mkts-cmpt-datepicker-select-month"]/li[@data-month="{month_of_interest - 1}"]')
month_item.click()

# Pause
time.sleep(1)

In [None]:
# Select the day of interest
day_of_interest = 30
day_cells = driver.find_elements(By.CLASS_NAME, 'mkts-cmpt-datepicker-day')

# The FULL weeks are displayed, so if the first is on a Wednesday, the ending days of
## the previous month are displayed for Sunday-Tuesday. Skip those leading cells so that
## a day such as the 30th cannot be matched against the previous month.
first_of_month = 0
while int(day_cells[first_of_month].text) != 1:
    first_of_month += 1
day_cells = day_cells[first_of_month:]

# Likewise drop the trailing cells that belong to the next month: every month has at
## least 28 days, so a final cell with day/10 < 2 cannot be part of the selected month.
while int(day_cells[-1].text) / 10 < 2:
    day_cells.pop()

# Only cells of the selected month remain, so an impossible day (e.g. the 31st of a
## 30-day month) is reported instead of clicking a cell from another month
if not 1 <= day_of_interest <= len(day_cells):
    raise ValueError(
        f"Day {day_of_interest} is out of range; the selected month has {len(day_cells)} days"
    )

# The list is zero indexed, so subtract 1 from the day of interest
day_cells[day_of_interest - 1].click()

# Pause
time.sleep(3)

### Prepare data for download

In [None]:
# Rather than specifying a specific date, you can download the previous
## 'one-day', 'five-day', 'fifteen-day', 'one-month', 'three-month', 'six-month'
## 'y-t-d', 'one-year', 'three-year', 'five-year', 'ten-year','max'
## To use one of these presets, uncomment the two lines below and put the preset
## name in the data-menuid value.
# preset = driver.find_element(By.XPATH, '//*[@data-menuid="one-day"]')
# preset.click()

# Open the coarseness (sampling frequency) dropdown
frequency_menu = driver.find_element(By.XPATH, '//div[@class="mkts-cmpt-svgcht-freq-container"]')
frequency_menu.click()

# Select coarseness
## 'd'=day; 'w'=week; 'm'=month
## NOTE: for the most recent previous 3 months you can get 10,15,30, and 60 minute data
## along with daily, weekly, and monthly. For the most recent 15 days you can get 5 minute data
## along with all the prior ones. For the most recent 5 days (including weekends) you can get 1 minute data
frequency_item = driver.find_element(By.XPATH, '//ul[@class="ul-freq-dropdown-list"]/li[@data-value="d"]/span')
frequency_item.click()

# Download the data
export_button = driver.find_element(By.XPATH, '//span[@data-menuid="mkts-cmpt-svgcht-menuicn--export"]')
export_button.click()

# Closing Remarks

The above code can be put in a loop or be made to run in parallel. However, if you are scraping a website, make sure you are kind to them. Place pauses (as I have) to slow down the scraper so that you are not bombarding the site with requests.