In [1]:
import datetime as dt
import io
import time

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser

In [2]:
# Sanity check: decode the `period1` value seen in the website's URL
# into a human-readable UTC date.
ts = 863654400
# Use a timezone-aware conversion; datetime.utcfromtimestamp() is
# deprecated since Python 3.12. The formatted result is identical.
date_found = (
    dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc)
    .strftime('%Y-%m-%d %H:%M:%S')
)
date_found

'1997-05-15 00:00:00'

In [3]:
# Most recent day to request, expressed as a unix timestamp for the
# HTTP request (it is used as `period2` in the URL further down).
d = dt.date(2020,6,6)

# The site's URL uses UTC midnight of the day *after* the requested date.
# The previous approach (time.mktime(...) + 72000, i.e. local midnight plus
# 20 hours) only produced that value on a machine whose local time is UTC-4;
# computing it explicitly in UTC gives the same number in every timezone.
unixtime_start = dt.datetime.combine(
    d + dt.timedelta(days=1), dt.time.min, tzinfo=dt.timezone.utc
).timestamp()
unixtime_start

1591488000.0

In [4]:
# Oldest day to request, expressed as a unix timestamp for the
# HTTP request (it is used as `period1` in the URL further down).
d = dt.date(1997,5,14)

# The site's URL uses UTC midnight of the day *after* the requested date.
# The previous approach (time.mktime(...) + 72000, i.e. local midnight plus
# 20 hours) only produced that value on a machine whose local time is UTC-4;
# computing it explicitly in UTC gives the same number in every timezone.
unixtime_end = dt.datetime.combine(
    d + dt.timedelta(days=1), dt.time.min, tzinfo=dt.timezone.utc
).timestamp()
unixtime_end

863654400.0

In [5]:
# Check your version of the Chrome browser and use the compatible
# chromedriver.exe (this notebook was written with
# 'ChromeDriver 83.0.4103.39'):
# 'https://chromedriver.chromium.org/downloads'
executable_path = {'executable_path':'./chromedriver.exe'}

# JavaScript expression giving the current height of the infinitely
# scrolling container <div id="render-target-default">. Defined once so the
# three script calls below cannot drift apart.
SCROLL_HEIGHT_JS = "document.querySelector('#render-target-default').scrollHeight"

# Use the Browser as a context manager so browser.quit() does not have to
# be called explicitly at the end of the cell.
with Browser('chrome', **executable_path) as browser:
    # URL creation for the HTTP request.
    stock_ticker = "AMZN"
    tab_selection = "history"
    # NOTE: period2 is the newer bound and period1 the older one, so the
    # "start"/"end" in the unixtime_* names refer to where the scrape
    # begins (most recent row) and ends (oldest row).
    period2 = int(unixtime_start)
    period1 = int(unixtime_end)
    interval = "1d"
    # The "filter" value also carries the trailing frequency parameter.
    filter_stocks = "history&frequency=1d"
    # Resulting URL for the values above:
    # 'https://ca.finance.yahoo.com/quote/AMZN/history?period1=863654400&period2=1591488000&interval=1d&filter=history&frequency=1d'
    url = (
        "https://ca.finance.yahoo.com/quote/"
        f"{stock_ticker}/{tab_selection}?period1={period1}"
        f"&period2={period2}&interval={interval}&"
        f"filter={filter_stocks}"
    )
    browser.visit(url)
    # The page is resource intensive; give it a few seconds to render.
    time.sleep(6)

    # The history table is generated dynamically while scrolling, so keep
    # scrolling to the bottom of the container until its height stops
    # growing. Only then is the table present in the page HTML.
    lastHeight = browser.execute_script(f"return {SCROLL_HEIGHT_JS}")
    while True:
        # Scroll to the bottom of the div.
        browser.execute_script(f"window.scrollTo(0, {SCROLL_HEIGHT_JS});")
        time.sleep(1)
        # Re-measure; an unchanged height means nothing more was loaded.
        newHeight = browser.execute_script(f"return {SCROLL_HEIGHT_JS}")
        if newHeight == lastHeight:
            break
        lastHeight = newHeight

    # Parse the rendered HTML table into a pandas DataFrame.
    html = browser.html
    # `soup` is not used in this cell; it is kept available for inspection.
    soup = bs(html, "html.parser")
    # pandas >= 2.1 deprecates passing literal HTML to read_html(); wrapping
    # the markup in StringIO is the supported way to parse an in-memory string.
    df_list = pd.read_html(io.StringIO(html))
    df = df_list[0]

# Show some of the data that was scraped from the HTML table.
print(df.shape)
print(df.columns)
df.head()

(5807, 7)
Index(['Date', 'Open', 'High', 'Low', 'Close*', 'Adj Close**', 'Volume'], dtype='object')


Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,"Jun. 05, 2020",2444.51,2488.65,2437.13,2483.0,2483.0,3304000
1,"Jun. 04, 2020",2477.43,2507.54,2450.01,2460.6,2460.6,2948700
2,"Jun. 03, 2020",2468.01,2488.0,2461.17,2478.4,2478.4,2671000
3,"Jun. 02, 2020",2467.0,2473.53,2445.31,2472.41,2472.41,2529900
4,"Jun. 01, 2020",2448.0,2476.93,2444.17,2471.04,2471.04,2928900
