In [1]:
import numpy as np
import pandas as pd

from datetime import datetime, date

import pandas_datareader.data as web
from pandas_datareader.yahoo.headers import DEFAULT_HEADERS

import yfinance as yf # for minute data, https://github.com/ranaroussi/yfinance

import requests
import time

from joblib import Parallel, delayed

In [2]:
# Environment sanity check: display the path of the yfinance package the kernel imported.
yf.__file__

'/opt/anaconda3/envs/zipreload/lib/python3.8/site-packages/yfinance/__init__.py'

In [3]:
# Group of Securities
#
# Every list below holds Yahoo! Finance ticker symbols. The lists are combined
# into `securities`, the de-duplicated, order-preserving list of symbols to download.

# portfolio holdings
etf_holdings = ['TQQQ', 'UPRO']
stock_holdings = ['META', 'AAPL', 'PLTR', 'GPRO', 'C']

# trades executed with year
etf_trades = ['TLT', 'SOXL', 'SQQQ', 'DIA', 'USO']
stock_trades = [
    'ABNB', 'ADP', 'AEO', 'AMC', 'AMZN', 'AXP', 'BAC', 'BIIB', 'BTU', 'BX',
    'CGC', 'CHPT', 'CI', 'CLF', 'CLOV', 'CMCSA', 'COST', 'CRM', 'DAL', 'DE',
    'DIS', 'DOCU', 'DOW', 'DVN', 'GME', 'GOOGL', 'HD', 'IBM', 'JNJ', 'KR',
    'LMT', 'M', 'MRK', 'MSFT', 'MU', 'NFLX', 'NIO', 'NKE', 'NVDA', 'OKTA',
    'PANW', 'PEP', 'PFE', 'PINS', 'PYPL', 'RAD', 'RBLX', 'RKT', 'SE', 'SNAP',
    'SNOW', 'SOFI', 'TSLA', 'TSM', 'UAL', 'UNH', 'UPS', 'VZ', 'WFC', 'WMT',
    'XOM', 'ZM',
]

# index etfs for tracking
index_etfs = ['^SPX', '^VIX', 'SPY', 'QQQ', 'IWM', 'LQD', 'UVXY', 'VXX']

# new etfs and stocks under consideration
new_etfs = ['XBI', 'XLK', 'XLY', 'XLF', 'XLE', 'XOP', 'ARKK', 'TBT', 'ARKG', 'SMH']
new_stocks = []

# potential earnings play
earning_stocks = []


# etf_list is used to filter earnings report from TipRanks
etf_list = etf_holdings + etf_trades + index_etfs + new_etfs
stock_list = stock_holdings + stock_trades + new_stocks

master_list = etf_list + stock_list + earning_stocks

# Remove duplicates while keeping first-seen order. dict keys are unique and
# preserve insertion order, so this is a single linear pass instead of the
# quadratic sorted(set(...), key=master_list.index).
securities = list(dict.fromkeys(master_list))

# debug

#securities = set(etf_holdings + stock_holdings)
#securities = set(['AAPL'])

In [4]:
#print(sorted(set(earning_stocks)))

In [5]:
# Get Historical Earnings and price change from Earnings from TipRanks
def get_earnings_hist_from_tipranks(security):
    """Scrape the TipRanks earnings page for ``security`` and save it as CSV.

    Two HTML tables are read from the page: the earnings history table
    (matched on "EPS YoY Change") and the price change table (matched on
    "Price 1 Day Before"). They are left-joined on the report date and written
    to ``data/<security>_earnings_hist_tipranks.csv``.

    Parameters
    ----------
    security : str
        Ticker symbol, inserted into the TipRanks URL.

    Returns
    -------
    bool
        True when a non-empty table was written, False when the download or
        the parsing failed (for example, the page has no matching tables) or
        the merged table is empty.

    Notes
    -----
    The ``data/`` directory is assumed to exist; ``to_csv`` errors are not caught.
    """
    url = 'https://www.tipranks.com/stocks/{}/earnings'
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}

    try:
        # A timeout keeps one stalled connection from blocking its worker thread forever.
        r = requests.get(url.format(security), headers=header, timeout=30)

        # Extract data from Earnings History table
        eps_hist_df = pd.read_html(r.content, match="EPS YoY Change", index_col=None, parse_dates=True)[0]

        eps_df = pd.DataFrame({
            'report_date': pd.to_datetime(eps_hist_df['Report Date']),
            'eps_prev_yr': eps_hist_df["Last Year's EPS"]
        })

        # Each of these source columns packs two values into one string; split them apart.
        eps_df = eps_df.join([
            eps_hist_df['Fiscal Quarter'].str.split(' ', expand=True).rename(columns={0: 'fiscal_yr', 1: 'fiscal_qtr'}),
            eps_hist_df['Forecast / EPS'].str.split('/', expand=True).rename(columns={0: 'eps_forecast', 1: 'eps_actual'}),
            eps_hist_df['EPS YoY Change'].str.split('% ', expand=True).rename(columns={0: 'eps_yoy_pct', 1: 'eps_yoy_chg'})
        ])

        # Keep only the third character of the quarter token and drop the first and
        # last character that wrap the YoY change (presumably "(Q1)" -> "1" and
        # "(+0.12)" -> "+0.12"; verify against the live page).
        eps_df['fiscal_qtr'] = eps_df['fiscal_qtr'].str[2:3]
        eps_df['eps_yoy_chg'] = eps_df['eps_yoy_chg'].str[1:-1]

        columns = ['report_date', 'fiscal_yr', 'fiscal_qtr', 'eps_forecast', 'eps_actual', 'eps_prev_yr', 'eps_yoy_chg', 'eps_yoy_pct']
        eps_df = eps_df.reindex(columns=columns)

        # Extract data from Price Change table
        price_change_df = pd.read_html(r.content, match="Price 1 Day Before", index_col=None, parse_dates=True)[0]

        price_df = pd.DataFrame({
            'report_date': pd.to_datetime(price_change_df['Report Date']),
            # Strip the leading character of the prices and the trailing
            # character of the percentage so they can be parsed as numbers.
            'price_1d_before': price_change_df['Price 1 Day Before'].str[1:],
            'price_1d_after': price_change_df['Price 1 Day After'].str[1:],
            'price_pct_change': price_change_df['Percentage Change'].str[:-1]
        })

        # Merge past earnings and the price change due to earnings
        earnings = eps_df.merge(price_df, on='report_date', how='left')

        # Convert every remaining text column to numbers; unparsable values become NaN.
        text_cols = earnings.select_dtypes(include=['object']).columns
        earnings[text_cols] = earnings[text_cols].apply(pd.to_numeric, errors='coerce')
    except Exception:
        # Best-effort scrape: any download/parsing problem means "no data".
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        earnings = None

    if earnings is None or earnings.empty:
        return False

    # Save to CSV file
    file_name = 'data/{}_earnings_hist_tipranks.csv'.format(security)
    earnings.to_csv(file_name)

    return True

# Get Historical Corporate Actions using pandas datareader
def get_actions_using_pdr(security):
    """Download corporate actions for ``security`` and save them as CSV.

    The data comes from pandas-datareader's ``'yahoo-actions'`` source and is
    written to ``data/<security>_actions.csv``.

    Parameters
    ----------
    security : str
        Ticker symbol.

    Returns
    -------
    bool
        True when a non-empty table was written, False when the download
        failed or returned no rows.
    """
    # Retrieve Historical Corporate Actions from Yahoo! Finance
    try:
        actions = web.DataReader(security, 'yahoo-actions')
    except Exception:
        # Best-effort download. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit through so the run can be cancelled.
        actions = None

    if actions is None or actions.empty:
        return False

    # Save to CSV file
    file_name = 'data/{}_actions.csv'.format(security)
    actions.to_csv(file_name)

    return True
    
# Get Historical Earnings using yfinance
def get_earnings_hist_using_yf(security):
    """Download the yfinance ``earnings_dates`` table for ``security`` and save it as CSV.

    The table is written to ``data/<security>_earnings_hist.csv``.

    Parameters
    ----------
    security : str
        Ticker symbol.

    Returns
    -------
    bool
        True when a non-empty table was written, False when the lookup failed
        or returned no rows.
    """
    try:
        earnings_hist = yf.Ticker(security).earnings_dates
    except Exception:
        # Best-effort download. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit through so the run can be cancelled.
        earnings_hist = None

    if earnings_hist is None or earnings_hist.empty:
        return False

    # Save to CSV file
    file_name = 'data/{}_earnings_hist.csv'.format(security)
    earnings_hist.to_csv(file_name)

    return True

# Get Next Corporate Activity using yfinance
def get_calendar_using_yf(security):
    """Download the yfinance ``calendar`` for ``security`` and save it as CSV.

    The calendar is written to ``data/<security>_calendar.csv``.

    Parameters
    ----------
    security : str
        Ticker symbol.

    Returns
    -------
    bool
        True when a calendar was written, False when the lookup failed or
        yfinance returned ``None``.

    Notes
    -----
    The calendar object is assumed to provide ``to_csv`` (i.e. to be a pandas
    object); confirm against the installed yfinance version.
    """
    try:
        calendar = yf.Ticker(security).calendar
    except Exception:
        # Match the other yfinance helpers: a failed lookup is reported as
        # "no data" instead of aborting the remaining downloads.
        calendar = None

    if calendar is None:
        return False

    # Save to CSV file
    file_name = 'data/{}_calendar.csv'.format(security)
    calendar.to_csv(file_name)

    return True

# Get Option Data using pandas datareader
def get_options_data_using_pdr(security):
    """Download option data for ``security`` and save it as CSV.

    All option data returned by pandas-datareader's ``YahooOptions`` is
    flattened with ``reset_index`` and written to
    ``data/<security>_options_<YYYYMMDD>.csv``, where the date is the latest
    ``Quote_Time`` in the download.

    Parameters
    ----------
    security : str
        Ticker symbol.

    Returns
    -------
    bool
        True when the file was written, False when the download contained no
        rows or no usable quote time.

    Raises
    ------
    Exception
        Errors raised by the download itself are not caught here.
    """
    # Retrieve Options Data from Yahoo! Finance
    # The context manager closes the session's connections once the download is done.
    with requests.Session() as session:
        # Copy the header values into the session instead of rebinding
        # session.headers to the module-level DEFAULT_HEADERS dict, which would
        # then be shared by every session (this function runs in several threads).
        session.headers.clear()
        session.headers.update(DEFAULT_HEADERS)

        options = web.YahooOptions(security, session=session)
        df = options.get_all_data()

    # Flatten the option pricing df and save as CSV
    df = df.reset_index()

    # [TODO:] The prioritization analysis notebook gives error when 'Expiry' field is missing.
    # Include a check for Expiry and redownload option data.
    # Make sure to not download more than 3 times before giving up.

    if df.empty:
        return False

    # find the latest Quote time
    latest_quote = df.Quote_Time.max()
    if pd.isna(latest_quote):
        # No quote time to stamp the file name with.
        return False

    # Save to CSV file
    file_name = 'data/{}_options_{}.csv'.format(security, latest_quote.strftime('%Y%m%d'))
    df.to_csv(file_name, index=False)

    return True

# Get minute level stock price data using yfinance
def get_stock_price_data_using_yf(security, period='1mo', interval='2m'):
    """Download price history for ``security`` with yfinance and save it as CSV.

    The history is requested without corporate actions and without price
    adjustment, sorted chronologically, stripped of rows containing NaN, and
    written to ``data/<security>_<interval>_<YYYYMMDDHHMMSS>.csv``, where the
    timestamp is that of the last row.

    Parameters
    ----------
    security : str
        Ticker symbol, ex: 'AAPL'.
    period : str
        Months/days of data to retrieve, ex: '1mo' for 1 month, '1y' for 1 year.
    interval : str
        Interval between price data, ex: '2m' for 2 minute, '1d' for daily.

    Returns
    -------
    bool
        True when the file was written, False when no rows are left to save.
    """
    df = yf.Ticker(security).history(period=period, interval=interval,
                                     actions=False, auto_adjust=False)

    # Sort on the index itself rather than by its name: the original code
    # expected the name 'Datetime' for '2m' and 'Date' otherwise, which raised
    # KeyError for any other interval whose index is not named 'Date'.
    df = df.sort_index()
    df = df.dropna()

    if df.empty:
        # Nothing to save; df.index[-1] below would raise IndexError.
        return False

    # Save to CSV file
    file_name = 'data/{}_{}_{}.csv'.format(security, interval,
                                           df.index[-1].strftime('%Y%m%d%H%M%S'))
    df.to_csv(file_name)

    return True

# Get daily stock price data using Pandas Datareader from Yahoo! Finance
def get_stock_price_using_pdr(security, period=1):
    """Download daily prices for ``security`` via pandas-datareader and save them as CSV.

    Data from ``period`` years ago up to now is requested from the 'yahoo'
    source, sorted chronologically, stripped of rows containing NaN, and
    written to ``data/<security>_daily_<YYYYMMDD>.csv``, where the date is
    that of the last row.

    Parameters
    ----------
    security : str
        Ticker symbol of stock, ex: 'TQQQ'. '^SPX' is downloaded as '^GSPC'.
    period : int
        Years of data to retrieve, ex: 1 for 1 year.

    Returns
    -------
    bool
        True when the file was written, False when no rows are left to save.
    """
    source = 'yahoo' # Source of data
    end_date = datetime.now()
    try:
        start_date = end_date.replace(year=end_date.year - period)
    except ValueError:
        # Today is Feb 29 and the target year is not a leap year.
        start_date = end_date.replace(year=end_date.year - period, day=28)

    # For ^SPX Futures use ^GSPC for daily data
    symbol = '^GSPC' if security == '^SPX' else security

    # temporary fix when DataReader is not retrieving data from Yahoo!
    yf.pdr_override()

    # The context manager closes the session's connections once the download is done.
    with requests.Session() as session:
        # Copy the header values into the session instead of rebinding
        # session.headers to the shared module-level DEFAULT_HEADERS dict.
        session.headers.clear()
        session.headers.update(DEFAULT_HEADERS)

        df = web.DataReader(symbol, source, start_date, end_date, session=session)

    df = df.sort_index()
    df = df.dropna()

    if df.empty:
        # Nothing to save; df.index[-1] below would raise IndexError.
        return False

    # Change the filename to ^SPX (also when the caller passed '^GSPC' directly,
    # as the original code did).
    file_label = '^SPX' if symbol == '^GSPC' else security

    # Save to CSV file
    file_name = 'data/{}_daily_{}.csv'.format(file_label, df.index[-1].strftime('%Y%m%d'))
    df.to_csv(file_name)

    return True


def process(security):
    """Run every download step for one security and print how long it took.

    The steps are: daily prices (1 year), 2-minute prices (default period),
    option data, calendar, TipRanks earnings history, yfinance earnings
    history, and corporate actions. Each step is isolated, so an exception in
    one of them is reported and the remaining steps still run.

    Parameters
    ----------
    security : str
        Ticker symbol.

    Returns
    -------
    None
    """
    # get_stock_price_using_pdr(security) is an alternative daily-price source
    # that is currently not used.
    steps = [
        ('daily prices', lambda: get_stock_price_data_using_yf(security, period='1y', interval='1d')),
        ('minute prices', lambda: get_stock_price_data_using_yf(security)),
        ('options', lambda: get_options_data_using_pdr(security)),
        ('calendar', lambda: get_calendar_using_yf(security)),
        ('tipranks earnings', lambda: get_earnings_hist_from_tipranks(security)),
        ('earnings history', lambda: get_earnings_hist_using_yf(security)),
        ('actions', lambda: get_actions_using_pdr(security)),
    ]

    t0 = time.time()
    errors = []
    for label, step in steps:
        try:
            step()
        except Exception as exc:
            # Keep going: one failing source should not discard the other
            # downloads for this security or abort the whole parallel batch.
            errors.append('{} ({}: {})'.format(label, type(exc).__name__, exc))

    # Pause after each security (presumably to throttle requests; confirm).
    time.sleep(1)
    t1 = time.time()

    if errors:
        print('{} failed steps: {}'.format(security, '; '.join(errors)))
    print('{} Execution time: {} seconds'.format(security, t1 - t0))

In [6]:
# Run the per-security download pipeline for every symbol and time the whole batch.
t0 = time.time()
print('Download data for {} securities'.format(len(securities)))

# One delayed call per security; n_jobs=-1 lets joblib use all available CPUs,
# and the thread backend is requested (presumably because the work is network-bound).
download_jobs = (delayed(process)(security) for security in securities)
worker_pool = Parallel(n_jobs=-1, prefer='threads')
worker_pool(download_jobs)

t1 = time.time()
exec_time = t1 - t0
print('Total Execution time: {} seconds'.format(exec_time))

Download data for 92 securities
SOXL Execution time: 40.523175954818726 seconds
^VIX Execution time: 43.86344289779663 seconds
TQQQ Execution time: 44.88687300682068 seconds
USO Execution time: 45.20221519470215 seconds
SQQQ Execution time: 48.722651958465576 seconds
DIA Execution time: 49.36047315597534 seconds
UPRO Execution time: 50.00217008590698 seconds
^SPX Execution time: 51.00555920600891 seconds
TLT Execution time: 51.92996573448181 seconds
SPY Execution time: 62.43859910964966 seconds
QQQ Execution time: 42.28964614868164 seconds
UVXY Execution time: 38.74529814720154 seconds
VXX Execution time: 35.47165584564209 seconds
LQD Execution time: 40.310534954071045 seconds
IWM Execution time: 43.045650005340576 seconds
XBI Execution time: 40.159279108047485 seconds
XLF Execution time: 39.202617168426514 seconds
XLK Execution time: 41.53636407852173 seconds
XLY Execution time: 42.17439794540405 seconds
XLE Execution time: 42.46304702758789 seconds
ARKG Execution time: 35.83424401283