# Ticker quotes: get historical prices, divedends, splits from Yahoo

## The project initialization

In [1]:
import codecs
import csv
import datetime as dt
import numpy as np
import os
import pandas as pd
import urllib.request, urllib.parse, urllib.error

### Set network handlers and global vars

In [2]:
_cookier = urllib.request.HTTPCookieProcessor()
_opener = urllib.request.build_opener(_cookier)
urllib.request.install_opener(_opener)

_cookie = None
_crumb = None

_headers={
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'https://www.google.de/'
}

### Target dates
_Please setup the time frame for your analysis. The start and the end dates should be formated "yyyy-mm-dd"._

In [3]:
start_date = dt.datetime.strptime('2006-01-01', '%Y-%m-%d')
end_date = dt.datetime.strptime('2021-06-30', '%Y-%m-%d') 

### Set paths to data folders

In [4]:
data_raw_path = os.path.join('..', '..', 'data_raw')
data_processing_path = os.path.join('..', '..', 'data_processing')

# Download quotes
_The following chapter will downlad historical prices, data about splits, and dividends for a given list of tickers_

In [5]:
dates_s = pd.date_range(start=start_date, end=end_date)
dates_df = pd.DataFrame({'Date': dates_s, 'Adj Coef': np.ones(len(dates_s))})

### Define functions

<i> The function get_cookie_crumb request a page from finance.yahoo.com with a quote for an orbitrary ticker and set crumb from the response to the global var _crumb together with setting _cookies for further requests</i>

In [6]:
def set_cookie_crumb():
    req = urllib.request.Request('https://finance.yahoo.com/quote/AAPL', headers=_headers)
    f = urllib.request.urlopen(req)
    alines = f.read().decode('utf-8')

    global _crumb
    cs = alines.find('CrumbStore')
    cr = alines.find('crumb', cs + 10)
    cl = alines.find(':', cr + 5)
    q1 = alines.find('"', cl + 1)
    q2 = alines.find('"', q1 + 1)
    crumb = alines[q1 + 1:q2]
    _crumb = codecs.decode(crumb, 'unicode-escape')

    global _cookier, _cookie
    for c in _cookier.cookiejar:
        if c.domain != '.yahoo.com':
            continue
        if c.name != 'B':
            continue
        _cookie = c.value

<i>The get_yahoo_quote function load info for a ticker from Yahoo and returns it as a Panda data frame.</i>
possible info values: 
* 'history',
* 'div',
* 'split'

_use the following dictionary to set correct info type_

In [7]:
_info_types = {
    'History': 'history',
    'Dividend': 'div',
    'Split': 'split'
}

<i>*FYI* get_yahoo_quote will:
- sort data by date ASC; 
- fill "null" in 2 iterations, at first backward and forward after that;
- reset index.
</i>

In [8]:
def get_yahoo_quote(ticker, begindate, enddate, info = 'history', debugurl = False):
    global _cookie, _crumb
    if _cookie == None or _crumb == None:
        set_cookie_crumb()

    params = urllib.parse.urlencode({
        'period1': int(begindate.timestamp()),
        'period2': int(enddate.timestamp()),
        'interval': '1d',
        'events': info,
        'crumb': _crumb
    })
    url = 'https://query1.finance.yahoo.com/v7/finance/download/{}?{}'.format(ticker, params)
    
    if debugurl:
        return url
    
    req = urllib.request.Request(url, headers=_headers)

    f = urllib.request.urlopen(req)
    alines = f.read().decode('utf-8')
    map_not_digints_to_none = lambda x: x and x != 'null' and x or None

    nested_alines = [map(map_not_digints_to_none, line.split(',')) for line in alines.split('\n')[1:]]
    cols = alines.split('\n')[0].split(',')
    adf = pd.DataFrame.from_records(nested_alines[:-1], columns=cols)
    
    adf['Date'] = pd.to_datetime(adf['Date'])
    adf.sort_values('Date', inplace = True)
    adf.reset_index(drop = True, inplace = True)
    
    if info == _info_types['History']:
        adf = pd.merge(dates_df, adf, how='outer', on=['Date'])
    
    adf.set_index('Date', inplace=True)

    adf.fillna(inplace = True, method = 'ffill')
    adf.fillna(inplace = True, method = 'bfill')

    if len(adf[adf.isnull().any(axis=1)]):
        print("Can't fix NaN in %s" % ticker)

    return adf

<i>The save_historical_prices function get historical data for a ticker and save in as CSV.</i>

In [9]:
def save_historical_data(ticker_dict, patht_output_dir, info_type):
    data_result = None
    try:
        data_result = get_yahoo_quote(ticker_dict['ticker_yahoo'], start_date, end_date, info = info_type)
    except Exception as ex:
        print("Error for ticker %s, yahoo code %s : %s" % (ticker_dict['ticker_index'], ticker_dict['ticker_yahoo'], ex))

    if data_result is not None:
        file_result = os.path.join(patht_output_dir, "%s.csv" % ticker_dict['ticker_index'])
        data_result.to_csv(file_result, encoding='utf-8')
    else:
        print("Error for ticker %s, yahoo code %s : Data is empty" % (ticker_dict['ticker_index'], ticker_dict['ticker_yahoo']))

### Get historical data

_set a path to the tickers.csv file._

In [10]:
tickers_file = os.path.join(data_processing_path, 'tickers', 'all_tickers.csv')

#### Get prices ####

_the folloving code will read all tickers from ticker.csv file and download historical prices for every ticker in it_

In [11]:
price_path = os.path.join(data_raw_path, 'prices', 'yahoo')

if not os.path.isdir(price_path):
    os.mkdir(price_path)

with open(tickers_file, 'r') as f_r:
    reader = csv.DictReader(f_r)
    for ticker_dict in reader:
        save_historical_data(ticker_dict, price_path, _info_types['History'])


#### Get dividends ####

_the folloving code will read all tickers from ticker.csv file and download dividends data for every ticker in it_

In [12]:
price_path = os.path.join(data_raw_path, 'prices', 'yahoo_dividends')

if not os.path.isdir(price_path):
    os.mkdir(price_path)

with open(tickers_file, 'r') as f_r:
    reader = csv.DictReader(f_r)
    for ticker_dict in reader:
        save_historical_data(ticker_dict, price_path, _info_types['Dividend'])

#### Get splits ####

_the folloving code will read all tickers from ticker.csv file and download splits data for every ticker in it_

In [13]:
price_path = os.path.join(data_raw_path, 'prices', 'yahoo_splits')

if not os.path.isdir(price_path):
    os.mkdir(price_path)

with open(tickers_file, 'r') as f_r:
    reader = csv.DictReader(f_r)
    for ticker_dict in reader:
        save_historical_data(ticker_dict, price_path, _info_types['Split'])

# Testing section

_use it to test Crumb and Cookies_

In [14]:
set_cookie_crumb()
print("Cookeie: %s" % _cookie)
print("Crumb: %s" % _crumb)

Cookeie: 4185kepge3dni&b=3&s=ed
Crumb: BH7rS9TemmO


_use it to test load url_

In [15]:
get_yahoo_quote('AAPL', start_date, end_date, debugurl = True)

'https://query1.finance.yahoo.com/v7/finance/download/AAPL?period1=1136070000&period2=1625004000&interval=1d&events=history&crumb=BH7rS9TemmO'

_use it to request from yahho for a single ticker_

In [16]:
data = get_yahoo_quote('AAPL', start_date, end_date)
data.head(3)

Unnamed: 0_level_0,Adj Coef,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-01-01,1.0,2.585,2.669643,2.580357,2.669643,2.295634,807234400
2006-01-02,1.0,2.585,2.669643,2.580357,2.669643,2.295634,807234400
2006-01-03,1.0,2.585,2.669643,2.580357,2.669643,2.295634,807234400


In [17]:
data.tail(3)

Unnamed: 0_level_0,Adj Coef,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-06-28,1.0,133.410004,135.25,133.350006,134.779999,134.779999,62111300
2021-06-29,1.0,133.410004,135.25,133.350006,134.779999,134.779999,62111300
2021-06-30,1.0,133.410004,135.25,133.350006,134.779999,134.779999,62111300


_use it to get and save a ticker_

In [18]:
price_path = os.path.join(data_raw_path, 'prices', 'yahoo')
save_historical_data({
    'ticker': 'RI',
    'ticker_index': 'RI_CAC',
    'ticker_yahoo': 'RI.PA',
    'ticker_quandl': 'EURONEXT/RI'
}, price_path, _info_types['History'])