# Download historical equity data for KRX-listed (KOSPI and KOSDAQ) stocks from Yahoo Finance

In [1]:
import warnings
# Install a global 'ignore' filter: every warning raised after this point is
# suppressed, including deprecation notices from pandas and yfinance.
warnings.filterwarnings('ignore')

In [65]:
from time import time
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import FinanceDataReader as fdr
import yfinance as yf

In [66]:
# Short alias for pandas' MultiIndex slicing helper; used below to select a
# date range across all tickers with .loc.
idx = pd.IndexSlice

In [67]:
# Directory that holds the HDF5 store written further down.
results_path = Path('KR_results', 'asset_pricing')
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing .exists() first.
results_path.mkdir(parents=True, exist_ok=True)

In [68]:
def chunks(l, n):
    """Yield consecutive slices of `l`, each containing at most `n` items.

    The final slice is shorter when len(l) is not a multiple of `n`.
    """
    yield from (l[offset:offset + n] for offset in range(0, len(l), n))

In [69]:
def format_time(t):
    """Return a formatted time string 'HH:MM:SS'
    based on a numeric duration in seconds.

    The duration is rounded to whole seconds *before* it is split into
    fields, so a value such as 59.6 carries into the minutes ('00:01:00')
    instead of rendering as '00:00:60'. Negative durations are shown with a
    leading '-'.
    """
    total = int(round(t))
    sign = '-' if total < 0 else ''
    m, s = divmod(abs(total), 60)
    h, m = divmod(m, 60)
    return f'{sign}{h:0>2d}:{m:0>2d}:{s:0>2d}'

## Get KRX symbols

In [98]:
# Fetch the KRX listing and drop the KONEX market.
listing = fdr.StockListing('KRX')
listing = listing[listing['Market'] != 'KONEX']
# Build the ticker suffix as its own Series instead of assigning a column onto
# the filtered frame (which is a chained-assignment hazard): 'KS' for KOSPI,
# 'KQ' for every other remaining market.
suffix = listing['Market'].eq('KOSPI').map({True: 'KS', False: 'KQ'})
# From here on `krx` is a plain list of '<code>.<suffix>' strings.
krx = (listing['Code'] + '.' + suffix).to_list()
n = len(krx)
print(f'# Symbols: {n:,.0f}')

# Symbols: 2,585


## Download metadata from Yahoo Finance

### KRX symbols

In [100]:
# Build a yfinance Tickers container for all symbols.
# NOTE(review): yf_codes is not referenced again in the visible part of this
# notebook (the loop below creates one yf.Ticker per symbol) — confirm it is
# still needed.
yf_codes = yf.Tickers(krx)

In [101]:
# Collect the get_info() result for every symbol as a one-column DataFrame
# whose column label is the symbol.
meta_data = []
# NOTE(review): `start` is set here but not read anywhere in this cell.
start = time()
for code in tqdm(krx):
    try:
        yf_object = yf.Ticker(code)
        s = pd.Series(yf_object.get_info())
        meta_data.append(s.to_frame(code))
    except Exception as e:
        # Best effort: report the failing symbol and continue with the next.
        print(code, e)

# Number of symbols for which a frame was stored vs. symbols requested.
print(f'Success: {len(meta_data):5,.0f} / {len(krx):5,.0f}')

 62%|██████▏   | 1593/2585 [15:07<07:43,  2.14it/s]

160600.KQ 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/160600.KQ?modules=summaryProfile%2CfinancialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&ssl=true


 77%|███████▋  | 2002/2585 [18:59<04:38,  2.10it/s]

263540.KQ 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/263540.KQ?modules=summaryProfile%2CfinancialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&ssl=true


 77%|███████▋  | 2003/2585 [19:00<04:06,  2.37it/s]

141020.KQ 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/141020.KQ?modules=summaryProfile%2CfinancialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&ssl=true


 82%|████████▏ | 2125/2585 [20:09<03:47,  2.02it/s]

223310.KQ 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/223310.KQ?modules=summaryProfile%2CfinancialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&ssl=true


 97%|█████████▋| 2516/2585 [23:50<00:35,  1.95it/s]

058220.KQ 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/058220.KQ?modules=summaryProfile%2CfinancialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&ssl=true


 98%|█████████▊| 2535/2585 [24:01<00:23,  2.10it/s]

121890.KQ 404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/121890.KQ?modules=summaryProfile%2CfinancialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&ssl=true


100%|██████████| 2585/2585 [24:29<00:00,  1.76it/s]

Success: 2,579 / 2,585





meta_data

In [102]:
def _to_numeric_or_keep(col):
    """Return `col` converted to a numeric dtype, or `col` unchanged if any
    value cannot be parsed."""
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# One column per symbol -> drop info fields that are empty for every symbol,
# then transpose so each row is a symbol and each column an info field.
df = pd.concat(meta_data, axis=1).dropna(how='all').T
# Column-wise numeric conversion; equivalent to the deprecated
# pd.to_numeric(..., errors='ignore') without relying on that option.
df = df.apply(_to_numeric_or_keep)
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2579 entries, 005930.KS to 050540.KQ
Columns: 118 entries, address1 to state
dtypes: float64(89), int64(1), object(28)
memory usage: 2.3+ MB


In [103]:
# Display the output directory before writing to it.
results_path

WindowsPath('KR_results/asset_pricing')

In [104]:
# Store the per-symbol metadata table in the HDF5 file under 'stocks/info'.
# `key` is passed by keyword: positional use is deprecated in recent pandas.
df.to_hdf(results_path / 'data.h5', key='stocks/info')

## Download adjusted price data using yfinance

In [105]:
# Download the full adjusted price history in batches and print a running
# estimate of the remaining time after each batch.
chunk_size = 100
prices_adj = []
done = 0
start = time()
for i, chunk in enumerate(chunks(krx, chunk_size), 1):
    # NOTE(review): stack(-1) presumably moves the ticker level of the
    # downloaded columns into the row index — confirm against the installed
    # yfinance/pandas versions.
    prices_adj.append(yf.download(chunk, period='max', auto_adjust=True).stack(-1))

    # Count the tickers actually requested so far. Assuming every batch holds
    # exactly `chunk_size` symbols overshoots on the final, shorter batch and
    # produced a negative remaining count and ETA.
    done += len(chunk)
    per_ticker = (time() - start) / done
    to_do = n - done
    to_go = to_do * per_ticker
    print(f'Success: {len(prices_adj):5,.0f}/{i:5,.0f} | To go: {format_time(to_go)} ({to_do:5,.0f})')

[*********************100%***********************]  100 of 100 completed
Success:     1/    1 | To go: 00:04:18 (2,485)
[*********************100%***********************]  100 of 100 completed
Success:     2/    2 | To go: 00:03:40 (2,385)
[*********************100%***********************]  100 of 100 completed
Success:     3/    3 | To go: 00:03:21 (2,285)
[*********************100%***********************]  100 of 100 completed
Success:     4/    4 | To go: 00:03:07 (2,185)
[*********************100%***********************]  100 of 100 completed
Success:     5/    5 | To go: 00:02:55 (2,085)
[*********************100%***********************]  100 of 100 completed
Success:     6/    6 | To go: 00:02:45 (1,985)
[*********************100%***********************]  100 of 100 completed
Success:     7/    7 | To go: 00:02:36 (1,885)
[*********************100%***********************]  100 of 100 completed
Success:     8/    8 | To go: 00:02:28 (1,785)
[*********************100%**************

In [106]:
# Merge the per-batch frames into one long table, then tidy it step by step.
prices_adj = pd.concat(prices_adj)
# Remove columns that contain no data at all.
prices_adj = prices_adj.dropna(how='all', axis=1)
# Lower-case the column labels.
prices_adj = prices_adj.rename(columns=str.lower)
# Swap the two row-index levels.
prices_adj = prices_adj.swaplevel()

In [107]:
# Name the two row-index levels; later cells address them as 'ticker' and
# 'date'. Presumably the level order after swaplevel() is (ticker, date) —
# verify against the downloaded frame.
prices_adj.index.names = ['ticker', 'date']

In [108]:
# Number of distinct tickers present in the price table.
len(prices_adj.index.unique('ticker'))

2544

### Remove outliers

In [109]:
# Wide table of close prices: one column per ticker.
# NOTE: this rebinds `df`, which previously held the metadata table.
df = prices_adj.close.unstack('ticker')
# Compute period-over-period returns once and reuse them for both extremes.
returns = df.pct_change()
pmax = returns.max()
pmin = returns.min()
# Flag tickers whose largest return exceeds +100% or whose smallest is below
# -100%.
# NOTE(review): a return below -100% cannot occur for positive prices, so the
# `pmin < -1` test likely never matches — confirm the intended threshold.
to_drop = pmax[pmax > 1].index.union(pmin[pmin < -1].index)
len(to_drop)

183

In [110]:
# Remove every row belonging to a flagged ticker.
prices_adj = prices_adj.drop(to_drop, level='ticker')

In [111]:
# Number of distinct tickers left after dropping the flagged ones.
len(prices_adj.index.unique('ticker'))

2361

In [112]:
# Persist the 1990-2019 slice of the price table under
# 'stocks/prices/adjusted'. The index is sorted first so the label slice on
# the date level works. `key` is passed by keyword: positional use is
# deprecated in recent pandas.
(prices_adj
 .sort_index()
 .loc[idx[:, '1990': '2019'], :]
 .to_hdf(results_path / 'data.h5', key='stocks/prices/adjusted'))