# Download historical equity data for KRX stocks from Yahoo Finance

In [1]:
import warnings
# Suppress every warning (all categories, all modules) for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [2]:
from time import time
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import FinanceDataReader as fdr
import yfinance as yf

In [3]:
# Shorthand for building MultiIndex slices inside .loc (used when the prices are saved).
idx = pd.IndexSlice

In [4]:
# Directory that receives the HDF5 store written by the later cells.
results_path = Path('KR2_results', 'asset_pricing')
# exist_ok=True keeps the cell safe to re-run and removes the check-then-create race.
results_path.mkdir(parents=True, exist_ok=True)

In [5]:
def chunks(l, n):
    """Yield consecutive slices of `l`, each holding at most `n` items.

    The final slice is shorter when len(l) is not a multiple of `n`.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [6]:
def format_time(t):
    """Return a formatted time string 'HH:MM:SS'
    based on a numeric time() value.

    The value is rounded to whole seconds before it is split into fields, so
    a fractional input can no longer render as 60 seconds (59.6 -> '00:01:00').
    Negative durations are formatted by magnitude with a leading '-'.
    """
    total = int(round(t))
    sign = '-' if total < 0 else ''
    m, s = divmod(abs(total), 60)
    h, m = divmod(m, 60)
    return f'{sign}{h:02d}:{m:02d}:{s:02d}'

## Get KRX symbols

In [7]:
# Full KRX listing; each row carries the stock 'Code' and the 'Market' it trades on.
listing = fdr.StockListing('KRX')
# Keep KOSPI only, because every symbol receives Yahoo's '.KS' suffix below.
# Excluding just the exact labels 'KONEX' and 'KOSDAQ' lets any other market
# label through, and those tickers then fail on Yahoo under '.KS'
# ("No timezone found, symbol may be delisted").
# NOTE(review): assumes the main-board label is exactly 'KOSPI' — confirm against the listing.
kospi_codes = listing.loc[listing['Market'] == 'KOSPI', 'Code']
# Build the Yahoo tickers directly instead of assigning into a filtered slice.
krx = (kospi_codes + '.KS').to_list()
n = len(krx)
print(f'# Symbols: {n:,.0f}')

# Symbols: 998


## Download metadata from yahoo finance

### KRX symbols

In [8]:
# Multi-ticker handle covering the whole symbol list.
# NOTE(review): `yf_codes` is not referenced by any cell visible here; the metadata loop
# builds one yf.Ticker per code instead — confirm whether this cell is still needed.
yf_codes = yf.Tickers(krx)

In [9]:
# Collect Yahoo's info dictionary for every symbol as a one-column frame named
# after the ticker; failures are reported and skipped.
meta_data = []
start = time()
for symbol in tqdm(krx):
    try:
        info = yf.Ticker(symbol).get_info()
        meta_data.append(pd.Series(info).to_frame(symbol))
    except Exception as err:
        # Best effort: print the failing symbol with the error and move on.
        print(symbol, err)

print(f'Success: {len(meta_data):5,.0f} / {len(krx):5,.0f}')

  0%|          | 0/998 [00:00<?, ?it/s]

100%|██████████| 998/998 [10:10<00:00,  1.63it/s]  

Success:   998 /   998





meta_data

In [10]:
def _to_numeric_if_possible(col):
    """Return `col` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# One row per ticker, one column per info field; fields empty for every ticker are dropped.
df = pd.concat(meta_data, axis=1).dropna(how='all').T
# Explicit replacement for the deprecated pd.to_numeric(errors='ignore'):
# columns that fail to parse are kept as they are.
df = df.apply(_to_numeric_if_possible)
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 998 entries, 005930.KS to 001525.KS
Columns: 117 entries, address1 to openInterest
dtypes: float64(89), int64(1), object(27)
memory usage: 920.0+ KB


In [11]:
# Display the output directory that the HDF5 store is written to.
results_path

WindowsPath('KR2_results/asset_pricing')

In [12]:
# Store the per-ticker metadata; `key` is passed by keyword because positional
# use of it is deprecated in recent pandas releases.
df.to_hdf(results_path / 'data.h5', key='stocks/info')

## Download adjusted price data using yfinance

In [13]:
# Download adjusted price history in batches and keep one stacked frame per batch.
chunk_size = 100
total = len(krx)
prices_adj = []
done = 0  # tickers requested so far; the final chunk may hold fewer than chunk_size
start = time()
for i, chunk in enumerate(chunks(krx, chunk_size), 1):
    # stack(-1) moves the innermost column level into the row index.
    prices_adj.append(yf.download(chunk, period='max', auto_adjust=True).stack(-1))

    # Progress is based on the actual number of tickers processed, so the
    # remaining count ends at 0 instead of going negative on a short last chunk.
    done += len(chunk)
    per_ticker = (time() - start) / done
    to_do = total - done
    to_go = to_do * per_ticker
    print(f'Success: {len(prices_adj):5,.0f}/{i:5,.0f} | To go: {format_time(to_go)} ({to_do:5,.0f})')

[*********************100%***********************]  100 of 100 completed


6 Failed downloads:
['247540.KS', '022100.KS', '035900.KS', '263750.KS', '066970.KS', '091990.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     1/    1 | To go: 00:00:54 (  898)


Failed to get ticker '051600.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '267260.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[                       0%                       ]

Failed to get ticker '000120.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[*                      2%                       ]  2 of 100 completed

Failed to get ticker '001040.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[**                     4%                       ]  4 of 100 completed

Failed to get ticker '137310.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '000150.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[**                     5%                       ]  5 of 100 completed

Failed to get ticker '093370.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '195870.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[****                   8%                       ]  8 of 100 completed

Failed to get ticker '039030.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '457190.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[****                   9%                       ]  9 of 100 completed

Failed to get ticker '004000.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '003410.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[******                12%                       ]  12 of 100 completed

Failed to get ticker '042670.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '001440.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[******                13%                       ]  13 of 100 completed

Failed to get ticker '016360.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[*******               14%                       ]  14 of 100 completed

Failed to get ticker '010120.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '282330.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[*******               15%                       ]  15 of 100 completed

Failed to get ticker '058470.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '012750.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)
Failed to get ticker '006260.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[********              17%                       ]  17 of 100 completed

Failed to get ticker '139130.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[*********             19%                       ]  19 of 100 completed

Failed to get ticker '298040.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[**********            20%                       ]  20 of 100 completed

Failed to get ticker '139480.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[**********            21%                       ]  21 of 100 completed

Failed to get ticker '088350.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[***********           22%                       ]  22 of 100 completed

Failed to get ticker '000990.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[***********           23%                       ]  23 of 100 completed

Failed to get ticker '237690.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[************          24%                       ]  24 of 100 completed

Failed to get ticker '002380.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[************          25%                       ]  25 of 100 completed

Failed to get ticker '007310.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[************          26%                       ]  26 of 100 completed

Failed to get ticker '017800.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[************          26%                       ]  26 of 100 completed

Failed to get ticker '035760.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[*************         28%                       ]  28 of 100 completed

Failed to get ticker '006110.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[**************        29%                       ]  29 of 100 completed

Failed to get ticker '026960.KS' reason: HTTPSConnectionPool(host='query2.finance.yahoo.com', port=443): Read timed out. (read timeout=10)


[*********************100%***********************]  100 of 100 completed


41 Failed downloads:
['051600.KS', '267260.KS', '000120.KS', '001040.KS', '137310.KS', '000150.KS', '093370.KS', '195870.KS', '039030.KS', '457190.KS', '004000.KS', '003410.KS', '042670.KS', '001440.KS', '016360.KS', '010120.KS', '282330.KS', '058470.KS', '012750.KS', '006260.KS', '139130.KS', '298040.KS', '139480.KS', '088350.KS', '000990.KS', '237690.KS', '002380.KS', '007310.KS', '017800.KS', '035760.KS', '006110.KS', '026960.KS', '278280.KS', '137400.KS', '196170.KS', '056190.KS', '214450.KS', '145020.KS', '240810.KS', '214150.KS', '293490.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     2/    2 | To go: 00:02:39 (  798)
[*********************100%***********************]  100 of 100 completed


16 Failed downloads:
['067310.KS', '213420.KS', '215000.KS', '183300.KS', '348210.KS', '141080.KS', '098460.KS', '319660.KS', '166090.KS', '084370.KS', '195940.KS', '003380.KS', '074600.KS', '272290.KS', '036930.KS', '046890.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     3/    3 | To go: 00:01:43 (  698)
[*********************100%***********************]  100 of 100 completed


9 Failed downloads:
['215200.KS', '243070.KS', '036830.KS', '091700.KS', '095610.KS', '060250.KS', '131290.KS', '084850.KS', '086450.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     4/    4 | To go: 00:01:14 (  598)
[*********************100%***********************]  100 of 100 completed


2 Failed downloads:





['230360.KS', '267980.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')


Success:     5/    5 | To go: 00:00:55 (  498)
[*********************100%***********************]  100 of 100 completed
Success:     6/    6 | To go: 00:00:40 (  398)
[*********************100%***********************]  100 of 100 completed
Success:     7/    7 | To go: 00:00:27 (  298)
[*********************100%***********************]  100 of 100 completed
Success:     8/    8 | To go: 00:00:17 (  198)
[*********************100%***********************]  100 of 100 completed
Success:     9/    9 | To go: 00:00:08 (   98)
[*********************100%***********************]  98 of 98 completed
Success:    10/   10 | To go: -1:59:60 (   -2)


In [14]:
# Combine the per-chunk frames and discard columns that are entirely NaN.
prices_adj = pd.concat(prices_adj).dropna(axis=1, how='all')
# Lower-case the column labels, then swap the two innermost index levels.
prices_adj = prices_adj.rename(columns=str.lower).swaplevel()

In [15]:
# Name the two index levels so later cells can address them as 'ticker' and 'date'.
prices_adj.index.names = ['ticker', 'date']

In [16]:
# Number of distinct tickers present in the combined price frame.
len(prices_adj.index.unique('ticker'))

924

### Remove outliers

In [17]:
# Wide frame of closing prices: one column per ticker, one row per date.
df = prices_adj.close.unstack('ticker')
# Forward-fill gaps explicitly (what pct_change used to do implicitly by default)
# and compute the row-over-row returns once instead of twice.
returns = df.ffill().pct_change(fill_method=None)
pmax = returns.max()
pmin = returns.min()
# Flag tickers whose largest return exceeds +100%.
# NOTE(review): a return below -1 cannot occur for positive prices, so the second
# condition presumably never matches — confirm the intended lower threshold.
to_drop = pmax[pmax > 1].index.union(pmin[pmin < -1].index)
len(to_drop)

76

In [18]:
# Remove every row belonging to a flagged ticker (matched on the 'ticker' index level).
prices_adj = prices_adj.drop(to_drop, level='ticker')

In [19]:
# Number of distinct tickers left after the flagged ones were dropped.
len(prices_adj.index.unique('ticker'))

848

In [20]:
# Display the filtered price frame (index: ticker, date).
prices_adj

Unnamed: 0_level_0,Unnamed: 1_level_0,close,high,low,open,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000100.KS,2000-01-04,3359.393799,3469.774090,3081.044028,3081.044028,728643.0
000270.KS,2000-01-04,5074.385254,5074.385254,4832.747722,4901.787017,636300.0
000720.KS,2000-01-04,283844.937500,283844.937500,258453.056019,258453.056019,70240.0
000810.KS,2000-01-04,21973.480469,22570.290206,19640.494907,19640.494907,271254.0
003490.KS,2000-01-04,10754.258789,10754.258789,9751.742671,9751.742671,2251546.0
...,...,...,...,...,...,...
36328K.KS,2023-07-24,8440.000000,8500.000000,8360.000000,8430.000000,2437.0
37550K.KS,2023-07-24,19020.000000,19370.000000,19010.000000,19370.000000,1593.0
38380K.KS,2023-07-24,9710.000000,9870.000000,9700.000000,9810.000000,2907.0
45014K.KS,2023-07-24,7470.000000,7800.000000,7350.000000,7800.000000,4008.0


In [21]:
# Persist the 2000-2023 window. sort_index() runs first so the slice on the date
# level operates on a sorted MultiIndex; `key` is passed by keyword because
# positional use of it is deprecated in recent pandas releases.
(prices_adj
 .sort_index()
 .loc[idx[:, '2000':'2023'], :]
 .to_hdf(results_path / 'data.h5', key='stocks/prices/adjusted'))

In [22]:
# Display the in-memory price frame again; the save above does not modify `prices_adj`.
prices_adj

Unnamed: 0_level_0,Unnamed: 1_level_0,close,high,low,open,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000100.KS,2000-01-04,3359.393799,3469.774090,3081.044028,3081.044028,728643.0
000270.KS,2000-01-04,5074.385254,5074.385254,4832.747722,4901.787017,636300.0
000720.KS,2000-01-04,283844.937500,283844.937500,258453.056019,258453.056019,70240.0
000810.KS,2000-01-04,21973.480469,22570.290206,19640.494907,19640.494907,271254.0
003490.KS,2000-01-04,10754.258789,10754.258789,9751.742671,9751.742671,2251546.0
...,...,...,...,...,...,...
36328K.KS,2023-07-24,8440.000000,8500.000000,8360.000000,8430.000000,2437.0
37550K.KS,2023-07-24,19020.000000,19370.000000,19010.000000,19370.000000,1593.0
38380K.KS,2023-07-24,9710.000000,9870.000000,9700.000000,9810.000000,2907.0
45014K.KS,2023-07-24,7470.000000,7800.000000,7350.000000,7800.000000,4008.0
