# Download historical equity data for NASDAQ stocks from yahoo finance

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from time import time
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import FinanceDataReader as fdr
import yfinance as yf

In [3]:
idx = pd.IndexSlice

In [4]:
results_path = Path('KR2_results', 'asset_pricing')
if not results_path.exists():
    results_path.mkdir(parents=True)

In [5]:
def chunks(l, n): 
    for i in range(0, len(l), n):  
        yield l[i:i + n] 

In [6]:
def format_time(t):
    """Return a formatted time string 'HH:MM:SS
    based on a numeric time() value"""
    m, s = divmod(t, 60)
    h, m = divmod(m, 60)
    return f'{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}'

## Get KRX symbols

In [7]:
krx = fdr.StockListing('KRX')
krx = krx[krx["Market"] != "KONEX"]
krx = krx[krx["Market"] != "KOSDAQ"]
krx['Code'] = krx['Code'] + '.' + krx['Market'].apply(lambda x: 'KS')
krx = krx["Code"].to_list()
n = len(krx)
print(f'# Symbols: {n:,.0f}')

# Symbols: 999


## Download metadata from yahoo finance

### NASDAQ symbols

In [8]:
yf_codes = yf.Tickers(krx)

In [9]:
meta_data = []
start = time()
for code in tqdm(krx):
    try:
        yf_object = yf.Ticker(code)
        s = pd.Series(yf_object.get_info())
        meta_data.append(s.to_frame(code))
    except Exception as e:
        print(code, e)

print(f'Success: {len(meta_data):5,.0f} / {len(krx):5,.0f}')

100%|██████████| 999/999 [09:58<00:00,  1.67it/s] 

Success:   999 /   999





meta_data

In [10]:
df = pd.concat(meta_data, axis=1).dropna(how='all').T
df = df.apply(pd.to_numeric, errors='ignore')
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 999 entries, 005930.KS to 001525.KS
Columns: 118 entries, address1 to openInterest
dtypes: float64(89), int64(1), object(28)
memory usage: 928.8+ KB


In [11]:
results_path

WindowsPath('KR2_results/asset_pricing')

In [13]:
df.to_hdf(results_path / 'data.h5', 'stocks/info')

## Download adjusted price data using yfinance

In [14]:
prices_adj = []
start = time()
for i, chunk in enumerate(chunks(krx, 100), 1):
    prices_adj.append(yf.download(chunk, period='max', auto_adjust=True).stack(-1))

    per_ticker = (time()-start) / (i * 100)
    to_do = n - (i * 100)
    to_go = to_do * per_ticker    
    print(f'Success: {len(prices_adj):5,.0f}/{i:5,.0f} | To go: {format_time(to_go)} ({to_do:5,.0f})')

[*********************100%***********************]  100 of 100 completed


6 Failed downloads:
['091990.KS', '263750.KS', '247540.KS', '035900.KS', '022100.KS', '066970.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     1/    1 | To go: 00:02:39 (  899)
[*********************100%***********************]  100 of 100 completed


14 Failed downloads:
['039030.KS', '293490.KS', '036930.KS', '278280.KS', '145020.KS', '214150.KS', '237690.KS', '058470.KS', '056190.KS', '196170.KS', '214450.KS', '035760.KS', '240810.KS', '137400.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     2/    2 | To go: 00:01:34 (  799)
[*********************100%***********************]  100 of 100 completed


16 Failed downloads:
['272290.KS', '348210.KS', '098460.KS', '319660.KS', '195940.KS', '003380.KS', '086450.KS', '084370.KS', '141080.KS', '213420.KS', '166090.KS', '215000.KS', '067310.KS', '183300.KS', '046890.KS', '074600.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     3/    3 | To go: 00:01:08 (  699)
[*********************100%***********************]  100 of 100 completed


8 Failed downloads:
['131290.KS', '091700.KS', '060250.KS', '095610.KS', '243070.KS', '215200.KS', '036830.KS', '084850.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     4/    4 | To go: 00:00:53 (  599)
[*********************100%***********************]  100 of 100 completed


2 Failed downloads:
['267980.KS', '230360.KS']: Exception('%ticker%: No timezone found, symbol may be delisted')



Success:     5/    5 | To go: 00:00:42 (  499)
[*********************100%***********************]  100 of 100 completed
Success:     6/    6 | To go: 00:00:32 (  399)
[*********************100%***********************]  100 of 100 completed
Success:     7/    7 | To go: 00:00:23 (  299)
[*********************100%***********************]  100 of 100 completed
Success:     8/    8 | To go: 00:00:15 (  199)
[*********************100%***********************]  100 of 100 completed
Success:     9/    9 | To go: 00:00:07 (   99)
[*********************100%***********************]  99 of 99 completed
Success:    10/   10 | To go: -1:59:60 (   -1)


In [15]:
prices_adj = (pd.concat(prices_adj)
              .dropna(how='all', axis=1)
              .rename(columns=str.lower)
              .swaplevel())

In [16]:
prices_adj.index.names = ['ticker', 'date']

In [17]:
len(prices_adj.index.unique('ticker'))

953

### Remove outliers

In [18]:
df = prices_adj.close.unstack('ticker')
pmax = df.pct_change().max()
pmin = df.pct_change().min()
to_drop = pmax[pmax > 1].index.union(pmin[pmin<-1].index)
len(to_drop)

69

In [19]:
prices_adj = prices_adj.drop(to_drop, level='ticker')

In [20]:
len(prices_adj.index.unique('ticker'))

884

In [21]:
prices_adj

Unnamed: 0_level_0,Unnamed: 1_level_0,close,high,low,open,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000100.KS,2000-01-04,3359.394287,3469.774594,3081.044475,3081.044475,728643.0
000270.KS,2000-01-04,5074.384766,5074.384766,4832.747257,4901.786545,636300.0
000720.KS,2000-01-04,283844.968750,283844.968750,258453.084474,258453.084474,70240.0
000810.KS,2000-01-04,21973.482422,22570.292212,19640.496653,19640.496653,271254.0
003490.KS,2000-01-04,10754.256836,10754.256836,9751.740900,9751.740900,2251546.0
...,...,...,...,...,...,...
35320K.KS,2023-08-07,10740.000000,10920.000000,10580.000000,10920.000000,10408.0
36328K.KS,2023-08-07,7600.000000,8230.000000,7600.000000,7870.000000,2001.0
38380K.KS,2023-08-07,9680.000000,9890.000000,9610.000000,9610.000000,10025.0
45014K.KS,2023-08-07,7140.000000,7260.000000,7110.000000,7150.000000,2023.0


In [22]:
prices_adj.sort_index().loc[idx[:, '2000': '2023'], :].to_hdf(results_path / 'data.h5', 
                                                              'stocks/prices/adjusted')

In [23]:
prices_adj

Unnamed: 0_level_0,Unnamed: 1_level_0,close,high,low,open,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000100.KS,2000-01-04,3359.394287,3469.774594,3081.044475,3081.044475,728643.0
000270.KS,2000-01-04,5074.384766,5074.384766,4832.747257,4901.786545,636300.0
000720.KS,2000-01-04,283844.968750,283844.968750,258453.084474,258453.084474,70240.0
000810.KS,2000-01-04,21973.482422,22570.292212,19640.496653,19640.496653,271254.0
003490.KS,2000-01-04,10754.256836,10754.256836,9751.740900,9751.740900,2251546.0
...,...,...,...,...,...,...
35320K.KS,2023-08-07,10740.000000,10920.000000,10580.000000,10920.000000,10408.0
36328K.KS,2023-08-07,7600.000000,8230.000000,7600.000000,7870.000000,2001.0
38380K.KS,2023-08-07,9680.000000,9890.000000,9610.000000,9610.000000,10025.0
45014K.KS,2023-08-07,7140.000000,7260.000000,7110.000000,7150.000000,2023.0
