# 1. 환경설정

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings('ignore')
import time
import pickle
from tqdm import tqdm

In [6]:
import pickle

# Load the sector mapping used by the crawl below; later cells iterate it
# with .items() as (sector name, iterable of stock codes).
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load 'sector.pickle' from a trusted source.
with open('sector.pickle', 'rb') as sector_file:
    sector = pickle.load(sector_file)

# 2. 일별 종가 데이터 수집 - 10개
키움 API는 구름 IDE에서 실행되지 않아, 네이버 금융을 웹 크롤링하여 일별 시세를 따로 수집한다.

In [7]:
def get_stock_price(code, num_of_pages):
    """Scrape daily price rows for one stock from Naver Finance.

    Fetches up to ``num_of_pages`` pages of the ``sise_day`` listing for
    ``code`` and combines them into a single frame.

    Args:
        code: Stock code inserted into the listing URL's ``code`` query
            parameter.
        num_of_pages: Maximum number of listing pages to download. The
            actual count is capped at the last page advertised by the site.

    Returns:
        A DataFrame with columns ['일자', '거래량', '시가', '고가', '저가',
        '종가'], where '일자' is a datetime and the remaining columns are
        ints. Rows are ordered newest first with a fresh 0..n-1 index. An
        empty frame with those columns is returned when no page is fetched
        (``num_of_pages`` < 1).

    Raises:
        requests.RequestException: If a request fails or exceeds the timeout.
    """
    # Local import keeps this block self-contained; pandas deprecates
    # passing raw HTML strings to read_html, so the text is wrapped.
    from io import StringIO

    # One header set for every request. The original mixed a module-level
    # `headers` global (defined in a later cell) with this same literal.
    request_headers = {'User-agent': 'Mozilla/5.0'}
    request_timeout = 10  # seconds; avoids hanging forever on a stalled connection
    output_columns = ['일자', '거래량', '시가', '고가', '저가', '종가']

    url = f"http://finance.naver.com/item/sise_day.nhn?code={code}"
    first_page = requests.get(url=url, headers=request_headers, timeout=request_timeout)
    bs = BeautifulSoup(first_page.text, 'html.parser')

    # The "last page" link lives in td.pgRR; its href ends with "page=<n>".
    # Fall back to a single page when the cell is absent (presumably when
    # the listing has no further pages — verify against the site).
    pgrr = bs.find("td", class_="pgRR")
    if pgrr is None or pgrr.a is None:
        last_page = 1
    else:
        last_page = int(pgrr.a["href"].split('=')[-1])
    pages = min(last_page, num_of_pages)  # never request more pages than exist

    # Collect per-page tables and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    frames = []
    for page in range(1, pages + 1):
        page_url = '{}&page={}'.format(url, page)
        html = requests.get(page_url, headers=request_headers, timeout=request_timeout).text
        frames.append(pd.read_html(StringIO(html))[0])

    if not frames:
        return pd.DataFrame(columns=output_columns)

    df = pd.concat(frames)
    df['일자'] = pd.to_datetime(df['날짜'])
    df = df.dropna()  # drop rows containing any missing value
    numeric_columns = ['종가', '시가', '고가', '저가', '거래량']
    df[numeric_columns] = df[numeric_columns].astype(int)  # cast to int
    df = df[output_columns]  # fix the column order
    df = df.sort_values(by='일자')  # oldest first ...
    df = df.iloc[::-1].reset_index(drop=True)  # ... then reversed: index 0 is the most recent day
    return df

In [10]:
# Request headers and the number of listing pages to fetch per stock.
headers = {'User-agent': 'Mozilla/5.0'}
pages = 20

# One empty {stock code: DataFrame} container per sector, filled by the crawl.
df_sector_UTD = {sector_name: {} for sector_name in sector}


In [11]:
# Download the latest pages for every stock code of every sector.
for sector_name, code_list in sector.items():
    sector_prices = df_sector_UTD[sector_name]
    for stock_code in tqdm(code_list):
        sector_prices[stock_code] = get_stock_price(stock_code, pages)
        # Short pause between stocks.
        time.sleep(0.01)

100%|██████████| 5/5 [00:11<00:00,  2.39s/it]
100%|██████████| 4/4 [00:09<00:00,  2.40s/it]
100%|██████████| 5/5 [00:12<00:00,  2.41s/it]
100%|██████████| 4/4 [00:09<00:00,  2.43s/it]
100%|██████████| 3/3 [00:07<00:00,  2.40s/it]
100%|██████████| 5/5 [00:12<00:00,  2.58s/it]
100%|██████████| 4/4 [00:09<00:00,  2.50s/it]
100%|██████████| 4/4 [00:09<00:00,  2.43s/it]
100%|██████████| 4/4 [00:09<00:00,  2.47s/it]
100%|██████████| 5/5 [00:12<00:00,  2.40s/it]
100%|██████████| 5/5 [00:12<00:00,  2.50s/it]
100%|██████████| 3/3 [00:07<00:00,  2.50s/it]
100%|██████████| 3/3 [00:07<00:00,  2.43s/it]
100%|██████████| 3/3 [00:07<00:00,  2.37s/it]
100%|██████████| 4/4 [00:09<00:00,  2.45s/it]
100%|██████████| 4/4 [00:09<00:00,  2.43s/it]
100%|██████████| 4/4 [00:09<00:00,  2.40s/it]
100%|██████████| 5/5 [00:12<00:00,  2.42s/it]


# 이전 data와 최신 data 병합(최신화)

In [12]:
# Read the previously saved snapshot so it can be merged with the fresh crawl.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load 'df_sector_UTD.pickle' from a trusted source.
with open('df_sector_UTD.pickle', 'rb') as snapshot_file:
    df_sector = pickle.load(snapshot_file)

In [13]:
# Display the snapshot loaded from 'df_sector_UTD.pickle'.
df_sector

{'Food': {'097950':              일자    거래량      시가      고가      저가      종가
  2999 2010-06-08  50727  211000  213500  209500  211000
  2998 2010-06-09  52375  212000  218000  210500  214000
  2997 2010-06-10  61195  214500  219500  213500  219000
  2996 2010-06-11  89684  217000  220000  211500  214500
  2995 2010-06-14  41941  216000  220000  215500  219000
  ...         ...    ...     ...     ...     ...     ...
  4    2022-07-27  16778  396500  400000  394000  395000
  3    2022-07-28  25944  395500  398000  391000  395000
  2    2022-07-29  27618  397500  397500  389000  391500
  1    2022-08-01  37653  391500  392000  383500  387000
  0    2022-08-02  29305  387000  388000  383000  385500
  
  [3000 rows x 6 columns],
  '271560':              일자      거래량      시가      고가      저가      종가
  1246 2017-07-07  1619864   87100   89700   75700   82300
  1245 2017-07-10   840451   82300   88000   81000   88000
  1244 2017-07-11   422303   88000   88000   82700   82800
  1243 2017-07-12   23

In [14]:
# Merge each stock's freshly crawled rows with its previously saved rows.
for sector_name, saved_frames in df_sector.items():
    for stock_code, saved_df in saved_frames.items():
        # Fresh rows come first, so drop_duplicates (keep='first' by default)
        # prefers them over saved rows that share the same '일자'.
        merged = pd.concat(
            [df_sector_UTD[sector_name][stock_code], saved_df],
            axis=0,
        )
        merged = merged.drop_duplicates(subset=['일자'])
        # Order ascending by the first column and renumber the index from 0.
        df_sector_UTD[sector_name][stock_code] = (
            merged
            .sort_values(by=merged.columns[0], ascending=True)
            .reset_index(drop=True)
        )

In [15]:
# Inspect a single entry: sector 'Food', stock code '097950'.
df_sector_UTD['Food']['097950']

Unnamed: 0,일자,거래량,시가,고가,저가,종가
0,2010-06-08,50727,211000,213500,209500,211000
1,2010-06-09,52375,212000,218000,210500,214000
2,2010-06-10,61195,214500,219500,213500,219000
3,2010-06-11,89684,217000,220000,211500,214500
4,2010-06-14,41941,216000,220000,215500,219000
...,...,...,...,...,...,...
3005,2022-08-10,66726,426500,428500,414500,423500
3006,2022-08-11,66420,425000,425000,417000,421500
3007,2022-08-12,43606,419000,422000,414000,416500
3008,2022-08-16,31340,418000,420500,416500,418500


In [16]:
# Display the whole {sector: {stock code: DataFrame}} structure.
df_sector_UTD

{'Food': {'097950':              일자    거래량      시가      고가      저가      종가
  0    2010-06-08  50727  211000  213500  209500  211000
  1    2010-06-09  52375  212000  218000  210500  214000
  2    2010-06-10  61195  214500  219500  213500  219000
  3    2010-06-11  89684  217000  220000  211500  214500
  4    2010-06-14  41941  216000  220000  215500  219000
  ...         ...    ...     ...     ...     ...     ...
  3005 2022-08-10  66726  426500  428500  414500  423500
  3006 2022-08-11  66420  425000  425000  417000  421500
  3007 2022-08-12  43606  419000  422000  414000  416500
  3008 2022-08-16  31340  418000  420500  416500  418500
  3009 2022-08-17  30523  419500  423000  413500  417000
  
  [3010 rows x 6 columns],
  '271560':              일자      거래량      시가      고가      저가      종가
  0    2017-07-07  1619864   87100   89700   75700   82300
  1    2017-07-10   840451   82300   88000   81000   88000
  2    2017-07-11   422303   88000   88000   82700   82800
  3    2017-07-12   23

In [None]:
# Show rows at positions 2999-3006 of one frame.
# NOTE(review): presumably a check of the boundary between saved and newly
# crawled rows — confirm.
df_sector_UTD['Food']['097950'].iloc[2999:3007]

In [17]:
# Persist the updated data to 'df_sector_UTD.pickle', replacing its contents.
with open('df_sector_UTD.pickle', 'wb') as snapshot_file:
    pickle.dump(df_sector_UTD, snapshot_file)