# 1. 환경설정

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings('ignore')
import time
import pickle
from tqdm import tqdm

In [2]:
import pickle
# Load the pickled `sector` object from the working directory. Later cells
# iterate it with `.items()` as a mapping of sector name -> list of stock codes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only open pickles that come from a trusted source.
with open('sector.pickle','rb') as f:
    sector = pickle.load(f)

# 2. 일별 종가 데이터 수집 - 10개
키움 API는 구름IDE에서 실행되지 않아, 네이버 금융 웹 크롤링으로 별도 수집한다.

In [3]:
def get_stock_price(code, num_of_pages):
    """Scrape the daily price listing of one stock from Naver Finance.

    The first listing page is fetched once to read the "last page" link, then
    up to ``num_of_pages`` pages are downloaded and parsed with
    ``pandas.read_html``.

    Args:
        code: Stock code interpolated into the ``sise_day.nhn`` query string.
        num_of_pages: Upper bound on the number of listing pages to download.

    Returns:
        A DataFrame with the columns 일자, 거래량, 시가, 고가, 저가, 종가,
        ordered from the newest date to the oldest with a fresh 0..n-1 index.
        When no page is requested (``num_of_pages`` < 1) an empty DataFrame
        with the same columns is returned.
    """
    from io import StringIO  # local import keeps this cell self-contained

    # One header for every request, so the function does not depend on a
    # module-level `headers` variable having been defined before the call.
    request_headers = {'User-agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block the whole crawl.
    timeout_seconds = 10
    columns = ['일자', '거래량', '시가', '고가', '저가', '종가']
    numeric_columns = ['종가', '시가', '고가', '저가', '거래량']

    url = f"http://finance.naver.com/item/sise_day.nhn?code={code}"
    first_page = requests.get(url=url, headers=request_headers, timeout=timeout_seconds)
    bs = BeautifulSoup(first_page.text, 'html.parser')

    pgrr = bs.find("td", class_="pgRR")
    if pgrr is None or pgrr.a is None:
        # No "last page" link in the pager: assume the listing fits on a
        # single page instead of failing on a None attribute access.
        last_page = 1
    else:
        last_page = int(pgrr.a["href"].split('=')[-1])
    # Never request more pages than exist.
    pages = min(last_page, num_of_pages)

    frames = []
    for page in range(1, pages + 1):
        page_url = '{}&page={}'.format(url, page)
        html = requests.get(page_url, headers=request_headers, timeout=timeout_seconds).text
        # read_html takes a file-like object; passing literal HTML is deprecated.
        frames.append(pd.read_html(StringIO(html))[0])

    if not frames:
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df = pd.concat(frames)
    df['일자'] = pd.to_datetime(df['날짜'])
    # The parsed tables contain spacer rows that are entirely NaN.
    df = df.dropna()
    df[numeric_columns] = df[numeric_columns].astype(int)
    df = df[columns]
    # Sort ascending by date, then flip so the newest row comes first.
    df = df.sort_values(by='일자')
    return df.iloc[::-1].reset_index(drop=True)

In [4]:
# HTTP request header carrying a browser-like User-Agent.
headers = {'User-agent': 'Mozilla/5.0'}
# Number of listing pages to download per stock.
pages = 1
# Result container: one independent, initially empty dict per sector key.
df_sector_UTD = {sector_name: {} for sector_name in sector}


In [5]:
# Download the latest price rows for every stock code of every sector and
# store each DataFrame under df_sector_UTD[sector][code].
for sector_name in sector:
    # One tqdm progress bar per sector.
    for stock_code in tqdm(sector[sector_name]):
        latest_prices = get_stock_price(stock_code, pages)
        df_sector_UTD[sector_name][stock_code] = latest_prices
        # Short pause between consecutive requests.
        time.sleep(0.01)

100%|███████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.74it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.76it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.74it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.75it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.70it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.84it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.76it/s]
100%|███████████████████████████████████████████████████████████████████████████

# 3. 이전 데이터와 최신 데이터 병합 (최신화)

In [6]:
# Load the previously saved price history into `df_sector`. The next cell
# iterates it as sector name -> {stock code: DataFrame}.
# NOTE(review): the file is named 'df_sector_UTD.pickle', presumably the output
# of an earlier run of this notebook; confirm where it is written.
with open('df_sector_UTD.pickle','rb') as f:
    df_sector = pickle.load(f)

In [7]:
# Merge the freshly scraped rows into the previously saved history, stock by
# stock, and store the result back into df_sector_UTD.
for sector_name, old_frames in df_sector.items():
    # setdefault: a sector that exists only in the saved history must not
    # raise KeyError; it simply gets its own bucket.
    fresh_frames = df_sector_UTD.setdefault(sector_name, {})
    for stock_code, old_df in old_frames.items():
        fresh_df = fresh_frames.get(stock_code)
        # Fresh rows go first so that drop_duplicates (which keeps the first
        # occurrence) prefers the newly scraped values for an overlapping date.
        # A stock with no fresh download keeps its saved history unchanged
        # apart from de-duplication and ordering.
        parts = [old_df] if fresh_df is None else [fresh_df, old_df]
        merged = pd.concat(parts, axis=0)
        merged = merged.drop_duplicates(subset=['일자'])
        # Sort by the same named column used for de-duplication rather than
        # by whatever column happens to be first.
        merged = merged.sort_values(by='일자', ascending=True)
        fresh_frames[stock_code] = merged.reset_index(drop=True)

In [8]:
# Display the merged price table stored under sector 'Food', stock code '097950'.
df_sector_UTD['Food']['097950']

Unnamed: 0,일자,거래량,시가,고가,저가,종가
0,2010-06-07,48198,210500,213500,209500,212000
1,2010-06-08,50727,211000,213500,209500,211000
2,2010-06-09,52375,212000,218000,210500,214000
3,2010-06-10,61195,214500,219500,213500,219000
4,2010-06-11,89684,217000,220000,211500,214500
...,...,...,...,...,...,...
3004,2022-08-08,28337,389000,394500,386000,394500
3005,2022-08-09,265521,402000,437000,402000,428500
3006,2022-08-10,66726,426500,428500,414500,423500
3007,2022-08-11,66420,425000,425000,417000,421500


In [9]:
# Display the whole nested result: sector name -> {stock code: DataFrame}.
df_sector_UTD

{'Food': {'097950':              일자     거래량      시가      고가      저가      종가
  0    2010-06-07   48198  210500  213500  209500  212000
  1    2010-06-08   50727  211000  213500  209500  211000
  2    2010-06-09   52375  212000  218000  210500  214000
  3    2010-06-10   61195  214500  219500  213500  219000
  4    2010-06-11   89684  217000  220000  211500  214500
  ...         ...     ...     ...     ...     ...     ...
  3004 2022-08-08   28337  389000  394500  386000  394500
  3005 2022-08-09  265521  402000  437000  402000  428500
  3006 2022-08-10   66726  426500  428500  414500  423500
  3007 2022-08-11   66420  425000  425000  417000  421500
  3008 2022-08-12   43541  419000  422000  414000  416500
  
  [3009 rows x 6 columns],
  '271560':              일자      거래량      시가      고가      저가      종가
  0    2017-07-07  1619864   87100   89700   75700   82300
  1    2017-07-10   840451   82300   88000   81000   88000
  2    2017-07-11   422303   88000   88000   82700   82800
  3    201

In [17]:
# Show the rows at positions 2999..3006 (the iloc end bound is exclusive) of
# the merged 'Food' / '097950' table.
df_sector_UTD['Food']['097950'].iloc[2999:3007]

Unnamed: 0,일자,거래량,시가,고가,저가,종가
2999,2022-08-01,37653,391500,392000,383500,387000
3000,2022-08-02,29317,387000,388000,383000,385500
3001,2022-08-03,30267,385000,387500,383500,387500
3002,2022-08-04,24225,388000,389500,383500,384500
3003,2022-08-05,26817,386500,391000,385000,389000
3004,2022-08-08,28337,389000,394500,386000,394500
3005,2022-08-09,265521,402000,437000,402000,428500
3006,2022-08-10,66726,426500,428500,414500,423500
