# 1. 환경설정

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings('ignore')
import time
import pickle
from tqdm import tqdm

In [2]:
import pickle

# Load the sector-name -> list-of-stock-codes mapping saved earlier.
# NOTE: pickle.load executes arbitrary code from the file; only open
# pickles that were produced by this project.
with open('sector.pickle', 'rb') as sector_file:
    sector = pickle.load(sector_file)

In [3]:
# Display the loaded sector -> stock-code mapping. (The cell previously
# referenced `df`, which is not defined at this point of the notebook; the
# recorded output below is the `sector` dictionary.)
sector

{'Food': ['097950', '271560', '000080', '004370', '005300'],
 'Clothing': ['093050', '020000', '105630', '001070'],
 'Chemical': ['051910', '096770', '010950', '051900', '090430'],
 'Medicine': ['207940', '068270', '000100', '128940'],
 'Non_Metal': ['003670', '003410', '010780'],
 'Metal': ['005490', '010130', '004020', '016380', '001230'],
 'Machine': ['034020', '018880', '241560', '112610'],
 'Electronic': ['005930', '000660', '006400', '066570'],
 'Construction': ['000720', '006360', '047040', '051600'],
 'Transport': ['011200', '003490', '086280', '180640', '028670'],
 'Distribution': ['028260', '023530', '282330', '139480', '004170'],
 'Power': ['015760', '036460', '017390'],
 'Tele': ['017670', '030200', '032640'],
 'Finance': ['003550', '000810', '006800'],
 'Brokerage': ['005940', '016360', '008560', '039490'],
 'Insurer': ['032830', '005830', '000060', '001450'],
 'Service': ['035420', '035720', '018260', '036570'],
 'Manufacturer': ['005380', '000270', '012330', '033780', '0

# 2. 일별 종가 데이터 수집 - 10개
키움 API는 구름IDE에서 실행되지 않아, 네이버 금융 일별 시세 페이지를 웹 크롤링하여 수집

In [4]:
def get_stock_price(code, num_of_pages):
    """Scrape daily price rows for one stock from Naver Finance.

    Args:
        code: Stock code placed in the ``code=`` query parameter of the
            daily-quote URL (e.g. ``'005930'``).
        num_of_pages: Maximum number of listing pages to download. The
            value is capped at the last page number advertised by the site.

    Returns:
        A DataFrame with the columns
        ``['일자', '거래량', '시가', '고가', '저가', '종가']``. Rows containing
        NaN are dropped, the price/volume columns are cast to ``int``, and
        rows are ordered from the newest date to the oldest with a fresh
        0..n-1 index.
    """
    # Local import keeps the function usable without touching the notebook's
    # import cell.
    from io import StringIO

    # One User-agent definition for every request; previously the first
    # request depended on a global `headers` defined in a later cell.
    request_headers = {'User-agent': 'Mozilla/5.0'}
    request_timeout = 10  # seconds; avoids hanging forever on a stalled socket

    url = f"http://finance.naver.com/item/sise_day.nhn?code={code}"
    first_page = requests.get(url=url, headers=request_headers, timeout=request_timeout)
    bs = BeautifulSoup(first_page.text, 'html.parser')

    # The "pgRR" cell holds the link to the last page; its href ends in
    # "page=<N>".
    pgrr = bs.find("td", class_="pgRR")
    if pgrr is not None and pgrr.a is not None:
        last_page = int(pgrr.a["href"].split('=')[-1])
    else:
        # NOTE(review): no "last page" link was found. Presumably the listing
        # is short enough that the site omits it -- fall back to the first
        # page instead of failing with an AttributeError. Confirm against the
        # live page if more history is required for such stocks.
        last_page = 1
    pages = min(last_page, num_of_pages)  # never request more pages than exist

    # Collect the per-page tables and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    frames = []
    for page in range(1, pages + 1):
        page_url = '{}&page={}'.format(url, page)
        html = requests.get(page_url, headers=request_headers, timeout=request_timeout).text
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        frames.append(pd.read_html(StringIO(html))[0])
    df = pd.concat(frames) if frames else pd.DataFrame()

    df['일자'] = pd.to_datetime(df['날짜'])
    df = df.dropna()  # drops the blank spacer rows of the scraped table
    numeric_columns = ['종가', '시가', '고가', '저가', '거래량']
    df[numeric_columns] = df[numeric_columns].astype(int)
    df = df[['일자', '거래량', '시가', '고가', '저가', '종가']]  # fixed column order
    # Sort ascending by date, then reverse so the newest row comes first.
    df = df.sort_values(by='일자')
    df = df.iloc[::-1].reset_index(drop=True)
    return df

In [5]:
# Crawl settings shared by the cells below.
pages = 1  # number of listing pages to fetch per stock
headers = {'User-agent': 'Mozilla/5.0'}

# Result container: one (initially empty) dict per sector name, later filled
# with stock_code -> DataFrame entries.
df_sector_UTD = {}
for sector_name in sector:
    df_sector_UTD.setdefault(sector_name, {})


In [6]:
# Download the latest price rows for every stock code of every sector and
# store them under df_sector_UTD[sector_name][stock_code].
for sector_name, code_list in sector.items():
    for stock_code in tqdm(code_list):  # one progress bar per sector
        latest_prices = get_stock_price(stock_code, pages)
        df_sector_UTD[sector_name][stock_code] = latest_prices
        time.sleep(0.01)  # short pause between consecutive requests

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.67it/s]
100%|███████████████████████████████████

# 3. 이전 data와 최신 data 병합(최신화)

In [7]:
# Load the previously saved per-sector price frames
# (sector_name -> stock_code -> DataFrame).
# NOTE: pickle.load executes arbitrary code from the file; only open
# pickles that were produced by this project.
with open('df_sector.pickle', 'rb') as saved_file:
    df_sector = pickle.load(saved_file)

In [8]:
# Merge the freshly crawled rows into the stored history of every stock.
for sector_name, df_dict in df_sector.items():
    for stock_code, df in df_dict.items():
        # Fresh rows go first so that drop_duplicates (keep='first' by
        # default) prefers them over stored rows with the same '일자'.
        df = (
            pd.concat([df_sector_UTD[sector_name][stock_code], df], axis=0)
            .drop_duplicates(subset=['일자'])
        )
        # Order by the first column, ascending, and renumber the index.
        df = df.sort_values(by=df.columns[0], ascending=True).reset_index(drop=True)
        df_dict[stock_code] = df

In [9]:
# Spot-check the merged frame for one stock (sector 'Food', code '097950').
df_sector['Food']['097950']

Unnamed: 0,일자,거래량,시가,고가,저가,종가
10,2010-06-07,48198,210500,213500,209500,212000
11,2010-06-08,50727,211000,213500,209500,211000
12,2010-06-09,52375,212000,218000,210500,214000
13,2010-06-10,61195,214500,219500,213500,219000
14,2010-06-11,89684,217000,220000,211500,214500
...,...,...,...,...,...,...
4,2022-07-26,19443,395000,400500,395000,397000
3,2022-07-27,16778,396500,400000,394000,395000
2,2022-07-28,25944,395500,398000,391000,395000
1,2022-07-29,27618,397500,397500,389000,391500


In [10]:
# Persist the merged per-sector frames.
# NOTE(review): the output file is 'df_sector_UTD.pickle', whereas the merge
# step loads 'df_sector.pickle' -- confirm whether the next run is expected to
# read this file (e.g. after a manual rename).
with open('df_sector_UTD.pickle', 'wb') as out_file:
    pickle.dump(df_sector, out_file)