## 파이썬 BeautifulSoup를 활용한 웹 크롤링

- https://comp.fnguide.com/SVO2/ASP/SVD_Finance.asp?pGB=1&gicode=A005930&cID=&MenuYn=Y&ReportGB=&NewMenuID=103&stkGb=701  
    위의 사이트를 크롤링  
   
 - 손익계산서 크롤링

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import sqlalchemy

In [2]:
# Target security as [stock code, market, company name].
stock = ['005930', 'KOSPI', '삼성전자']
# Statement basis and reporting frequency; later cells compare them
# case-insensitively against 'CONSOLIDATED' and 'A' respectively.
rpt_type, freq = 'CONSOLIDATED', 'Q'

In [3]:
# Browser-like User-Agent header sent along with the page request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.151 "
        "Whale/3.14.134.62 Safari/537.36"
    ),
}
# The consolidated and separate statement URLs differ only in ReportGB:
# 'D' selects the consolidated statement, 'B' the separate one.
report_gb = 'D' if rpt_type.upper() == 'CONSOLIDATED' else 'B'
url = (
    "https://comp.fnguide.com/SVO2/ASP/SVD_Finance.asp"
    "?pGB=1&gicode=A{}&cID=&MenuYn=Y&ReportGB={}&NewMenuID=103&stkGb=701"
).format(stock[0], report_gb)

In [4]:
# Request the statement page with the custom headers and parse the body.
req = Request(url=url, headers=headers)
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urlopen(req) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Display the raw response bytes fetched above.
html

b'\r\n<!DOCTYPE html>\r\n<html lang="ko">\r\n<head>\r\n    \r\n<link rel="stylesheet" href="https://cdn.fnguide.com/SVO2/css/compeach.css?ver2109131021">\r\n<!--[if lte IE 9]><link rel="stylesheet" href="https://cdn.fnguide.com/SVO2/css/ie9.css?ver3"><![endif]-->\r\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\r\n<meta charset="utf-8">\r\n<meta content="width=device-width" name="viewport">\r\n<!-- <meta content="width=device-width,initial-scale=1.0,minimum-scale=1.0,maximum-scale=1.0" name="viewport"> -->\r\n<script type="text/javascript" src="https://cdn.fnguide.com/SVO2/js/lib/jquery-1.11.1.min.js?v=1.0.6"></script>\r\n<script type="text/javascript" src="https://cdn.fnguide.com/SVO2/js/comp_load.js?v=1.0.6"></script>\r\n<script type="text/javascript" src="https://cdn.fnguide.com/SVO2/js/lib/jquery.autocomplete.js?v=1.0.6"></script>\r\n<link rel="stylesheet" type="text/css" href="https://cdn.fnguide.com/SVO2/css/jquery.autocomplete.css?v=1.0.1" />\r\n<LINK REL="SHORTCUT ICON"

In [6]:
# Both the annual and the quarterly table are read for the 4 most recent periods.
num_col = 4
# Container id of the income-statement table: annual when freq is 'A',
# quarterly for any other value.
section_id = 'divSonikY' if freq.upper() == 'A' else 'divSonikQ'
is_a = soup.find(id=section_id)
if is_a is None:
    # soup.find returns None when the id is absent; fail with a clear message
    # rather than an AttributeError on None in the next statement.
    raise ValueError(
        "income statement section '{}' not found in the page".format(section_id))
# From here on is_a is the list of <tr> rows: the header row first,
# followed by one row per statement item.
is_a = is_a.find_all(['tr'])
is_a

[<tr>
 <th class="clf tbold" scope="col">IFRS(연결)</th>
 <th scope="col">2021/06</th>
 <th scope="col">2021/09</th>
 <th scope="col">2021/12</th>
 <th scope="col">2022/03</th>
 <th scope="col">전년동기</th>
 <th class="cle" scope="col">전년동기(%)</th>
 </tr>,
 <tr class="rwf rowBold">
 <th class="l clf" scope="row">
 <div class="th_b">매출액</div>
 </th>
 <td class="r" title="636,715.85">636,716</td>
 <td class="r" title="739,791.87">739,792</td>
 <td class="r" title="765,655.24">765,655</td>
 <td class="r" title="777,814.98">777,815</td>
 <td class="r" title="653,885.03">653,885</td>
 <td class="r cle">19.0</td>
 </tr>,
 <tr class="rwf">
 <th class="l clf" scope="row">
 <div class="">매출원가</div>
 </th>
 <td class="r" title="370,659.31">370,659</td>
 <td class="r" title="428,988.71">428,989</td>
 <td class="r" title="449,465.55">449,466</td>
 <td class="r" title="470,720.57">470,721</td>
 <td class="r" title="414,999.85">415,000</td>
 <td class="r cle">13.4</td>
 </tr>,
 <tr class="rwf">
 <th clas

In [7]:
# Korean item name of every row after the header row, taken from the row's
# first <th> with newlines, non-breaking spaces and the
# '계산에 참여한 계정 펼치기' helper text removed.
items_kr = []
for row in is_a[1:]:
    label = row.find(['th']).get_text()
    for unwanted in ('\n', '\xa0', '계산에 참여한 계정 펼치기'):
        label = label.replace(unwanted, '')
    items_kr.append(label)
items_kr

['매출액',
 '매출원가',
 '매출총이익',
 '판매비와관리비',
 '인건비',
 '유무형자산상각비',
 '연구개발비',
 '광고선전비',
 '판매비',
 '관리비',
 '기타원가성비용',
 '기타',
 '영업이익',
 '영업이익(발표기준)',
 '금융수익',
 '이자수익',
 '배당금수익',
 '외환이익',
 '대손충당금환입액',
 '매출채권처분이익',
 '당기손익-공정가치측정금융자산관련이익',
 '금융자산처분이익',
 '금융자산평가이익',
 '금융자산손상차손환입',
 '파생상품이익',
 '기타금융수익',
 '금융원가',
 '이자비용',
 '외환손실',
 '대손상각비',
 '당기손익-공정가치측정금융자산관련손실',
 '매출채권처분손실',
 '금융자산처분손실',
 '금융자산평가손실',
 '금융자산손상차손',
 '파생상품손실',
 '기타금융원가',
 '기타수익',
 '이자수익',
 '배당금수익',
 '외환이익',
 '재고자산감모손실환입',
 '재고자산폐기(처분)이익',
 '당기손익-공정가치측정금융자산평가이익',
 '자산처분(폐기)이익',
 '자산평가이익',
 '자산손상차손환입',
 '파생상품이익',
 '임대료수익',
 '로열티수익',
 '수수료수익',
 '대손충당금환입',
 '충당부채환입액',
 '기타',
 '기타비용',
 '이자비용',
 '외환손실',
 '재고자산감모손실',
 '재고자산폐기(처분)손실',
 '당기손익-공정가치측정금융자산평가손실',
 '자산처분(폐기)손실',
 '자산평가손실',
 '자산손상차손',
 '파생상품손실',
 '기타대손상각비',
 '충당부채전입액',
 '기타',
 '종속기업,공동지배기업및관계기업관련손익',
 '지분법손익',
 '종속기업,공동지배기업및관계기업투자주식처분손익',
 '종속기업,공동지배기업및관계기업투자주식손상관련손익',
 '기타',
 '세전계속사업이익',
 '법인세비용',
 '계속영업이익',
 '중단영업이익',
 '당기순이익',
 '지배주주순이익',
 '비지배주주순이익']

In [8]:
# Header cells 1..num_col of the first row carry the period labels.
header_cells = is_a[0].find_all('th')
_period = []
for n in range(1, num_col + 1):
    label = header_cells[n].get_text().replace('/', '-')
    # Only the leading 'YYYY-MM' part is parsed, so every date falls on the
    # first day of its month.
    _period.append(datetime.strptime(label[:7], '%Y-%m').date())

# period and _period refer to the same list of date objects.
period = _period

In [9]:
def _parse_amount(cell):
    """Return the numeric value stored in a <td> cell's ``title`` attribute.

    Thousands separators and non-breaking spaces are stripped before the
    conversion. A missing or blank ``title`` yields 0.
    """
    text = cell.get('title', '').replace(',', '').replace('\xa0', '')
    return float(text) if text else 0


# Publish the first num_col values of every item row as a module-level list
# named after the row's Korean item name; the DataFrame cell reads those names.
# NOTE: items_kr contains repeated names (e.g. '기타', '이자수익'), so a later
# row overwrites an earlier row that has the same name.
for item, row in zip(items_kr, is_a[1:]):
    cells = row.find_all('td')  # looked up once per row, not once per value
    globals()[item] = [_parse_amount(cells[j]) for j in range(num_col)]

In [10]:
# Identification columns; the scalar entries are repeated on every period row.
frame_data = {
    '종목코드': stock[0],
    '종목명': stock[2],
    '시장': stock[1],
    '기간': _period,
    '컬럼': '{}_{}'.format(rpt_type.lower(), freq.lower()),
}
# (output column, per-period values) pairs in output column order. The values
# are the module-level lists created by the extraction cell; names that are not
# valid identifiers are fetched through globals().
# NOTE(review): 'EBIT' is filled from 세전계속사업이익 (pre-tax income from
# continuing operations) — confirm that this label is intended.
value_columns = [
    ('매출액', 매출액),
    ('매출원가', 매출원가),
    ('매출총이익', 매출총이익),
    ('판매비', 판매비),
    ('관리비', 관리비),
    ('영업이익', 영업이익),
    ('금융수익', 금융수익),
    ('금융원가', 금융원가),
    ('기타수익', 기타수익),
    ('기타비용', 기타비용),
    ('관계기업관련손익', globals()['종속기업,공동지배기업및관계기업관련손익']),
    ('EBIT', 세전계속사업이익),
    ('법인세비용', 법인세비용),
    ('계속영업이익', 계속영업이익),
    ('중단영업이익', 중단영업이익),
    ('당기순이익', 당기순이익),
    ('지배주주순이익', globals()['지배주주순이익']),
    ('비지배주주순이익', globals()['비지배주주순이익']),
]
frame_data.update(value_columns)
is_domestic = pd.DataFrame(frame_data, index=_period)

In [11]:
# Display the assembled income-statement DataFrame.
is_domestic

Unnamed: 0,종목코드,종목명,시장,기간,컬럼,매출액,매출원가,매출총이익,판매비,관리비,...,기타수익,기타비용,관계기업관련손익,EBIT,법인세비용,계속영업이익,중단영업이익,당기순이익,지배주주순이익,비지배주주순이익
2021-06-01,5930,삼성전자,KOSPI,2021-06-01,consolidated_q,636715.85,370659.31,266056.54,26458.35,14686.37,...,7406.68,8768.96,1874.87,128824.3,32479.44,96344.86,0,96344.86,94506.76,1838.1
2021-09-01,5930,삼성전자,KOSPI,2021-09-01,consolidated_q,739791.87,428988.71,310803.16,35628.16,14454.61,...,4251.52,3227.94,2657.46,163558.69,40625.38,122933.31,0,122933.31,120572.07,2361.24
2021-12-01,5930,삼성전자,KOSPI,2021-12-01,consolidated_q,765655.24,449465.55,316189.69,38558.94,18174.77,...,7401.33,5320.41,1288.62,143628.83,35249.35,108379.48,0,108379.48,106431.22,1948.26
2022-03-01,5930,삼성전자,KOSPI,2022-03-01,consolidated_q,777814.98,470720.57,307094.41,35745.75,17303.91,...,7001.93,4531.1,2324.77,150698.4,37452.12,113246.28,0,113246.28,111290.94,1955.34
