In [None]:
# API KEY를 환경변수로 관리하기 위한 설정 파일
from dotenv import load_dotenv
import dart_fss as dart
import os
import pandas as pd
import requests
import zipfile
import io
from lxml import etree

# Load API key settings via python-dotenv, then read the DART key from the
# process environment (None when the variable is not set).
load_dotenv()
DART_API_KEY = os.environ.get("DART_API_KEY")

In [None]:
# Register the API key with dart_fss before calling any of its endpoints.
dart.set_api_key(api_key=DART_API_KEY)

# Download the company-code listing. It is named `corp_list` rather than `all`
# so the built-in all() is not shadowed for the rest of the kernel session.
corp_list = dart.api.filings.get_corp_code()
df = pd.DataFrame(corp_list)

# Pick one company by its exact name and look up its code.
TARGET_CORP_NAME = "삼성물산"
matches = df[df["corp_name"] == TARGET_CORP_NAME]
if matches.empty:
    # Fail with a readable message instead of the IndexError that
    # `.iloc[0, 0]` raises on an empty selection.
    raise ValueError(
        f"No company named {TARGET_CORP_NAME!r} found in the DART corp-code list"
    )

# NOTE(review): relies on the first column holding the corporation code — confirm.
corp_code = matches.iloc[0, 0]
corp_code

In [None]:
# Set up LangSmith tracing. https://smith.langchain.com
# !pip install -qU langchain-teddynote
# Note: this import binds the name `logging` to the object exported by
# langchain_teddynote, not to the standard-library `logging` module.
from langchain_teddynote import logging

# Provide the LangSmith project name.
logging.langsmith("Spoon")

In [None]:
# NOTE: this hard-coded code replaces whatever `corp_code` an earlier cell
# looked up by company name.
corp_code = "00545929"

url_json = "https://opendart.fss.or.kr/api/list.json"

# The key is taken from the environment (DART_API_KEY, loaded in the first
# cell) instead of being written into the notebook. A key that was previously
# committed in plain text should be considered leaked and rotated.
params = {
    "crtfc_key": DART_API_KEY,
    "corp_code": corp_code,
    "bgn_de": "20230101",  # start of the search window (YYYYMMDD)
    "end_de": "20240630",  # end of the search window (YYYYMMDD)
    # NOTE(review): presumably the report-type filter for annual business
    # reports — confirm against the OpenDART documentation.
    "pblntf_detail_ty": "A001",
}

# A timeout keeps the cell from hanging forever; raise_for_status() surfaces
# HTTP-level failures before the body is parsed as JSON.
response = requests.get(url_json, params=params, timeout=30)
response.raise_for_status()

data = response.json()
data_list = data.get("list")
df_list = pd.DataFrame(data_list)

if df_list.empty:
    # Include the code that was queried plus whatever status/message the API
    # returned, so an invalid key can be told apart from "no filings".
    raise ValueError(
        f"No data found for corporation code: {corp_code} "
        f"(status={data.get('status')}, message={data.get('message')})"
    )

# Convert rcept_dt to datetime and pick the most recent filing.
# NOTE(review): assumes rcept_dt is always a YYYYMMDD string — confirm.
df_list["rcept_dt"] = pd.to_datetime(df_list["rcept_dt"], format="%Y%m%d")
latest_report = df_list.sort_values("rcept_dt", ascending=False).iloc[0]
print(latest_report.rcept_no)

In [None]:
# Use the key loaded from the environment in the first cell rather than a
# literal embedded in the notebook.
api_key = DART_API_KEY
rcp_no = latest_report.rcept_no  # receipt number chosen by the previous cell
url = "https://opendart.fss.or.kr/api/document.xml"
params = {"crtfc_key": api_key, "rcept_no": rcp_no}

# Bound the wait and fail early on HTTP errors.
response = requests.get(url, params=params, timeout=60)
response.raise_for_status()

# Show only the payload size: echoing the raw bytes floods the cell output
# and bloats the saved notebook.
len(response.content)

In [None]:
def extract_section(root, start_aassocnote, end_aassocnote):
    """Serialize the run of sibling elements from one TITLE marker to another.

    The start and end markers are the first ``TITLE`` elements with
    ``ATOC='Y'`` whose ``AASSOCNOTE`` attribute equals ``start_aassocnote``
    and ``end_aassocnote`` respectively.

    Args:
        root: Parsed lxml element to search (queried with ``root.xpath``).
        start_aassocnote: AASSOCNOTE value of the TITLE where extraction starts.
        end_aassocnote: AASSOCNOTE value of the TITLE where extraction stops.

    Returns:
        The concatenated serialization (tail text included) of the start
        element and each following sibling, up to and including the end
        element. If the end element is not among the start element's
        following siblings, everything up to the last sibling is returned.
        Returns ``None`` when either marker is absent, which matches the
        definition of this function in a later cell.
    """
    start_elements = root.xpath(
        f"//TITLE[@ATOC='Y' and @AASSOCNOTE='{start_aassocnote}']"
    )
    end_elements = root.xpath(
        f"//TITLE[@ATOC='Y' and @AASSOCNOTE='{end_aassocnote}']"
    )

    # Indexing an empty XPath result used to raise a bare IndexError here;
    # report "not found" explicitly instead.
    if not start_elements or not end_elements:
        return None

    start_element = start_elements[0]
    end_element = end_elements[0]

    extracted_elements = []
    current_element = start_element
    # getnext() only walks siblings, so the loop ends either at the end marker
    # or when the parent has no more children.
    while current_element is not None:
        extracted_elements.append(
            etree.tostring(current_element, encoding="unicode", with_tail=True)
        )
        if current_element == end_element:
            break
        current_element = current_element.getnext()

    return "".join(extracted_elements)

In [None]:
# Confirm the payload is a ZIP before opening it, so a failed download reports
# what was actually received.
# NOTE(review): presumably the endpoint returns a non-ZIP error body on
# failure — confirm against the OpenDART documentation.
payload = io.BytesIO(response.content)
if not zipfile.is_zipfile(payload):
    raise zipfile.BadZipFile(
        f"Response is not a ZIP archive; first bytes: {response.content[:200]!r}"
    )
payload.seek(0)

with zipfile.ZipFile(payload) as zf:
    # Read the member list once and reuse it for both the listing and the filter.
    member_names = zf.namelist()

    print("ZIP 파일 내용:")
    for member_name in member_names:
        print(member_name)

    # Keep the XML members whose name contains the receipt number; the first
    # match is the one that gets read.
    audit_fnames = [
        member_name
        for member_name in member_names
        if rcp_no in member_name and member_name.endswith(".xml")
    ]
    if not audit_fnames:
        raise ValueError("감사보고서 파일을 찾을 수 없습니다.")
    xml_data = zf.read(audit_fnames[0])
extracted_content = xml_data.decode("utf-8")

# Build the message from the same path that is written, so the two cannot
# drift apart (the message used to name a different file).
output_path = "extracted_sections10.xml"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(extracted_content)
print(f"추출된 섹션들이 '{output_path}' 파일로 저장되었습니다.")

In [None]:
# Parse the XML; recover=True asks lxml to keep going past malformed markup.
parser = etree.XMLParser(recover=True, encoding="utf-8")
root = etree.fromstring(xml_data, parser)

# Extract the section delimited by the two AASSOCNOTE markers.
# This notebook defines extract_section twice: one definition raises
# IndexError when a marker is missing, the other returns None. Treat both as
# "section not found" so the cell works whichever one is active in the kernel.
try:
    extracted_xml1 = extract_section(root, "D-0-3-2-0", "D-0-3-3-0")
except IndexError:
    extracted_xml1 = None

if extracted_xml1 is None:
    # Nothing to write; report it instead of crashing the cell.
    print("시작 또는 끝 TITLE 요소를 찾지 못해 'extracted1.xml' 파일을 저장하지 않았습니다.")
else:
    with open("extracted1.xml", "w", encoding="utf-8") as f:
        f.write(extracted_xml1)

In [19]:
def extract_section(root, start_aassocnote, end_aassocnote):
    """Return the serialized siblings between two TITLE markers, or None.

    Both markers are looked up as the first ``TITLE`` element with
    ``ATOC='Y'`` and the given ``AASSOCNOTE`` value. Starting at the first
    marker, each element (with its tail text) is serialized and the walk
    moves to the next sibling until the second marker has been emitted or
    the siblings run out.

    Args:
        root: Parsed lxml element that is queried via ``root.xpath``.
        start_aassocnote: AASSOCNOTE value identifying the first marker.
        end_aassocnote: AASSOCNOTE value identifying the last marker.

    Returns:
        The joined serialization as a string, or ``None`` if either marker
        cannot be found.
    """

    def _title_query(note):
        # XPath for a table-of-contents TITLE carrying the given AASSOCNOTE.
        return f"//TITLE[@ATOC='Y' and @AASSOCNOTE='{note}']"

    start_hits = root.xpath(_title_query(start_aassocnote))
    end_hits = root.xpath(_title_query(end_aassocnote))

    # Either marker missing -> nothing to extract.
    if not (start_hits and end_hits):
        return None

    stop_at = end_hits[0]
    pieces = []
    node = start_hits[0]
    while node is not None:
        pieces.append(etree.tostring(node, encoding="unicode", with_tail=True))
        if node == stop_at:
            # The end marker itself is part of the output.
            break
        node = node.getnext()

    return "".join(pieces)


# Use the key loaded from the environment in the first cell rather than a
# literal embedded in the notebook.
api_key = DART_API_KEY
rcp_no = latest_report.rcept_no  # receipt number selected in an earlier cell
url = "https://opendart.fss.or.kr/api/document.xml"
params = {"crtfc_key": api_key, "rcept_no": rcp_no}

# Bound the wait and fail early on HTTP errors.
response = requests.get(url, params=params, timeout=60)
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
    # Read the member list once and reuse it for both the listing and the filter.
    member_names = zf.namelist()

    print("ZIP 파일 내용:")
    for member_name in member_names:
        print(member_name)

    # Keep the XML members whose name contains the receipt number; the first
    # match is the one that gets read.
    audit_fnames = [
        member_name
        for member_name in member_names
        if rcp_no in member_name and member_name.endswith(".xml")
    ]
    if not audit_fnames:
        raise ValueError("감사보고서 파일을 찾을 수 없습니다.")
    xml_data = zf.read(audit_fnames[0])

extracted_content = xml_data.decode("utf-8")

# Parse the XML; recover=True asks lxml to keep going past malformed markup.
parser = etree.XMLParser(recover=True, encoding="utf-8")
root = etree.fromstring(xml_data, parser)

# Try to extract the section delimited by the two AASSOCNOTE markers.
extracted_xml1 = extract_section(root, "D-0-3-2-0", "D-0-3-3-0")

if extracted_xml1:
    # Section found: save just that section.
    with open("extracted_section.xml", "w", encoding="utf-8") as f:
        f.write(extracted_xml1)
    print("추출된 섹션이 'extracted_section.xml' 파일로 저장되었습니다.")
else:
    # Section not found: fall back to saving the whole XML document.
    with open("full_extracted_xml.xml", "w", encoding="utf-8") as f:
        f.write(extracted_content)
    print("전체 XML 내용이 'full_extracted_xml.xml' 파일로 저장되었습니다.")

ZIP 파일 내용:
20240313000597_00760.xml
20240313000597.xml
20240313000597_00761.xml
전체 XML 내용이 'full_extracted_xml.xml' 파일로 저장되었습니다.
