In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests as req
from tqdm import tqdm
import os
import re
from tika import parser
import shutil
from os import listdir
from os.path import isfile, join

In [2]:
# Accumulators filled by the scraping cell:
# post titles, attachment file names, attachment URLs.
title_list, f_name_list, link_list = [], [], []
# Running counter used when naming attachment files.
cnt = 0
# Text conversion.
txt_list = []

In [None]:
# Collect post titles and PDF attachment links from every list page.
# The accumulators are reset here so that re-running this cell does not
# append duplicate entries on top of a previous run.
title_list, f_name_list, link_list = [], [], []
cnt = 0

for page_no in tqdm(range(1, 54)) :
    min_url = f"https://www.bok.or.kr/portal/bbs/B0000245/list.do?menuNo=200761&pageIndex={page_no}"
    # A timeout keeps a stalled connection from hanging the whole crawl.
    res = req.get(min_url, timeout=30)
    soup = bs(res.content, "html.parser")

    links = soup.select("div.col.m2.s2.x3.fileLink a")
    titles = soup.select("span.titlesub")

    title_list.extend(title.text for title in titles)

    for link in links :
        attachment_name = link.get_text().strip()

        # Only collect attachments whose name ends in "pdf".
        if attachment_name.endswith("pdf") :
            # File name: the k-th PDF link is named after the k-th collected title.
            # NOTE(review): this assumes exactly one PDF attachment per post — confirm.
            filename = f"{title_list[cnt]}.pdf"
            f_name_list.append(filename)
            cnt += 1

            # Absolute address of the attachment.
            file_addr = "http://bok.or.kr" + link["href"]
            link_list.append(file_addr)

print(len(title_list), len(f_name_list), len(link_list))

In [4]:
# Trim each list to the entries published before the HTML structure of the
# Bank of Korea Monetary Policy Board minutes page changed.
title_list, f_name_list, link_list = (
    items[:300] for items in (title_list, f_name_list, link_list)
)

In [None]:
# Build the metadata DataFrame (one row per minutes post) and show all rows.
pd.set_option("display.max_rows", None)
df = pd.DataFrame(dict(title=title_list, file=f_name_list, url=link_list))
df

In [None]:
# Add a date column: strip all whitespace from the title, then capture the
# "(YYYY.M.D)" stamp at its very end.
compact_title = df["title"].str.replace(r"\s+", "", regex=True)
df["date"] = compact_title.str.extract(r"\((\d{4}\.\d{1,2}\.\d{1,2})\.?\)$")
df["date"] = pd.to_datetime(df["date"], format="%Y.%m.%d")
# Put the date first.
df = df.reindex(columns=["date", "title", "file", "url"])
df

In [None]:
# Number of rows whose date was parsed successfully (non-null).
print(df["date"].notna().sum())

In [None]:
# Save the attachment metadata. Create the data directory first so the
# notebook also runs on a fresh checkout where ../data does not exist yet.
os.makedirs("../data", exist_ok=True)
df.to_csv("../data/minutes_info.csv", encoding = "utf8")

## PDF 의사록 다운로드

In [None]:
# Download every attachment into ../data/pdf/.
pdf_dir = "../data/pdf/"
# Create the target directory (and any missing parents) once, up front.
os.makedirs(pdf_dir, exist_ok=True)

for file_url, file_name in tqdm(zip(link_list, f_name_list), total=len(link_list)) :
    try :
        # A timeout keeps one stalled download from blocking the rest.
        res = req.get(file_url, timeout=60)
        if res.status_code == 200 :
            file_path = os.path.join(pdf_dir, file_name)

            # Write the payload; the context manager closes the handle.
            with open(file_path, "wb") as pdf_file :
                pdf_file.write(res.content)
        else :
            # Report skipped downloads instead of dropping them silently.
            print("Error! : ", file_url, res.status_code)
    except Exception as e :
        # Best effort: report the failure and move on to the next file.
        print("Error! : ", e)
        continue
print("Complete😇")

## pdf 👉 text

In [None]:
# Convert the PDF files in a folder into text files.
def pdf2txt(source_folder="../data/pdf/", output_folder="../data/text/") :
    """Convert every PDF in `source_folder` into a UTF-8 text file in `output_folder`.

    Each PDF is copied to a temporary ``tmp.pdf`` inside `output_folder`,
    parsed with tika, stripped of newline characters and written out as
    ``<original name>.txt``. A failure on one file is reported and the
    remaining files are still processed.

    Args:
        source_folder: Folder containing the PDF files.
        output_folder: Folder receiving the text files; created if missing.
    """
    # Create the folder actually requested by the caller, not a fixed path.
    os.makedirs(output_folder, exist_ok=True)

    # Regular files only, restricted to PDFs.
    pdf_files = [
        f for f in listdir(source_folder)
        if isfile(join(source_folder, f)) and f.lower().endswith(".pdf")
    ]

    # The parser always reads from this scratch copy.
    # NOTE(review): presumably this gives tika a simple path — confirm.
    pdf_tmp_filepath = join(output_folder, "tmp.pdf")

    try :
        for pdf in tqdm(pdf_files) :
            try :
                shutil.copyfile(join(source_folder, pdf), pdf_tmp_filepath)

                # Parse once (the PDF used to be parsed twice per file).
                content = parser.from_file(pdf_tmp_filepath)["content"]
                if content is None :
                    # Nothing to write when the parser returned no content.
                    print("오류", pdf, "no text extracted")
                    continue

                # Remove line breaks.
                parsedPDF = content.replace("\n", "")

                # Swap only the file extension, never a ".pdf" inside the folder path.
                output_filepath = join(output_folder, os.path.splitext(pdf)[0] + ".txt")

                with open(output_filepath, "w", encoding="utf-8") as f :
                    print(output_filepath)
                    f.write(parsedPDF)
            except Exception as e :
                # Report the failing file and continue with the next one.
                print("오류", pdf, e)
    finally :
        # Do not leave the scratch PDF among the text files, where later
        # steps that read every file in the folder would pick it up.
        if isfile(pdf_tmp_filepath) :
            os.remove(pdf_tmp_filepath)

In [None]:
# Convert the PDFs under ../data/pdf/ into text files under ../data/text/.
pdf2txt(source_folder="../data/pdf/", output_folder="../data/text/")

## 의사록 섹션 분리

In [None]:
def tidy_sentences(section) :
    """Split a section into sentences and return them with a joined text.

    A sentence ends at a newline, "." or ";" that follows one of the
    syllables 함/음/됨/임/봄/짐/움, or at a "." that follows 다. Text after the
    last sentence ender is discarded. Newlines inside a sentence become
    spaces and every sentence is terminated with ".".

    Returns:
        (sentences, text): the list of sentences and the same sentences
        joined with newlines ("" when there are none).
    """
    ender_pattern = re.compile(r"((?<=[함음됨임봄짐움])(\s*\n|\.|;)|(?<=다)\.)\s*")

    sentences = []
    cursor = 0  # where the sentence currently being collected starts
    for ender in ender_pattern.finditer(section) :
        raw_sentence = section[cursor:ender.start()]
        sentences.append(raw_sentence.replace("\n", " ").replace(" ", " ") + ".")
        cursor = ender.end()

    # Joining an empty list already yields "".
    text = "\n".join(sentences)

    return sentences, text

In [None]:
def preprocess_minutes(minutes) :
    """Split the text of one set of minutes into six topical sections.

    The start of each section is located with a header regex; a header that
    is not found yields no section. Each section spans from its own header
    to the next header that was found. A missing end marker means "up to the
    end of the document". Inside a section, the text before the first
    committee-member remark ("일부 위원은" / "대부분의 위원들은") is dropped.
    For the government section, the header itself plus one following
    character is dropped instead. The remainder is split by `tidy_sentences`.

    Args:
        minutes: Full text of the minutes as a single string.

    Returns:
        (sections, section_texts): `sections` is the list of six section
        labels; `section_texts` is a 6-tuple of sentence lists in the same
        order as the labels.
    """
    def first_start(pattern) :
        # Offset of the first match of `pattern` in the minutes, or -1.
        found = re.search(pattern, minutes, re.MULTILINE)
        return found.start() if found else -1

    def cut(begin, end) :
        # Slice minutes[begin:end], treating -1 as "not found" rather than
        # as a negative index: no start -> no section, no end -> run to the
        # end of the document (without losing the final character).
        if begin < 0 :
            return ""
        return minutes[begin:] if end < 0 else minutes[begin:end]

    def from_first_remark(section) :
        # Drop everything before the first committee-member remark, if any.
        found = re.search("(일부|대부분의) 위원들?은", section, re.MULTILINE)
        return section[found.start():] if found else section

    def after_government_header(section) :
        # Drop the government header and the single character following it.
        found = re.search("정부측 열석자 발언", section, re.MULTILINE)
        return section[found.end() + 1:] if found else section

    # Section header offsets. The patterns are raw strings so that "\s" and
    # "\(" reach the regex engine without invalid-escape warnings.
    s1 = first_start(r"(.?국내외\s?경제\s?동향.?과 관련하여,?|\(가\).+경제전망.*|\(가\) 국내외 경제동향 및 평가)\n?\s*일부 위원은")
    s2 = first_start(r"(.?외환.?국제금융\s?동향.?과 관련하여.*|\(나\) 외환.국제금융\s?(및 금융시장)?\s?동향)\n?\s*(일부 위원은|대부분의 위원들은)")
    s3 = first_start(r"(.?금융시장\s?동향.?과 관련하여,?|\(다\) 금융시장\s?동향)\n?\s*일부 위원은")
    s4 = first_start(r"((\((다|라)\) )?.?통화정책\s?방향.?에 관한 토론,?|이상과 같은 의견\s?교환을 바탕으로.*통화정책\s?방향.*에.*토론.*)\n?")
    s5 = first_start(r"(\(4\) 정부측 열석자 발언.*)\n?")
    s6 = first_start(r"(\(.*\) 한국은행 기준금리 결정에 관한 위원별 의견\s?개진|이상과 같은 토론에 이어 .* 관한 위원별 의견개진이 있었음.*)\n?")
    # First "deliberation result / conclusion" header located after s6.
    conclusions = re.finditer(r"(\(\s?.*\s?\) ()(심의결과|토의결론))\n?", minutes, re.MULTILINE)
    s7 = next((m.start() for m in conclusions if m.start() > s6), -1)

    # Domestic and international economic situation.
    section1, _ = tidy_sentences(from_first_remark(cut(s1, s2)))

    # Foreign exchange and international finance.
    section2, _ = tidy_sentences(from_first_remark(cut(s2, s3 if s3 >= 0 else s4)))

    # Financial markets.
    section3, _ = tidy_sentences(from_first_remark(cut(s3, s4)))

    # Monetary policy direction: ends at the first later header that exists.
    policy_end = s5 if s5 >= 0 else s6 if s6 >= 0 else s7
    section4, _ = tidy_sentences(from_first_remark(cut(s4, policy_end)))

    # Opinions of the individual members.
    section5, _ = tidy_sentences(from_first_remark(cut(s6, s7)))

    # Remarks of the attending government representative.
    section6, _ = tidy_sentences(after_government_header(cut(s5, s6)))

    sections = ["Economic Situation", "Foreign Currency", "Financial Markets",
                "Monetary Policy", "Participants’ Views", "Government’s View"]
    section_texts = (section1, section2, section3, section4, section5, section6)

    return sections, section_texts

In [None]:
def preprocessing(source_folder, output_file) :
    """Build a per-section DataFrame from the text files and save it as CSV.

    Every ``.txt`` file in `source_folder` is read, split into sections with
    `preprocess_minutes`, and stored as one row. The sentences of a section
    are joined with "@@@", and each section also gets a "<name> count" column
    holding its number of sentences. A file that fails is reported and
    skipped.

    Args:
        source_folder: Folder containing the text files.
        output_file: Path of the CSV file to write.

    Returns:
        The DataFrame that was written, with columns "date", "minutes", the
        six section columns and the six count columns.
    """
    section_names = ["Economic Situation", "Foreign Currency", "Financial Markets",
                     "Monetary Policy", "Participant Views", "Government View"]
    columns = ["date", "minutes"] + section_names + [name + " count" for name in section_names]

    # Regular .txt files only, so stray files in the folder (such as a
    # leftover scratch PDF) are never read as minutes.
    txt_files = sorted(
        f for f in listdir(source_folder)
        if isfile(join(source_folder, f)) and f.lower().endswith(".txt")
    )

    # Collect the rows first and build the DataFrame once at the end.
    rows = []
    for txt_file in txt_files :
        try :
            with open(join(source_folder, txt_file), "r", encoding = "utf-8") as f :
                txt = f.read()
            _, section_texts = preprocess_minutes(txt)

            rows.append(
                [txt_file.split("_")[0], txt]
                + ["@@@".join(sentences) for sentences in section_texts]
                + [len(sentences) for sentences in section_texts]
            )
        except Exception as e :
            # Best effort: name the failing file and continue with the rest.
            print("오류", txt_file, e)

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_file, index = False, encoding = "utf-8", errors = "ignore")
    print("데이터프레임 구조 ", df.shape)

    return df

# Build the per-section DataFrame from the text files in ../data/text/ and save it as CSV.
result = preprocessing(source_folder="../data/text/", output_file="../data/minutes_contents_df.csv")

In [None]:
# Print a summary of the result DataFrame.
result.info()

In [None]:
# Add a parsed date column to the contents DataFrame.
# The "date" column is derived from the text file name, so strip all
# whitespace and the literal ".txt" suffix before capturing the trailing
# "(YYYY.M.D)" stamp. regex=False is explicit so that "." is never treated
# as a wildcard, whatever the default of the installed pandas version.
file_stem = (
    result["date"]
    .str.replace(r"\s+", "", regex=True)
    .str.replace(".txt", "", regex=False)
)
result["num_date"] = file_stem.str.extract(r"\((\d{4}\.\d{1,2}\.\d{1,2})\.?\)$", expand=False)
result["num_date"] = pd.to_datetime(result["num_date"], format="%Y.%m.%d")
# Keep only section 2 and section 3.
min_result = result.reindex(columns=["num_date", "date", "Foreign Currency", "Financial Markets"])
min_result

## DataFrame 병합

In [None]:
# Join the section texts with the post metadata on the meeting date.
# Rows without a parsed date are removed from both sides first: pandas
# matches null keys to each other, which would pair every undated section
# row with every undated title.
sections_dated = min_result.loc[min_result["num_date"].notna(),
                                ["num_date", "Foreign Currency", "Financial Markets"]]
titles_dated = df.loc[df["date"].notna(), ["date", "title"]]
result_df = pd.merge(sections_dated, titles_dated, left_on = "num_date", right_on = "date", how = "inner")
result_df = result_df.reindex(columns=["date", "title", "Foreign Currency", "Financial Markets"])
# Concatenate the two section texts; an entry made of the bare separator
# means both sections were empty.
result_df["contents"] = result_df[["Foreign Currency", "Financial Markets"]].apply(lambda x: "@@@".join(x), axis=1)
result_df["contents"] = result_df["contents"].apply(lambda x: "" if x == "@@@" else x)
result_df

In [None]:
# Keep date, title and contents, rendering the date as a "YYYY.MM.DD" string.
final_df = (
    result_df
    .reindex(columns = ["date", "title", "contents"])
    .assign(date = lambda frame: pd.to_datetime(frame["date"]).dt.strftime("%Y.%m.%d"))
)
final_df

In [None]:
# Write the final table to CSV without the index column.
# NOTE(review): the file name spells "secion" (likely meant "section"); it is
# left unchanged because other code may read this exact path — confirm.
final_df.to_csv("../data/minutes_separation_secion.csv", index = False)