In [2]:
import os
import pandas as pd
import io
import requests

# ==========================================
# 1. Configuration
# Directory that is scanned for the tokenized ".txt" files to be scored.
SOURCE_DIR = "BOK_Tokenized"
# Raw CSV of the polarity lexicon; it is downloaded over HTTP and parsed with pandas.
RAW_URL = "https://raw.githubusercontent.com/entelecheia/eKoNLPy/refs/heads/master/src/ekonlpy/data/lexicon/mpko/mp_polarity_lexicon_mkt.csv"
# ==========================================

print(">>> 점수 계산 중.\n")

# 1. Fetch the lexicon.
# On any download/parse failure df_dict falls back to an empty DataFrame so the
# rest of the script can still run (with an empty lexicon).
try:
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(RAW_URL, timeout=30)
    response.raise_for_status()
    csv_data = response.content.decode('utf-8')
    df_dict = pd.read_csv(io.StringIO(csv_data))
except (requests.RequestException, ValueError) as e:
    # RequestException: connection, timeout and HTTP-status errors.
    # ValueError: UnicodeDecodeError and pandas' ParserError/EmptyDataError.
    print(f"다운로드 실패: {e}")
    df_dict = pd.DataFrame()
else:
    print(f"사전 로딩 성공. ({len(df_dict)}개 단어)")

# 2. Build the in-memory lexicon: term -> +1 (positive score) / -1 (negative score).
# Rows whose score is exactly 0 (or missing) are left out of the dictionary.
sentiment_dict = {}

if not df_dict.empty:
    # The column names are probed because the CSV may use either naming scheme.
    word_col = 'word' if 'word' in df_dict.columns else 'ngram'
    score_col = 'polarity' if 'polarity' in df_dict.columns else 'score'

    # Walk the two columns directly instead of iterrows(): no per-row Series is
    # built and values keep their column dtype.
    for raw_term, score in zip(df_dict[word_col], df_dict[score_col]):
        # A missing term would otherwise be stored under the literal key "nan".
        if pd.isna(raw_term):
            continue

        term = str(raw_term)  # keep the term as stored, tags included

        if score > 0:
            sentiment_dict[term] = 1
        elif score < 0:
            sentiment_dict[term] = -1

print(f"사전 준비 완료: {len(sentiment_dict)}개")


# =======================================================
# Max Match 점수 계산 함수
# =======================================================
def calculate_tone_clean(tokens, lexicon, max_n=5):
    """Count lexicon hits in *tokens* using longest-n-gram-first matching.

    Window sizes are tried from ``max_n`` down to 1. For each size the token
    list is scanned left to right; a window is looked up in *lexicon* under
    the key formed by joining its tokens with ``";"``. Once a window matches,
    its positions are marked as consumed and cannot take part in any later
    (shorter or overlapping) match.

    Args:
        tokens: Sequence of token strings.
        lexicon: Mapping from ``";"``-joined n-gram to a polarity value.
            A value equal to 1 is counted as hawkish, -1 as dovish; any other
            value still consumes the positions but is not counted.
        max_n: Largest n-gram size to try.

    Returns:
        Tuple ``(hawk_count, dove_count, matched_tokens)`` where
        ``matched_tokens`` lists the counted n-gram keys in match order.
    """
    total_tokens = len(tokens)
    consumed = [False] * total_tokens
    hawks = 0
    doves = 0
    hits = []

    for width in range(max_n, 0, -1):
        last_start = total_tokens - width
        for start in range(last_start + 1):
            stop = start + width

            # Skip windows that overlap an earlier match.
            if True in consumed[start:stop]:
                continue

            key = ";".join(tokens[start:stop])
            if key not in lexicon:
                continue

            label = lexicon[key]
            if label == 1:
                hawks += 1
                hits.append(key)
            elif label == -1:
                doves += 1
                hits.append(key)

            # Positions are consumed for every lexicon hit, counted or not.
            consumed[start:stop] = [True] * width

    return hawks, doves, hits

# =======================================================
# 4. Analyse every token file and save the scores to Excel
# =======================================================
results = []
if os.path.isdir(SOURCE_DIR):
    # Only ".txt" files (any letter case) are scored; sorting keeps the
    # processing order stable between runs.
    files = sorted(
        name for name in os.listdir(SOURCE_DIR) if name.lower().endswith(".txt")
    )

    for count, filename in enumerate(files, start=1):
        # splitext removes the extension whatever its case, and never touches
        # a ".txt" that happens to appear in the middle of the name.
        date_str = os.path.splitext(filename)[0]
        if len(date_str) == 8 and date_str.isdigit():
            # YYYYMMDD -> YYYY-MM-DD
            date_formatted = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
        else:
            # Not a date-shaped name: keep it as-is rather than mangling it.
            date_formatted = date_str

        with open(os.path.join(SOURCE_DIR, filename), "r", encoding="utf-8") as f:
            content = f.read()

        # Tokens are comma-separated. Strip each one so a trailing newline or
        # stray space does not prevent an exact lexicon match; drop empties.
        raw_tokens = [t.strip() for t in content.split(",") if t.strip()]

        h_cnt, d_cnt, evidence = calculate_tone_clean(raw_tokens, sentiment_dict, max_n=5)

        # Net tone in [-1, 1]; 0 when the file has no lexicon hits at all.
        total = h_cnt + d_cnt
        tone_score = (h_cnt - d_cnt) / total if total > 0 else 0

        results.append({
            "날짜": date_formatted,
            "매파_단어": h_cnt,
            "비둘기_단어": d_cnt,
            "어조점수": tone_score,
            "근거_단어": " | ".join(evidence),
        })

        # Progress indicator: one dot per 10 processed files.
        if count % 10 == 0:
            print(".", end="", flush=True)

    # Save to Excel. Explicit columns keep sort_values working even when no
    # file was processed (an empty DataFrame would otherwise lack "날짜").
    df = pd.DataFrame(
        results,
        columns=["날짜", "매파_단어", "비둘기_단어", "어조점수", "근거_단어"],
    )
    df = df.sort_values(by="날짜")

    save_name = "BOK 회의록 어조 점수.xlsx"

    try:
        df.to_excel(save_name, index=False, engine='openpyxl')
    except (ImportError, OSError, ValueError) as exc:
        # Report the failure (missing openpyxl, file locked, bad path, ...)
        # instead of finishing silently with no output file.
        print(f"\n\n>>> 저장 실패: '{save_name}' ({exc})")
    else:
        print(f"\n\n>>> 완료. '{save_name}' 파일이 생성되었습니다.")
else:
    print("폴더가 없습니다!")

>>> 점수 계산 중.

사전 로딩 성공. (38786개 단어)
사전 준비 완료: 38783개
...............

>>> 완료. 'BOK 회의록 어조 점수.xlsx' 파일이 생성되었습니다.
