In [None]:
import os
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# 0. File path configuration
#    - input_data_file:      CSV read by the data-loading cell (needs 'cleaned_text_lemma' and 'doc_id' columns)
#    - market_json_file:     JSON lexicon loaded by the market-approach cell
#    - dictionary_json_file: JSON lexicon loaded by the dictionary-approach cell
#    - output_file:          CSV the document-level result is written to
input_data_file = "df_master_TM_proj_lemma.csv"
market_json_file = "market_lexicon.json"
dictionary_json_file = "sentiment_lexicon.json"
output_file = "final_tone_analysis_result.csv"

In [27]:
# 1. Load the data
print(">>> 1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Ï§ë...")

# Read with the Python parser engine; malformed lines are skipped instead of
# raising. Rows missing either the lemmatized text or the document id are
# then dropped, since both are required by the later cells.
df = (
    pd.read_csv(input_data_file, engine='python', on_bad_lines='skip')
    .dropna(subset=['cleaned_text_lemma', 'doc_id'])
)

# Report how many rows remain for analysis.
print(f"   - Î∂ÑÏÑù ÎåÄÏÉÅ Î¨∏Ïû•(Ìñâ) Í∞úÏàò: {len(df)}Í∞ú")


>>> 1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Ï§ë...
   - Î∂ÑÏÑù ÎåÄÏÉÅ Î¨∏Ïû•(Ìñâ) Í∞úÏàò: 171788Í∞ú


In [28]:
# 2. Sentence tone function (shared by the market and dictionary approaches)

def calculate_sentence_tone(df_input, vocab, hawkish_w, dovish_w):
    """Compute a weighted-lexicon tone score for every row of text.

    Parameters
    ----------
    df_input : DataFrame-like
        Must expose a 'cleaned_text_lemma' column of text; one score is
        produced per row.
    vocab : sequence of str
        Fixed vocabulary handed to ``CountVectorizer`` (1- to 5-grams).
    hawkish_w, dovish_w : 1-D array-like
        Per-term weights, aligned position-by-position with ``vocab``.

    Returns
    -------
    numpy.ndarray of float
        ``(H - D) / (H + D)`` per row, where ``H`` and ``D`` are the
        count-weighted hawkish and dovish sums. Rows where the ratio is
        undefined (``H + D == 0``) get 0.0.
    """
    # 1. Vectorize: per-row counts of each vocabulary term
    vectorizer = CountVectorizer(vocabulary=vocab, ngram_range=(1, 5))
    dtm = vectorizer.transform(df_input['cleaned_text_lemma'])

    # 2. Per-row weighted sums (count * weight)
    sent_h_score = np.asarray(dtm.dot(hawkish_w), dtype=float)
    sent_d_score = np.asarray(dtm.dot(dovish_w), dtype=float)
    total_score = sent_h_score + sent_d_score

    # 3. Tone ratio; division by zero is expected for rows with no lexicon hits
    with np.errstate(divide='ignore', invalid='ignore'):
        sent_tones = (sent_h_score - sent_d_score) / total_score

    # Map every undefined ratio to 0.0. The previous np.nan_to_num(x, 0) call
    # passed 0 as `copy`, not as the fill value: NaN still became 0, but +/-inf
    # (zero denominator with a non-zero numerator, e.g. negative weights) was
    # turned into +/-1.8e308 instead of a neutral score.
    return np.where(np.isfinite(sent_tones), sent_tones, 0.0)

In [29]:
# 3. [Market approach] sentence-level tone and label
#    (the per-document aggregation happens in a later cell)

print("\n>>> 3. ÏãúÏû• Ï†ëÍ∑ºÎ≤ï(Market Approach) Í≥ÑÏÇ∞ ÏãúÏûë...")

if not os.path.exists(market_json_file):
    # Lexicon file is missing: warn and fill both columns with 0.
    print("   ‚ö†Ô∏è ÏãúÏû• Ï†ëÍ∑ºÎ≤ï ÏÇ¨Ï†Ñ ÌååÏùºÏù¥ ÏóÜÏñ¥ 0ÏúºÎ°ú Ï≤òÎ¶¨Ìï©ÎãàÎã§.")
    df['Sent_Tone_Market'] = 0
    df['Sent_Label_Market'] = 0

else:
    # (1) Load the lexicon JSON; each key is a term and each value is read
    #     below through its 'polarity' and 'score' entries.
    with open(market_json_file, "r", encoding="utf-8") as lexicon_file:
        market_lexicon = json.load(lexicon_file)

    # (2) Fix the term order once, via a throwaway vectorizer, so the weight
    #     arrays built below line up with the columns used for scoring.
    m_vocab_raw = list(market_lexicon.keys())
    temp_vec = CountVectorizer(vocabulary=m_vocab_raw, ngram_range=(1, 5))
    feature_names = temp_vec.get_feature_names_out()

    # One hawkish and one dovish weight per term; a term contributes its score
    # to the side named by its polarity and 0 to the other. Unknown or missing
    # polarity contributes 0 to both; a missing score counts as 0.
    m_h_weights = []
    m_d_weights = []
    for term in feature_names:
        entry = market_lexicon.get(term, {})
        polarity = entry.get('polarity')
        weight = entry.get('score', 0)
        m_h_weights.append(weight if polarity == 'hawkish' else 0)
        m_d_weights.append(weight if polarity == 'dovish' else 0)

    # (3) Score every sentence, passing feature_names as the vocabulary so
    #     terms and weights share the same order.
    hawkish_weights = np.array(m_h_weights)
    dovish_weights = np.array(m_d_weights)
    df['Sent_Tone_Market'] = calculate_sentence_tone(
        df, feature_names, hawkish_weights, dovish_weights
    )

    # (4) Sentence label = sign of the tone (1: hawkish, -1: dovish, 0: neutral)
    df['Sent_Label_Market'] = np.sign(df['Sent_Tone_Market'])

    print("   ‚úÖ ÏãúÏû• Ï†ëÍ∑ºÎ≤ï Î¨∏Ïû• Î∂ÑÏÑù ÏôÑÎ£å")


>>> 3. ÏãúÏû• Ï†ëÍ∑ºÎ≤ï(Market Approach) Í≥ÑÏÇ∞ ÏãúÏûë...
   ‚úÖ ÏãúÏû• Ï†ëÍ∑ºÎ≤ï Î¨∏Ïû• Î∂ÑÏÑù ÏôÑÎ£å


In [32]:
# 4. [Dictionary approach] sentence-level tone and label
print("\n>>> 4. ÏÇ¨Ï†Ñ Ï†ëÍ∑ºÎ≤ï(Dictionary Approach) Í≥ÑÏÇ∞ ÏãúÏûë...")

if not os.path.exists(dictionary_json_file):
    # Lexicon file is missing: warn and fill both columns with 0.
    print(f"   ‚ö†Ô∏è '{dictionary_json_file}' ÌååÏùºÏù¥ ÏóÜÏñ¥ ÏÇ¨Ï†Ñ Ï†ëÍ∑ºÎ≤ïÏùÄ 0ÏúºÎ°ú Ï≤òÎ¶¨Ìï©ÎãàÎã§.")
    df['Sent_Tone_Dict'] = 0
    df['Sent_Label_Dict'] = 0

else:
    # (1) Load the lexicon JSON (same loading pattern as the market approach)
    with open(dictionary_json_file, "r", encoding="utf-8") as lexicon_file:
        dict_map = json.load(lexicon_file)

    print(f"   - ÏÇ¨Ï†Ñ Î°úÎìú ÏôÑÎ£å! Ìè¨Ìï®Îêú Îã®Ïñ¥ Ïàò: {len(dict_map)}Í∞ú")

    # (2) Build the weight arrays.
    #     Assumes the JSON looks like {"word": {"polarity": "hawkish", "score": 1.5}, ...}
    #     NOTE(review): if the file uses another key (e.g. 'label') for the
    #     polarity, the lookup below has to be adjusted - confirm against the data.
    d_vocab_raw = list(dict_map.keys())

    # Throwaway vectorizer used only to fix the term order
    temp_vec_d = CountVectorizer(vocabulary=d_vocab_raw, ngram_range=(1, 5))
    feat_d = temp_vec_d.get_feature_names_out()

    d_h_weights = []
    d_d_weights = []
    for term in feat_d:
        entry = dict_map.get(term, {})
        polarity_lc = entry.get('polarity', '').lower()
        weight = entry.get('score', 1.0)   # missing score defaults to 1.0

        # Case-insensitive substring match; 'hawkish' is tested first, so it
        # wins if a polarity string happens to contain both words.
        is_hawkish = 'hawkish' in polarity_lc
        is_dovish = (not is_hawkish) and ('dovish' in polarity_lc)

        d_h_weights.append(weight if is_hawkish else 0)
        d_d_weights.append(weight if is_dovish else 0)

    # (3) Score every sentence with the shared calculate_sentence_tone function,
    #     then label each sentence by the sign of its tone.
    hawkish_weights_d = np.array(d_h_weights)
    dovish_weights_d = np.array(d_d_weights)
    df['Sent_Tone_Dict'] = calculate_sentence_tone(
        df, feat_d, hawkish_weights_d, dovish_weights_d
    )
    df['Sent_Label_Dict'] = np.sign(df['Sent_Tone_Dict'])
    print("   ‚úÖ ÏÇ¨Ï†Ñ Ï†ëÍ∑ºÎ≤ï Î¨∏Ïû• Î∂ÑÏÑù ÏôÑÎ£å")


>>> 4. ÏÇ¨Ï†Ñ Ï†ëÍ∑ºÎ≤ï(Dictionary Approach) Í≥ÑÏÇ∞ ÏãúÏûë...
   - ÏÇ¨Ï†Ñ Î°úÎìú ÏôÑÎ£å! Ìè¨Ìï®Îêú Îã®Ïñ¥ Ïàò: 15000Í∞ú
   ‚úÖ ÏÇ¨Ï†Ñ Ï†ëÍ∑ºÎ≤ï Î¨∏Ïû• Î∂ÑÏÑù ÏôÑÎ£å


In [34]:
# 5. Aggregate to document level and save

# Progress message for the document-level aggregation step.
print("\n>>> 5. Î¨∏ÏÑúÎ≥Ñ ÏµúÏ¢Ö ÌÜ§ ÏßëÍ≥Ñ Ï§ë...")

# Aggregation function:
# (hawkish sentence count - dovish sentence count) / (hawkish sentence count + dovish sentence count)
def aggregate_tone(label_series):
    """Collapse a group of sentence labels into one net-tone ratio.

    Positive labels are counted as hawkish and negative labels as dovish;
    zero labels are ignored. Returns ``(hawkish - dovish) / (hawkish + dovish)``,
    or 0 when the group contains no positive or negative label at all.
    """
    hawkish_n = (label_series > 0).sum()
    dovish_n = (label_series < 0).sum()
    polar_n = hawkish_n + dovish_n

    # Guard against dividing by zero when every label is neutral (or the group is empty).
    return (hawkish_n - dovish_n) / polar_n if polar_n != 0 else 0

# Group the sentences by doc_id and reduce each label column to a single
# document-level tone with aggregate_tone; the output columns are named
# directly through named aggregation.
final_df = (
    df.groupby('doc_id')
    .agg(
        Market_Tone=('Sent_Label_Market', aggregate_tone),
        Dictionary_Tone=('Sent_Label_Dict', aggregate_tone),
    )
    .reset_index()
)

# Save the document-level result (without the index column)
final_df.to_csv(output_file, index=False)

# Completion banner followed by a preview of the first rows
print(f"\n=======================================================")
print(f"üéâ Î∂ÑÏÑù ÏôÑÎ£å! Í≤∞Í≥º ÌååÏùº: {output_file}")
print(f"=======================================================")
print(final_df.head())


>>> 5. Î¨∏ÏÑúÎ≥Ñ ÏµúÏ¢Ö ÌÜ§ ÏßëÍ≥Ñ Ï§ë...

üéâ Î∂ÑÏÑù ÏôÑÎ£å! Í≤∞Í≥º ÌååÏùº: final_tone_analysis_result.csv
          doc_id  Market_Tone  Dictionary_Tone
0  FOMC_20130918     0.392405        -0.975904
1  FOMC_20131030     0.466667        -0.914062
2  FOMC_20131218     0.328358        -0.842294
3  FOMC_20140129     0.596730        -0.907455
4  FOMC_20140319     0.455224        -0.839416
