# 뉴스 수집 -> 번역 -> 감정분석 

중간에 전처리 과정이 들어가면 좋을 것 같기도 합니다.

In [None]:
# pip install googletrans==4.0.0-rc1

In [1]:
#!/usr/bin/env python3
import urllib.request
import urllib.parse
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import html
import config

# 하루에 몇 건만 가져올지 설정
MAX_NEWS = 50

def crawl_links_and_dates_titles_for_keyword(keyword: str, max_news=None, timeout: float = 10.0):
    """
    Query the Naver News Open API and collect, for each matching article,
      - its link (``link``)
      - its publication date (``pubDate``)
      - its title (``title``, with ``<b>`` tags removed and HTML entities unescaped)

    Only items whose link contains ``news.naver.com`` are kept.

    Args:
        keyword: Search term; it is URL-quoted before being placed in the query.
        max_news: Maximum number of articles to collect. ``None`` (the default)
            falls back to the module-level ``MAX_NEWS``.
        timeout: Seconds to wait for each API response before ``urlopen``
            gives up.

    Returns:
        A ``(links, dates, titles)`` tuple of three equally long lists, in the
        order the API returned the items.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request
        (for example ``urllib.error.URLError``) is propagated unchanged.
    """
    limit = MAX_NEWS if max_news is None else max_news
    query = urllib.parse.quote(keyword)
    display = 100
    start = 1
    links, dates, titles = [], [], []

    while len(links) < limit:
        api_url = (
            f"https://openapi.naver.com/v1/search/news?"
            f"query={query}&display={display}&start={start}&sort=sim"
        )
        req = urllib.request.Request(api_url)
        req.add_header("X-Naver-Client-Id",     config.NAVER_CLIENT_ID)
        req.add_header("X-Naver-Client-Secret", config.NAVER_CLIENT_SECRET)

        with urllib.request.urlopen(req, timeout=timeout) as res:
            if res.getcode() != 200:
                break
            data = json.loads(res.read().decode("utf-8"))

        items = data.get("items", [])
        if not items:
            # An empty page means the result set is exhausted; asking for
            # later pages would only spend more API calls for nothing.
            break

        for item in items:
            link = item.get("link", "")
            # (Optional) remove this check to keep articles from every domain.
            if "news.naver.com" not in link:
                continue
            links.append(link)
            dates.append(item.get("pubDate", ""))
            # Titles come back with <b>…</b> highlight tags and HTML entities.
            titles.append(html.unescape(re.sub(r"</?b>", "", item.get("title", ""))))
            if len(links) >= limit:
                break

        start += display
        # Hard cap on the paging offset used by this loop.
        if start > 1000:
            break

    return links, dates, titles

def fetch_content(link: str) -> str:
    """
    Download one news page and return the text found inside
    <div id="newsct_article">, with <script> and <style> elements removed.

    This is best-effort: any failure (request error, bad status, parsing
    problem) or a page without that <div> yields an empty string.
    """
    try:
        response = requests.get(link, timeout=5)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")
        body = page.find("div", id="newsct_article")
        if body:
            # Drop non-visible elements before extracting the text.
            for unwanted in body.find_all(["script", "style"]):
                unwanted.decompose()
            return body.get_text(strip=True)
        return ""
    except Exception:
        return ""

def crawl_news_for_keyword(keyword: str) -> pd.DataFrame:
    """
    Search for `keyword`, fetch the body of every returned link, and return
    the result as a DataFrame with one row per article and the columns
    link, pubDate, title and content.
    """
    links, dates, titles = crawl_links_and_dates_titles_for_keyword(keyword)
    rows = [
        {
            "link":    url,
            "pubDate": published,
            "title":   headline,
            "content": fetch_content(url),
        }
        for url, published, headline in zip(links, dates, titles)
    ]
    return pd.DataFrame(rows)

if __name__ == "__main__":
    # Crawl news for the keyword "애플" (Apple) and keep it in the `apple` DataFrame.
    apple = crawl_news_for_keyword("애플")
    print(apple)
    # Save the crawl result to CSV (without the index column) for later steps.
    apple.to_csv("apple_naver_news.csv", index=False)

                                                 link  \
0   https://n.news.naver.com/mnews/article/448/000...   
1   https://n.news.naver.com/mnews/article/009/000...   
2   https://n.news.naver.com/mnews/article/015/000...   
3   https://n.news.naver.com/mnews/article/215/000...   
4   https://n.news.naver.com/mnews/article/277/000...   
5   https://n.news.naver.com/mnews/article/003/001...   
6   https://n.news.naver.com/mnews/article/001/001...   
7   https://n.news.naver.com/mnews/article/056/001...   
8   https://n.news.naver.com/mnews/article/055/000...   
9   https://n.news.naver.com/mnews/article/023/000...   
10  https://n.news.naver.com/mnews/article/005/000...   
11  https://n.news.naver.com/mnews/article/011/000...   
12  https://n.news.naver.com/mnews/article/009/000...   
13  https://n.news.naver.com/mnews/article/277/000...   
14  https://n.news.naver.com/mnews/article/032/000...   
15  https://n.news.naver.com/mnews/article/008/000...   
16  https://n.news.naver.com/mn

In [2]:
# Preview the first rows of the crawled DataFrame.
apple.head()

Unnamed: 0,link,pubDate,title,content
0,https://n.news.naver.com/mnews/article/448/000...,"Tue, 27 May 2025 17:08:00 +0900","트럼프 ""애플도 관세 25%""…중동순방 동행 거절한 팀 쿡에 '뒤끝'",도널드 트럼프 미국 대통령 /REUTERS=연합뉴스팀 쿡 애플 최고경영자(CEO)가...
1,https://n.news.naver.com/mnews/article/009/000...,"Tue, 27 May 2025 13:46:00 +0900",트럼프 러브콜에 팀쿡은 ‘거절’…애플이 미운털 박힌 사연은,지난 2019년 도널드 트럼프 미국 대통령(오른쪽)과 팀 쿡 애플 CEO가 텍사스의...
2,https://n.news.naver.com/mnews/article/015/000...,"Tue, 27 May 2025 07:29:00 +0900","""팀 쿡은 여기 없네""…트럼프에 '미운 털' 박힌 애플 '주르륵'","트럼프 미움 산 '애플' 팀 쿡…주가 올들어 20% 급락""지난 8년간 트럼프 사랑 ..."
3,https://n.news.naver.com/mnews/article/215/000...,"Tue, 27 May 2025 07:29:00 +0900",트럼프에게 미움 사면 안되는데…애플 '어쩌나',최근 도널드 트럼프 미국 대통령이 팀 쿡 애플 최고경영자(CEO)에게 중동 순방길 ...
4,https://n.news.naver.com/mnews/article/277/000...,"Tue, 27 May 2025 08:55:00 +0900","트럼프, 애플 괴롭히는 이유…""팀 쿡이 중동 안 따라 갔대""","NYT ""팀 쿡, 중동 순방 동행 거절""""트럼프, 화나서 비난 메시지 내""""애플, ..."


In [3]:
from googletrans import Translator

# Single googletrans client shared by every translate_text() call.
translator = Translator()

def translate_text(korean):
    """
    Translate Korean text to English with the module-level `translator`.

    Returns "" when the input is not a string, is empty or whitespace-only,
    or when the translation call raises.
    """
    if isinstance(korean, str) and korean.strip():
        try:
            translated = translator.translate(korean, src='ko', dest='en')
            return translated.text
        except Exception:
            # Best-effort: a failed translation is recorded as an empty string.
            return ""
    return ""

# Translate every article body via pandas' apply and store it in a new column.
apple["content_en"] = apple["content"].apply(translate_text)

# Rewrite the crawl CSV so it also contains the content_en column.
apple.to_csv("apple_naver_news.csv", index=False)

In [4]:
#!/usr/bin/env python3

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1. Load the FinBERT model
MODEL = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# return_all_scores=True: the score-extraction loop below reads a score for
# each label of every input, so all labels are requested here.
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True)

# 2. Load the crawled and translated articles.
# The list of texts to score is built in step 3 (after truncation), so it is
# not computed a second time here.
df = pd.read_csv("apple_naver_news.csv")

# 3. 감정 분석 수행
def truncate_text(text, max_length=512):
    """
    Return the leading `max_length` items of `text` (characters for a str).

    The cut is made by position in the string, not by model tokens.
    """
    head = text[0:max_length]
    return head

# Clip each translated article, then run FinBERT over the whole list in batches of 8.
clipped = df["content_en"].fillna("").apply(truncate_text)
texts = clipped.tolist()
results = finbert(texts, batch_size=8)

# 4. Extract the scores and add them as columns.
# One {lower-cased label: score} mapping per article; a label that is absent
# from the model output is recorded as 0.0.
label_scores = [
    {entry["label"].lower(): entry["score"] for entry in per_text}
    for per_text in results
]
pos_scores = [scores.get("positive", 0.0) for scores in label_scores]
neu_scores = [scores.get("neutral", 0.0) for scores in label_scores]
neg_scores = [scores.get("negative", 0.0) for scores in label_scores]

df["finbert_positive"] = pos_scores
df["finbert_neutral"] = neu_scores
df["finbert_negative"] = neg_scores

# 5. Save the results.
output_csv = "apple_finbert_sentiment.csv"
df.to_csv(output_csv, index=False)
print(f"✅ FinBERT 감정분석 결과를 '{output_csv}'에 저장했습니다.")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


✅ FinBERT 감정분석 결과를 'apple_finbert_sentiment.csv'에 저장했습니다.


In [5]:
# 6. Check the results: print the three score columns for the first 20 rows.
print(df[["finbert_positive", "finbert_neutral", "finbert_negative"]].head(20))

    finbert_positive  finbert_neutral  finbert_negative
0           0.000026         0.980943          0.019031
1           0.000028         0.891751          0.108221
2           0.000092         0.018345          0.981563
3           0.000321         0.740987          0.258691
4           0.000005         0.999855          0.000140
5           0.003147         0.979716          0.017137
6           0.000363         0.869794          0.129843
7           0.000076         0.951785          0.048139
8           0.000065         0.989451          0.010484
9           0.000257         0.099182          0.900561
10          0.000170         0.287850          0.711980
11          0.000004         0.000170          0.999826
12          0.000017         0.955409          0.044574
13          0.000111         0.861230          0.138659
14          0.000002         0.999533          0.000465
15          0.000302         0.996300          0.003398
16          0.000036         0.999373          0

In [1]:
# 제목만 감정분석한 것.
#!/usr/bin/env python3

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# 1. Load the FinBERT model
MODEL = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# return_all_scores=True: the score-extraction loop below reads a score for
# each label of every input, so all labels are requested here.
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True)

# 2. Load the data; only the "title" column is scored in this cell.
df = pd.read_csv("apple_google_rss_1year.csv")
titles_filled = df["title"].fillna("")
texts = titles_filled.tolist()

# 3. Run the sentiment analysis in batches of 8.
results = finbert(texts, batch_size=8)

# 4. Extract the scores and add them as columns.
# One {lower-cased label: score} mapping per title; a label that is absent
# from the model output is recorded as 0.0.
label_scores = [
    {entry["label"].lower(): entry["score"] for entry in per_text}
    for per_text in results
]
pos_scores = [scores.get("positive", 0.0) for scores in label_scores]
neu_scores = [scores.get("neutral", 0.0) for scores in label_scores]
neg_scores = [scores.get("negative", 0.0) for scores in label_scores]

df["finbert_positive"] = pos_scores
df["finbert_neutral"] = neu_scores
df["finbert_negative"] = neg_scores

# 5. Save the results.
# NOTE(review): this is the same path the article-body sentiment cell writes
# to, so whichever cell runs last replaces the other's file — confirm intended.
output_csv = "apple_finbert_sentiment.csv"
df.to_csv(output_csv, index=False)
print(f"✅ FinBERT 감정분석 결과를 '{output_csv}'에 저장했습니다.")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


✅ FinBERT 감정분석 결과를 'apple_finbert_sentiment.csv'에 저장했습니다.
