In [None]:
# 🔹 Required library imports
import os
import time
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import schedule
from bs4 import BeautifulSoup

# 🔹 Directory where the result CSV is written.
# NOTE(review): this is a machine-specific absolute Windows path; run_job()
# creates it with os.makedirs(..., exist_ok=True) before saving.
SAVE_DIR = r"C:\Users\SMHRD\Desktop\실전\ESC\fastAPI\crewling\date"

# 🔹 News classification (employment / industrial accident)
def assign_label(row):
    """Classify one news row as '산재', '고용' or '기타' by keyword match.

    The title and content are joined with a space and searched for substring
    keywords. Industrial-accident keywords are checked first, so a row that
    matches both keyword groups is labelled '산재'.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing
            'title' and 'content' entries.

    Returns:
        '산재' (industrial accident), '고용' (employment) or '기타' (other).

    Raises:
        KeyError: If 'title' or 'content' is missing from ``row``.
    """
    accident_keywords = ('산재', '산업재해', '직업병', '요양', '공상', '업무상질병', '근골격계')
    employment_keywords = ('고용', '노동', '근로', '일자리', '임금', '해고', '체불', '피해', '계약', '부당', '신고')

    def _as_text(value):
        # Missing cells arrive as None or as float NaN. NaN is truthy, so the
        # usual `value or ''` idiom lets it through and the concatenation
        # below would raise TypeError; treat every non-string as empty text.
        return value if isinstance(value, str) else ''

    text = _as_text(row['title']) + ' ' + _as_text(row['content'])

    if any(keyword in text for keyword in accident_keywords):
        return '산재'
    if any(keyword in text for keyword in employment_keywords):
        return '고용'
    return '기타'

# 🔹 Top-5 news extraction
def extract_top5(news_sample):
    """Pick up to five representative articles from ``news_sample``.

    Steps:
      1. TF-IDF vectorize the 'content' column and cluster it with KMeans
         (at most 20 clusters, never more than the number of rows).
      2. For every cluster keep the article whose vector is closest
         (Euclidean distance) to the cluster centroid.
      3. TF-IDF vectorize the titles of those representatives and rank them
         by the sum of their cosine similarities to all representatives.
      4. Return the five highest-ranked rows.

    Args:
        news_sample: DataFrame with at least 'title' and 'content' columns.
            It is not modified.

    Returns:
        DataFrame with at most five rows of ``news_sample`` plus a 'cluster'
        column holding the KMeans label. An empty DataFrame is returned for
        empty input.
    """
    if news_sample.empty:
        return pd.DataFrame()

    # Work on a copy so the caller's frame does not silently gain a
    # 'cluster' column.
    news_sample = news_sample.copy()

    # Missing content (NaN/None) would make the vectorizer raise, so treat it
    # as an empty document.
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(news_sample['content'].fillna(''))

    kmeans = KMeans(n_clusters=min(20, len(news_sample)), random_state=42)
    labels = kmeans.fit_predict(X)
    news_sample['cluster'] = labels

    centroids = kmeans.cluster_centers_
    closest_docs = []
    for i in range(kmeans.n_clusters):
        # Positional row indices of the members of cluster i.
        cluster_indices = np.where(labels == i)[0]
        if len(cluster_indices) == 0:
            continue
        # Densify only this cluster's rows to measure distance to its centroid.
        cluster_vectors = X[cluster_indices].toarray()
        distances = np.linalg.norm(cluster_vectors - centroids[i], axis=1)
        closest_docs.append(cluster_indices[np.argmin(distances)])

    issue_top_df = news_sample.iloc[closest_docs]

    titles = issue_top_df['title'].fillna('').tolist()
    title_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.85, min_df=2)
    try:
        tfidf_matrix = title_vectorizer.fit_transform(titles)
    except ValueError:
        # With very few titles, or titles sharing no term in at least two
        # documents, the min_df/max_df pruning leaves no vocabulary and
        # scikit-learn raises ValueError. Ranking is impossible then, so
        # fall back to the first five cluster representatives.
        return issue_top_df.head(5)

    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    similarity_score = cosine_sim.sum(axis=1)
    # Highest total similarity first.
    top_indices = similarity_score.argsort()[::-1][:5]

    return issue_top_df.iloc[top_indices]

# 🔹 Main job
def _collect_documents(start_date, end_date, max_pages):
    """Fetch keyword-matched news documents page by page.

    Args:
        start_date: Start of the collection window as 'YYYYMMDD'.
        end_date: End of the collection window as 'YYYYMMDD'.
        max_pages: Number of result pages (500 rows each) to request.

    Returns:
        Tuple ``(doc_list, total_count)``: the accumulated document dicts and
        the 'totalCnt' value of the last page that contained an 'item'
        (``None`` if no page did).
    """
    keyword = '((고용 || 근로 || 노동 || 일자리) && (임금 || 해고 || 체불 || 피해 || 계약 || 부당 || 신고)) || (산재 || 산업재해 || 직업병 || 요양 || 공상 || 업무상질병 || 근골격계)'
    urlString = 'http://qt.some.co.kr/TrendMap/JSON/ServiceHandler'

    doc_list = []
    total_count = None
    for pageNum in range(1, max_pages + 1):
        if pageNum > 1:
            # Pause between consecutive requests, also after a failed page.
            time.sleep(2)

        params = {
            'keyword': keyword,
            'startDate': start_date,
            'endDate': end_date,
            # NOTE(review): the trailing space in 'news ' is kept as-is;
            # confirm whether the API expects it.
            'source': 'news ',
            'lang': 'ko',
            'rowPerPage': '500',
            'pageNum': pageNum,
            'orderType': '1',
            'command': 'GetKeywordDocuments'
        }
        try:
            # A timeout keeps a stalled server from blocking the job forever.
            response = requests.post(urlString, data=params, timeout=30)
            response.raise_for_status()
            payload = response.json()  # parse the body once

            if 'item' in payload:
                item = payload['item']
                doc_list += item['documentList']
                total_count = item.get('totalCnt', total_count)
            else:
                print(f"⚠️ 페이지 {pageNum} 응답에 'item' 없음")
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole
            # collection; the failure is reported and the loop moves on.
            print(f"❌ 페이지 {pageNum} 수집 실패: {e}")

    return doc_list, total_count


def _fetch_og_image(url):
    """Return the og:image URL advertised by the page at ``url``, or None."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        og_image = soup.find('meta', property='og:image')
    except Exception as e:
        # Best effort per article: any fetch/parse problem yields no image.
        print(f"❌ URL 에러: {url} ({e})")
        return None

    if og_image and og_image.get('content'):
        return og_image['content']
    return None


def run_job(max_pages=1):
    """Collect recent news, pick 5+5 top articles, save a CSV and notify the server.

    The job covers the last 90 days, labels every article with
    ``assign_label``, drops '기타', samples up to 500 articles per label,
    extracts the top articles per label with ``extract_top5``, looks up each
    article's og:image, writes the result to ``SAVE_DIR`` and posts the file
    path to the local Spring endpoint.

    Args:
        max_pages: Number of 500-row result pages to request. The default of
            1 matches the previous test setting; production runs were noted
            as using up to 30 pages.

    Returns:
        None. The job ends early (after printing a notice) when nothing was
        collected or no article survives the filtering.
    """
    today = datetime.today()
    start_date = (today - timedelta(days=90)).strftime('%Y%m%d')
    end_date = today.strftime('%Y%m%d')

    print(f"📅 자동 설정된 수집 기간: {start_date} ~ {end_date}")

    doc_list, total_count = _collect_documents(start_date, end_date, max_pages)

    print("Total Count:", total_count)
    print("Collect Data Count:", len(doc_list))

    if not doc_list:
        # Selecting columns from an empty frame would raise KeyError.
        print("⚠️ 수집된 문서가 없어 작업을 종료합니다.")
        return

    df = pd.DataFrame(doc_list)[['date', 'writerName', 'title', 'content', 'url', 'vks']]

    # 🔹 Label every article and drop the ones that are neither category
    df['label'] = df.apply(assign_label, axis=1)
    df = df[df['label'] != '기타']

    # 🔹 Split by label
    employment_news = df[df['label'] == '고용'].copy()
    industrial_accident_news = df[df['label'] == '산재'].copy()

    # 🔹 Cap each group at 500 articles (fixed seed for reproducibility)
    employment_news_sample = employment_news.sample(n=min(500, len(employment_news)), random_state=42)
    industrial_accident_news_sample = industrial_accident_news.sample(n=min(500, len(industrial_accident_news)), random_state=42)

    # 🔹 Top articles per label
    top5_employment = extract_top5(employment_news_sample)
    top5_industrial = extract_top5(industrial_accident_news_sample)

    # 🔹 Combine employment + industrial accident (5 + 5)
    top10_df = pd.concat([top5_employment, top5_industrial], ignore_index=True)

    if top10_df.empty:
        # Both groups were empty, so the frame has no columns to select.
        print("⚠️ 분류된 뉴스가 없어 작업을 종료합니다.")
        return

    # 🔹 Keep only the output columns; .copy() makes this an independent
    # frame so adding 'image_url' below does not trigger
    # SettingWithCopyWarning.
    final_df = top10_df[['date', 'writerName', 'title', 'content', 'url']].copy()
    final_df['image_url'] = [_fetch_og_image(url) for url in final_df['url']]

    # 🔹 Save the result
    today_str = today.strftime('%Y%m%d')
    os.makedirs(SAVE_DIR, exist_ok=True)
    final_file_path = os.path.join(SAVE_DIR, f"auto_top10_통합_{today_str}.csv")
    final_df.to_csv(final_file_path, index=False, encoding='utf-8-sig')

    print(f"📁 CSV 저장 완료: {final_file_path}")

    # 🔹 Tell the Spring server where the CSV is
    try:
        response = requests.post(
            "http://localhost:8087/api/news/save-csv",
            json={"filePath": final_file_path},
            timeout=30,
        )
        print(f"🚀 서버 응답: {response.text}")
    except requests.RequestException as e:
        print(f"❌ Spring 서버에 요청 실패: {e}")

# 🔹 Run once now, then schedule the daily run
# Run the job once right away, then register it to repeat daily at 09:00.
run_job()

schedule.every().day.at("09:00").do(run_job)

print("스케줄러가 작동 중입니다... (Ctrl+C로 종료)")

try:
    # Poll once per second; schedule runs the job when its time has come.
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # The banner advertises Ctrl+C as the way to stop, so leave the loop
    # cleanly instead of surfacing a traceback.
    print("스케줄러를 종료합니다.")



📅 자동 설정된 수집 기간: 20250128 ~ 20250428
Total Count: 31502
Collect Data Count: 500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['image_url'] = image_urls


📁 CSV 저장 완료: C:\Users\SMHRD\Desktop\실전\ESC\fastAPI\crewling\date\auto_top10_통합_20250428.csv
🚀 서버 응답: ✅ CSV 데이터가 저장되었습니다!
스케줄러가 작동 중입니다... (Ctrl+C로 종료)
