In [None]:
'http://qt.some.co.kr/TrendMap/JSON/ServiceHandler'

In [29]:
# 🔹 필요 라이브러리 import
import os
import time
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import schedule
from bs4 import BeautifulSoup

# 🔹 저장할 디렉토리
SAVE_DIR = r"C:\Users\SMHRD\Desktop\실전\ESC\fastAPI\crewling\date"

# 🔹 뉴스 분류 함수 (고용/산재)
def assign_label(row):
    content = (row['title'] or '') + ' ' + (row['content'] or '')
    if any(keyword in content for keyword in ['산재', '산업재해', '직업병', '요양', '공상', '업무상질병', '근골격계']):
        return '산재'
    elif any(keyword in content for keyword in ['고용', '노동', '근로', '일자리', '임금', '해고', '체불', '피해', '계약', '부당', '신고']):
        return '고용'
    else:
        return '기타'

# 🔹 Top5 뉴스 추출 함수
def extract_top5(news_sample):
    if news_sample.empty:
        return pd.DataFrame()

    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(news_sample['content'])
    kmeans = KMeans(n_clusters=min(20, len(news_sample)), random_state=42)
    news_sample['cluster'] = kmeans.fit_predict(X)

    centroids = kmeans.cluster_centers_
    closest_docs = []
    for i in range(kmeans.n_clusters):
        cluster_indices = np.where(news_sample['cluster'] == i)[0]
        if len(cluster_indices) == 0:
            continue
        cluster_vectors = X[cluster_indices]
        distances = np.linalg.norm(cluster_vectors - centroids[i], axis=1)
        closest_doc_index = cluster_indices[np.argmin(distances)]
        closest_docs.append(closest_doc_index)

    issue_top_df = news_sample.iloc[closest_docs]

    titles = issue_top_df['title'].fillna('').tolist()
    title_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.85, min_df=2)
    tfidf_matrix = title_vectorizer.fit_transform(titles)
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    similarity_score = cosine_sim.sum(axis=1)
    top_indices = similarity_score.argsort()[::-1][:5]
    auto_top5_df = issue_top_df.iloc[top_indices]

    return auto_top5_df

# 🔹 메인 실행 함수
def run_job():
    today = datetime.today()
    three_months_ago = today - timedelta(days=90)
    start_date = three_months_ago.strftime('%Y%m%d')
    end_date = today.strftime('%Y%m%d')

    print(f"📅 자동 설정된 수집 기간: {start_date} ~ {end_date}")

    keyword = '((고용 || 근로 || 노동 || 일자리) && (임금 || 해고 || 체불 || 피해 || 계약 || 부당 || 신고)) || (산재 || 산업재해 || 직업병 || 요양 || 공상 || 업무상질병 || 근골격계)'
    urlString = '  '
    doc_list = []
    for pageNum in range(1, 2):  # 테스트용 1페이지 (실전은 31로 변경 가능)
        params = {
            'keyword': keyword,
            'startDate': start_date,
            'endDate': end_date,
            'source': 'news ',
            'lang': 'ko',
            'rowPerPage': '500',
            'pageNum': pageNum,
            'orderType': '1',
            'command': 'GetKeywordDocuments'
        }
        try:
            response = requests.post(urlString, data=params)
            response.raise_for_status()

            if 'item' in response.json():
                doc_list += response.json()['item']['documentList']
            else:
                print(f"⚠️ 페이지 {pageNum} 응답에 'item' 없음")
        except Exception as e:
            print(f"❌ 페이지 {pageNum} 수집 실패: {e}")
            continue

        time.sleep(2)

    print("Total Count:", response.json()['item']['totalCnt'])
    print("Collect Data Count:", len(doc_list))

    final_df = pd.DataFrame(doc_list)[['date', 'writerName', 'title', 'content', 'url']]
 # 🔹 이미지 URL 수집
    image_urls = []
    for url in final_df['url']:
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, 'html.parser')

            og_image = soup.find('meta', property='og:image')
            if og_image and og_image.get('content'):
                image_url = og_image['content']
            else:
                image_url = None
        except Exception as e:
            print(f"❌ URL 에러: {url} ({e})")
            image_url = None

        image_urls.append(image_url)

    final_df['image_url'] = image_urls

    # 🔹 최종 저장
    today_str = datetime.today().strftime('%Y%m%d')
    os.makedirs(SAVE_DIR, exist_ok=True)
    final_file_path = os.path.join(SAVE_DIR, f"auto_top10_통합_{today_str}.csv")
    final_df.to_csv(final_file_path, index=False, encoding='utf-8-sig')

    print(f"📁 CSV 저장 완료: {final_file_path}")

    try:
        response = requests.post(
        "http://localhost:8087/api/news/save-csv",
        json={"filePath": final_file_path}
        )
        print(f"🚀 서버 응답: {response.text}")
    except Exception as e:
        print(f"❌ Spring 서버에 요청 실패: {e}")

    # 🔹 실행 + 스케줄 설정
run_job()

schedule.every().day.at("09:00").do(run_job)

print("스케줄러가 작동 중입니다... (Ctrl+C로 종료)")

while True:
    schedule.run_pending()
    time.sleep(1)



📅 자동 설정된 수집 기간: 20250128 ~ 20250428
Total Count: 31502
Collect Data Count: 500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['image_url'] = image_urls


📁 CSV 저장 완료: C:\Users\SMHRD\Desktop\실전\ESC\fastAPI\crewling\date\auto_top10_통합_20250428.csv
🚀 서버 응답: ✅ CSV 데이터가 저장되었습니다!
스케줄러가 작동 중입니다... (Ctrl+C로 종료)


KeyboardInterrupt: 

In [None]:
import os
import re
import time
import schedule
import requests
import pandas as pd
import tomotopy as tp
from kiwipiepy import Kiwi
# from konlpy.tag import Mecab
from collections import Counter
from datetime import datetime, timedelta
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# [수집] → [명사추출] → [토픽모델링] → [군집화] → [대표뉴스/키워드 요약](3개월) -> 여기부터 하루씩 다시 시작하자!!(하루 뉴스데이터)
# → df로 나머지 정보 다시 불러오기 -> df의 유사도 다시 업데이트하기  → [Top10 추출] -> [저장/전송]

In [None]:

# 1. 전역 설정
# mecab = Mecab()
kiwi = Kiwi()
gtr_ymd = datetime.today().strftime('%y%m%d')  # 예: 250429
SAVE_DIR = './date'

def extract_nouns(text):
    result = kiwi.analyze(text)
    nouns = []
    for token, pos, _, _ in result[0][0]:
        if pos.startswith('N') and len(token) > 1:
            nouns.append(token)
    return nouns

# 2. 뉴스 수집 함수
def collect_news():
    today = datetime.today()
    three_months_ago = today - timedelta(days=90)
    start_date = three_months_ago.strftime('%Y%m%d')
    end_date = today.strftime('%Y%m%d')

    print(f"📅 수집 기간: {start_date} ~ {end_date}")

    keyword = '((고용 || 근로 || 노동 || 일자리) && (임금 || 해고 || 체불 || 피해 || 계약 || 부당 || 신고)) || (산재 || 산업재해 || 직업병 || 요양 || 공상 || 업무상질병 || 근골격계)'
    urlString = 'http://qt.some.co.kr/TrendMap/JSON/ServiceHandler'

    doc_list = []
    for pageNum in range(1, 31):  # 테스트는 1, 실전은 31
        params = {
            'keyword': keyword,
            'startDate': start_date,
            'endDate': end_date,
            'source': 'news ',
            'lang': 'ko',
            'rowPerPage': '500',
            'pageNum': pageNum,
            'orderType': '1',
            'command': 'GetKeywordDocuments'
        }
        try:
            response = requests.post(urlString, data=params)
            response.raise_for_status()
            if 'item' in response.json():
                doc_list += response.json()['item']['documentList']
            else:
                print(f"⚠️ 페이지 {pageNum} 응답에 'item' 없음")
        except Exception as e:
            print(f"❌ 페이지 {pageNum} 수집 실패: {e}")
            continue
        time.sleep(2)

    print("총 수집 건수:", len(doc_list))

    df = pd.DataFrame(doc_list)[['date', 'writerName', 'title', 'content', 'url', 'vks']]
    df.fillna('', inplace=True)
    df['sentence'] = df[['title', 'content']].apply(" ".join, axis=1)
    df['nouns'] = df['sentence'].apply(lambda x: ', '.join(extract_nouns(x)))

    return df
# 3. 토픽 모델링 함수
def extract_lda_topic(df):
    _df = df.copy() 
    corpus = df['nouns'].str.split(', ').tolist()

    # 토픽 개수 설정
    if len(corpus) >= 1000: k = 100
    elif len(corpus) >= 500: k = 70
    elif len(corpus) >= 200: k = 40
    else: k = 20

    
    lda_model = tp.LDAModel(tw=tp.TermWeight.PMI, min_df=3, rm_top=0, k=k, seed=572)

    for lis in corpus:
        if lis != []:
            lda_model.add_doc(lis)

    for i in range(0, 200, 4):
        lda_model.train(4, workers=1)

    word_df = pd.DataFrame()
    top_n = 10
    for i in range(lda_model.k):
        topic_words = [keyword for keyword, score in lda_model.get_topic_words(i, top_n=top_n)]
        topic_scores = [score for keyword, score in lda_model.get_topic_words(i, top_n=top_n)]
        topic_seq = list(range(len(topic_words)))
        _word_df = pd.DataFrame(data={'word_seq': topic_seq, 'word': topic_words, 'pmi_score': topic_scores})
        _word_df['topic_id'] = f'topic_{gtr_ymd}_{i}'
        # _word_df['embassy_cd'] = _df['embassy_cd'].unique()[0]
        _word_df = _word_df[['topic_id', 'word_seq', 'word', 'pmi_score']]
        word_df = pd.concat([word_df, _word_df])

    word_df = word_df.reset_index(drop=True)
    word_df = word_df[['topic_id','word_seq','word','pmi_score']]
    word_df['pmi_score'] = word_df.pmi_score.round(5)

    topic_dist = [lda_model.infer((lda_model.docs[i]))[0] for i in range(len(_df))]

    _df['max_topic_num'] = [vec.argmax() for vec in topic_dist]
    _df['max_topic_value'] = [vec.max() for vec in topic_dist] 

    use_index = _df['max_topic_num'].value_counts().index.tolist()

    topic_items = []
    for i in use_index:
        topic_dict = dict()
        topic_words = [keyword for keyword, score in lda_model.get_topic_words(i, top_n=20) if score >= 0.01]
        if len(topic_words) >= 5:
            topic_dict['mdl_index'] = i
            topic_dict['topic_words'] = ', '.join(topic_words[:5])
            topic_items.append(topic_dict)

    topic_df = pd.DataFrame(topic_items)
    if len(topic_df) == 0:
        return word_df, topic_df

    union_dict = dict()
    mdl_index_list = topic_df['mdl_index'].tolist()
    mdl_index_list.reverse()
    count = 0
    for i in mdl_index_list[1:]:
        count += 1
        target_topic_words = topic_df['topic_words'][topic_df['mdl_index'] == i].tolist()[0].split(', ')

        for j in topic_df['mdl_index'].tolist()[-count:]:
            subject_topic_words = topic_df['topic_words'][topic_df['mdl_index'] == j].tolist()[0].split(', ')
            union = set(target_topic_words) & set(subject_topic_words)
            if len(union) >= 3:
                union_dict[j] = i

    _df['new_topic_num'] = _df['max_topic_num']
    for old, new in union_dict.items():
        _df.loc[_df['max_topic_num'] == old, 'new_topic_num'] = new

    info_df = pd.DataFrame(data={'topic_num':range(lda_model.k)})

    info_df['new_topic_num'] = info_df['topic_num']
    for old, new in union_dict.items():
        info_df.loc[info_df['topic_num'] == old, 'new_topic_num'] = new

    count_df = _df['max_topic_num'].value_counts().reset_index()
    count_df.columns = ['topic_num', 'doc_cnt']
    info_df = info_df.merge(count_df, how='left').fillna(0)
    info_df.doc_cnt = info_df.doc_cnt.astype('int')

    _df = _df[_df['max_topic_value'] >= 0.5].reset_index(drop=True)

    temp_df = _df['max_topic_num'].value_counts()
    use_index = temp_df[temp_df >= 3].index.tolist()

    _df = _df[_df['max_topic_num'].isin(use_index)].reset_index(drop=True)
    _df = _df[_df['max_topic_num'].isin(topic_df['mdl_index'].tolist())].reset_index(drop=True)

    topic_df = topic_df[topic_df['mdl_index'].isin(_df['new_topic_num'].unique().tolist())]

    _df.reset_index(inplace=True)
    _df.rename(columns={'index':'docid'}, inplace=True)

    top_df = pd.DataFrame()
    new_topic_number = 0
    for topic_number in _df['new_topic_num'].value_counts().index:
        temp_df = _df[_df['new_topic_num'] == topic_number].reset_index(drop=True)

        corpus = temp_df['nouns'].tolist()
        docid = temp_df['docid'].tolist()

        # tf-idf 적용
        tfidfv = TfidfVectorizer().fit(corpus)
        vector = tfidfv.transform(corpus).toarray()

        # k값 설정, 학습
        km = KMeans(n_clusters = 1)
        km.fit(vector)

        # 클러스터 결과로 데이터프레임 재구축
        results = []
        clusters = km.labels_.tolist()  # 군집화 결과(라벨)
        for i, value in enumerate(clusters):
            
            result_dict = {}
            result_dict['mdl_index'] = topic_number
            result_dict['title'] = temp_df.loc[temp_df.docid == docid[i]]['title'].tolist()[0]
            result_dict['nouns'] = temp_df.loc[temp_df.docid == docid[i]]['nouns'].tolist()[0]
            result_dict['topic_value'] = temp_df.loc[temp_df.docid == docid[i]]['max_topic_value'].tolist()[0]
            vec1 = vector[i].reshape(1, -1)  # 뉴스 벡터
            vec2 = km.cluster_centers_[int(value)].reshape(1,-1)  # k중앙값 벡터
            result_dict['similarity'] = cosine_similarity(vec1, vec2)[0][0]  # 코사인 유사도 계산
            results.append(result_dict)
        
        result_df = pd.DataFrame(results).sort_values(['mdl_index','similarity'], ascending=[True, False]).reset_index(drop=True)
        top_df = pd.concat([top_df, result_df])

    top_df = top_df[top_df['similarity'] >= 0.2].reset_index(drop=True)

    topic_df['doc_count'] = 0
    for mdl_index, doc_count in top_df['mdl_index'].value_counts().items():
        topic_df.loc[topic_df['mdl_index'] == mdl_index, 'doc_count'] = doc_count

    return top_df, topic_df, info_df, word_df

# 4. 이슈 키워드 추출 
def extract_issue_kwd(top_df, topic_df, info_df):
    stopwords = [
    '통보', '계획', '송부', '보고', '회신', '결과', '제출', '참석', '공지', '안내', '요청', '접수', '확인', 
    '조치', '진행', '관리', '공고', '변경', '개정', '신청', '접수', '확대', '축소', '개선', '이행',
    '회의', '보고서', '문서', '서류', '자료', '파일', '작성', '배포', '확산', '사업', '전달',
    '월', '주년', '분기', '기간', '시행', '오전', '오후', '조사', '평가', '진단', '대책', '방안',
    '제도', '정책', '지원', '프로그램', '참여'
]
    main_keywords = []
    for mdl_index in top_df['mdl_index'].value_counts().index:
        main_keyword_dict = dict()

        # 토픽단어 중 보편적인 단어(stopwords) 제거
        try:
            re_ngram = ', '.join(topic_df['topic_words'][topic_df['mdl_index'] == mdl_index]).split(', ')
            re_ngram = [x for x in re_ngram if x not in stopwords]
        except:
            break
        # 제목에서 ()괄호 삭제
        morph_title = top_df[top_df['mdl_index'] == mdl_index].title.str.replace(r'\(\w.*\)', '', regex=True).tolist()

        keyword_list = []
        for title in morph_title:
            try:
                # 제목에서 명사만 남김(title)
                query = ' '.join(extract_nouns(title))

                # 토픽단어 top1 1개로 쿼리에서 6자 이상 단어 조회
                use_word = r'\w*{}\w*'.format(re_ngram[0])
                keywords = re.findall(use_word, query)
                if keywords != []:
                    keyword = keywords[0]
                    if len(keyword) > 5:
                        keyword_list.append(keyword)

                # 토픽단어 top1~2 2개로 쿼리에서 단어 조회
                use_word = r'\w*{}\w*|\w*{}\w*'.format(re_ngram[0], re_ngram[1])
                keywords = re.findall(use_word, query)
                if len(keywords) == 2:
                    keyword = ' '.join(keywords)
                    keyword_list.append(keyword)

                # 토픽단어 top2 1개로 쿼리에서 6자 이상 단어 조회
                use_word = r'\w*{}\w*'.format(re_ngram[1])
                keywords = re.findall(use_word, query)
                if keywords != []:
                    keyword = keywords[0]
                    if len(keyword) > 5:
                        keyword_list.append(keyword)
                        continue

                # 토픽단어 top1~3, 3개로 쿼리에서 단어 조회
                use_word = r'\w*{}\w*|\w*{}\w*|\w*{}\w*'.format(re_ngram[0], re_ngram[1], re_ngram[2])
                keywords = re.findall(use_word, query)
                if len(keywords) == 2:
                    keyword = ' '.join(keywords)
                    keyword_list.append(keyword)

                # 토픽단어 top1~3 2개로 제목에서 단어 조회
                use_word = r'\w*{}\w*|\w*{}\w*|\w*{}\w*'.format(re_ngram[0], re_ngram[1], re_ngram[2])
                keywords = re.findall(use_word, query)
                if len(keywords) == 2:
                    keyword = ' '.join(keywords)
                    keyword_list.append(keyword)
            except:
                continue
            
        try:
            main_kwd = max([kwd for kwd, cnt in Counter(keyword_list).items() if (cnt == max(Counter(keyword_list).values())) & (cnt >= 2)], key=len)
            main_keyword_dict['mdl_index'] = mdl_index
            main_keyword_dict['main_kwd'] = main_kwd
            main_keywords.append(main_keyword_dict)
        except:
            continue

    issue_df = pd.DataFrame(main_keywords)
    issue_df.columns = ['new_topic_num', 'iss_kwd']
    issue_df = issue_df.merge(info_df.groupby(['new_topic_num'])['doc_cnt'].sum().reset_index(), how='left')
    issue_df = issue_df.sort_values(['doc_cnt', 'iss_kwd'], ascending=[False, True]).reset_index(drop=True)
    issue_df['topic_id'] = f'topic_{gtr_ymd}_' + issue_df.new_topic_num.astype('string')
    issue_df['use_yn'] = 'Y'
    issue_df = issue_df[~issue_df.iss_kwd.duplicated()].reset_index(drop=True)
    issue_df['kwd_rank'] = issue_df.index + 1
    issue_df = issue_df[['topic_id', 'kwd_rank', 'iss_kwd', 'doc_cnt', 'use_yn']]
    issue_df.rename(columns={'iss_kwd':'topic_iss_kwd'}, inplace=True)

    return issue_df

# 5. 대표 뉴스 및 요약 
def summarize_clusters(top_df):
    cluster_summary = []

    for mdl_index in sorted(top_df['mdl_index'].unique()):
        cluster_info = {}

        # 해당 군집 데이터
        cluster_df = top_df[top_df['mdl_index'] == mdl_index]

        # 1. 대표 뉴스 (similarity 가장 높은 제목)
        best_news = cluster_df.sort_values(by='similarity', ascending=False).iloc[0]
        cluster_info['mdl_index'] = mdl_index
        cluster_info['대표뉴스제목'] = best_news['title']

        # 2. 군집 내 명사 모아서 핵심 키워드 5개 추출
        all_nouns = []
        for nouns in cluster_df['nouns']:
            all_nouns += nouns.split(', ')
        noun_counts = Counter(all_nouns)
        top_nouns = [noun for noun, count in noun_counts.most_common(5)]
        cluster_info['대표키워드'] = ', '.join(top_nouns)

        # 3. 군집 평균 similarity
        avg_similarity = cluster_df['similarity'].mean()
        cluster_info['군집평균유사도'] = round(avg_similarity, 4)

        # 4. 뉴스 개수 (doc 수)
        cluster_info['뉴스개수'] = len(cluster_df)

        cluster_summary.append(cluster_info)

    # 전체 군집 요약 데이터프레임
    issue_summary_df = pd.DataFrame(cluster_summary)

    # 🔥 여기서 Top10만 추출
    issue_summary_df = issue_summary_df.sort_values(
        by=['뉴스개수', '군집평균유사도'], 
        ascending=[False, False]
    ).head(10).reset_index(drop=True)

    return issue_summary_df

# 6. 저장 및 전송
def save_and_send(top_df):
    today_str = datetime.today().strftime('%Y%m%d')
    os.makedirs(SAVE_DIR, exist_ok=True)
    final_file_path = os.path.join(SAVE_DIR, f"auto_top10_통합_{today_str}.csv")
    top_df.to_csv(final_file_path, index=False, encoding='utf-8-sig')

    print(f"📁 CSV 저장 완료: {final_file_path}")

    # try:
    #     response = requests.post(
    #         "http://localhost:8087/api/news/save-csv",
    #         json={"filePath": final_file_path}
    #     )
    #     print(f"🚀 서버 응답: {response.text}")
    # except Exception as e:
    #     print(f"❌ 서버 요청 실패: {e}")

# 7. 메인 실행 
def main():
    df = collect_news()
    _df = df.copy()
    top_df, topic_df, info_df, word_df = extract_lda_topic(_df)
    issue_summary_df = summarize_clusters(top_df)

    today_str = datetime.today().strftime('%Y%m%d')
    issue_summary_df.to_csv(f'./date/cluster_summary_{today_str}.csv', index=False, encoding='utf-8-sig')
    print(f"✅ cluster_summary_{today_str}.csv 저장 완료")
    save_and_send(top_df)

if __name__ == "__main__":
    main()


# # 8. 스케줄러
# run_job()  # 처음 1번 실행

# schedule.every().day.at("09:00").do(run_job)

# print("스케줄러 작동 중입니다... (Ctrl+C로 종료)")

# while True:
#     schedule.run_pending()
#     time.sleep(1)   


📅 수집 기간: 20250129 ~ 20250429
총 수집 건수: 15000
✅ cluster_summary_20250429.csv 저장 완료
📁 CSV 저장 완료: ./date\auto_top10_통합_20250429.csv
🚀 서버 응답: ✅ CSV 데이터가 저장되었습니다!
