In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

# 웹 페이지 정보를 추출하는 함수 수정
def extract_news_info(df, media_outlet, tag_info):
    kmib_df = df[df['언론사'] == media_outlet]
    news_urls = []  # URL을 저장할 리스트
    news_times = []  # 해당 시간을 저장할 리스트

    for i, row in kmib_df.iterrows():
        url = row['URL']
        print(i + 1, ": ", url)
        try:
            headers = {"User-Agent": "Mozilla/5.0"}
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # 오류가 발생하면 예외를 발생시킴
            news = BeautifulSoup(response.content, 'html.parser', from_encoding='EUC-KR')
            news_text = news.find(tag_info['tag_name'], attrs=tag_info['tag_attrs'])
            if news_text is not None:
                news_urls.append(url)  # URL 추가
                news_times.append(news_text.text)  # 해당 시간 추가
            else:
                print("뉴스 텍스트를 가져올 수 없습니다.")
        except requests.exceptions.RequestException as e:
            print("웹 페이지를 가져오는 중 오류가 발생했습니다:", str(e))
            continue  # 다음 반복으로 넘어감

        time.sleep(3)

    news_data = pd.DataFrame({'URL': news_urls, '시간': news_times})  # 'URL'과 '시간' 열로 데이터프레임 생성
    return news_data


tag_info_dict = {
    "KBS": {"tag_name": "em", "tag_attrs": {"class": "date"}},
    "MBC": {"tag_name": "span", "tag_attrs": {"class": "input"}},
    "OBS": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "SBS": {"tag_name": "div", "tag_attrs": {"class": "date_area"}},
    "YTN": {"tag_name": "span", "tag_attrs": {"class": "time"}},
    "강원도민일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "강원일보": {"tag_name": "span", "tag_attrs": {"class": "date"}},
    "경기일보": {"tag_name": "div", "tag_attrs": {"class": "article_date"}},
    "경남도민일보": {"tag_name": "ul", "tag_attrs": {"class": "no-bullet auto-marbtm-0 line-height-6"}},
    "경상일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "경인일보": {"tag_name": "span", "tag_attrs": {"class": "news-date"}},
    "경향신문": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "광주일보": {"tag_name": "div", "tag_attrs": {"class": "read_time"}},
    "광주매일신문": {"tag_name": "ul", "tag_attrs": {"class": "byline"}},
    "국민일보": {"tag_name": "span", "tag_attrs": {"class": "t11"}},
    "국제신문": {"tag_name": "span", "tag_attrs": {"class": "f_news_date"}},
    "내일신문": {"tag_name": "div", "tag_attrs": {"class": "date"}},
    "대전일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "대구일보": {"tag_name": "span", "tag_attrs": {"class": "date"}},
    "동아일보": {"tag_name": "span", "tag_attrs": {"class": "date01"}},
    "디지털타임스": {"tag_name": "span", "tag_attrs": {"class": "url_txt"}},
    "매일경제": {"tag_name": "li", "tag_attrs": {"class": "lasttime"}},
    "매일신문": {"tag_name": "div", "tag_attrs": {"class": "date"}},
    "머니투데이": {"tag_name": "ul", "tag_attrs": {"class": "info2"}},
    "무등일보": {"tag_name": "span", "tag_attrs": {"class": "txt_info"}},
    "문화일보": {"tag_name": "dl", "tag_attrs": {"class": "date"}},
    "부산일보": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "서울경제": {"tag_name": "span", "tag_attrs": {"class": "url_txt"}},
    "세계일보": {"tag_name": "p", "tag_attrs": {"class": "viewInfo"}},
    "아시아경제": {"tag_name": "div", "tag_attrs": {"class": "date_box"}},
    "아주경제": {"tag_name": "dd", "tag_attrs": {"class": "date"}},
    "영남일보": {"tag_name": "li", "tag_attrs": {"class": "article-bottom-input"}},
    "울산매일": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "전남일보": {"tag_name": "ul", "tag_attrs": {"class": "byline"}},
    "전북도민일보": {"tag_name": "i", "tag_attrs": {"class": "fa fa-clock-o fa-fw"}},
    "전북일보": {"tag_name": "i", "tag_attrs": {"class": "far fa-clock pr5"}},
    "전자신문": {"tag_name": "time", "tag_attrs": {"class": "date"}},
    "조선일보": {"tag_name": "span", "tag_attrs": {"class": "upDate | flex flex--align-items-end "}},
    "중도일보": {"tag_name": "ul", "tag_attrs": {"class": "view-term"}},
    "중부매일": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "중부일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "중앙일보": {"tag_name": "time", "tag_attrs": {"itemprop": "datePublished"}},
    "충청일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "충청투데이": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "파이낸셜뉴스": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "한겨레": {"tag_name": "p", "tag_attrs": {"class": "date-time"}},
    "한국경제": {"tag_name": "span", "tag_attrs": {"class": "txt-date"}},
    "한국일보": {"tag_name": "dl", "tag_attrs": {"class": "wrt-text"}},
    "한라일보": {"tag_name": "ul", "tag_attrs": {"class": "byline"}},
    "헤럴드경제": {"tag_name": "li", "tag_attrs": {"class": "article_date"}},
}
import math

raw_df = pd.read_excel('./테스트데이터/(테스트)유가뉴스데이터.xlsx')
batch_size = 100
total_batches = math.ceil(len(raw_df) / batch_size)
news_data_list = []

for batch_num in range(total_batches):
    start_index = batch_num * batch_size
    end_index = start_index + batch_size
    batch_df = raw_df.iloc[start_index:end_index]
    
    for media_outlet, tag_info in tag_info_dict.items():
        news_data = extract_news_info(batch_df, media_outlet, tag_info)
        news_data_list.append(news_data)

combined_news_data = pd.concat(news_data_list, ignore_index=True)

93 :  https://news.kbs.co.kr/news/view.do?ncd=7696403&amp;ref=DA
9 :  https://imnews.imbc.com/replay/2023/nwtoday/article/6497837_36207.html
79 :  https://imnews.imbc.com/replay/2023/nw1200/article/6492999_36170.html
18 :  https://www.ytn.co.kr/_ln/0102_202306242224434285
20 :  https://www.ytn.co.kr/_ln/0102_202306241115547118
30 :  https://www.ytn.co.kr/_ln/0102_202306210944400161
51 :  https://www.ytn.co.kr/_ln/0102_202306191400360747
52 :  https://www.kwnews.co.kr/page/view/2023061811351690568
83 :  https://www.kyeonggi.com/article/20230612580020
84 :  https://www.kyeonggi.com/article/20230612580020
33 :  https://www.khan.co.kr/world/world-general/article/202306201645001


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


41 :  https://www.khan.co.kr/economy/economy-general/article/202306192049005


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


76 :  https://www.khan.co.kr/world/america/article/202306132254005


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


10 :  http://www.kookje.co.kr/news2011/asp/newsbody.asp?code=1700&key=20230627.22022007768
23 :  http://www.naeil.com/news_view/?id_art=464751
67 :  http://www.naeil.com/news_view/?id_art=463839
25 :  https://www.donga.com/news/Economy/article/all/20230621/119881218/1
38 :  https://www.donga.com/news/Opinion/article/all/20230619/119842414/1
22 :  http://www.dt.co.kr/contents.html?article_no=2023062202109932049003&ref=jeadan
3 :  http://www.mk.co.kr/article/10772439
뉴스 텍스트를 가져올 수 없습니다.
19 :  http://www.mk.co.kr/article/10768178
뉴스 텍스트를 가져올 수 없습니다.
27 :  http://www.mk.co.kr/article/10765832
뉴스 텍스트를 가져올 수 없습니다.
29 :  http://www.mk.co.kr/article/10765269
뉴스 텍스트를 가져올 수 없습니다.
44 :  http://www.mk.co.kr/article/10763823
뉴스 텍스트를 가져올 수 없습니다.
53 :  http://www.mk.co.kr/article/10762906
뉴스 텍스트를 가져올 수 없습니다.
54 :  http://www.mk.co.kr/article/10761588
뉴스 텍스트를 가져올 수 없습니다.
55 :  http://www.mk.co.kr/article/10761389
뉴스 텍스트를 가져올 수 없습니다.
58 :  http://www.mk.co.kr/article/10761329
뉴스 텍스트를 가져올 수 없습니다.
61 :  

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


28 :  http://www.munhwa.com/news/view.html?no=2023062101031805086002
69 :  http://www.munhwa.com/news/view.html?no=2023061401071405054001
80 :  http://www.munhwa.com/news/view.html?no=2023061301070805015001
98 :  http://www.munhwa.com/news/view.html?no=2023060901070805054001
82 :  https://www.busan.com/view/busan/view.php?code=2023061307541908123
86 :  https://www.busan.com/view/busan/view.php?code=2023061209181138093
100 :  https://www.busan.com/view/busan/view.php?code=2023060909083319026
14 :  http://www.sedaily.com/NewsView/29QZRRA19J
71 :  http://www.sedaily.com/NewsView/29QUO1T5AX
26 :  http://www.segye.com/content/html/2023/06/21/20230621520267.html
34 :  http://www.segye.com/content/html/2023/06/19/20230619520755.html
35 :  http://www.segye.com/content/html/2023/06/19/20230619522299.html
43 :  http://www.segye.com/content/html/2023/06/19/20230619518554.html
70 :  http://www.segye.com/content/html/2023/06/14/20230614508119.html
87 :  http://www.segye.com/content/html/2023/06/12/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


122 :  https://www.khan.co.kr/economy/finance/article/202306050830011


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


142 :  https://www.khan.co.kr/economy/economy-general/article/202306010739001


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


135 :  http://www.naeil.com/news_view/?id_art=462782
116 :  http://www.mk.co.kr/article/10753216
뉴스 텍스트를 가져올 수 없습니다.
119 :  http://www.mk.co.kr/article/10752680
뉴스 텍스트를 가져올 수 없습니다.
104 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023060713372547246&type=2
125 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023060507071368538&type=2
130 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023060411004731890&type=2
134 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023060215403079953&type=2
132 :  https://www.busan.com/view/busan/view.php?code=2023060308370603061
133 :  https://www.busan.com/view/busan/view.php?code=2023060308231902989
117 :  http://www.sedaily.com/NewsView/29QQKP4XEC
101 :  http://www.segye.com/content/html/2023/06/09/20230609504979.html
109 :  http://www.segye.com/content/html/2023/06/05/20230605512922.html
129 :  https://view.asiae.co.kr/article/2023060412315279438
137 :  https://www.ajunews.com/view/20230602084550808
143 :  https://www.ajunews.com/view

In [2]:
combined_news_data

Unnamed: 0,URL,시간
0,https://news.kbs.co.kr/news/view.do?ncd=769640...,입력 2023.06.11 (12:03)
1,https://imnews.imbc.com/replay/2023/nwtoday/ar...,입력 \r\n\t\t\t\t\t\t\t\t\t\t2023-06-28 07:40\r\...
2,https://imnews.imbc.com/replay/2023/nw1200/art...,입력 \r\n\t\t\t\t\t\t\t\t\t\t2023-06-13 12:19\r\...
3,https://www.ytn.co.kr/_ln/0102_202306242224434285,2023년 06월 24일 22시 24분
4,https://www.ytn.co.kr/_ln/0102_202306241115547118,2023년 06월 24일 11시 15분
...,...,...
119,https://www.joongang.co.kr/article/25167611,
120,http://www.fnnews.com/news/202306050810081859,\n파이낸셜뉴스입력 2023.06.05 08:27수정 2023.06.05 08:27\n
121,http://www.fnnews.com/news/202306011807254776,\n파이낸셜뉴스입력 2023.06.01 18:07수정 2023.06.01 18:07\n
122,http://www.fnnews.com/news/202306011520340551,\n파이낸셜뉴스입력 2023.06.01 16:34수정 2023.06.01 16:34\n


In [3]:
combined_news_data['시간'] = combined_news_data['시간'].str.replace('\n','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('\t','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('(','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace(')','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('\r','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('입력','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('기자명','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('지면','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('승인','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('댓글 0','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('수정','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('VIEW','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('등록','')
combined_news_data['시간'] = combined_news_data['시간'].str.replace('파이낸셜뉴스','')

In [4]:
combined_news_data.to_excel('./(테스트)유가시간크롤링.xlsx')