In [1]:
import math
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# 웹 페이지 정보를 추출하는 함수 수정
def extract_news_info(df, media_outlet, tag_info, timeout=10, delay=3, from_encoding='EUC-KR'):
    """Scrape the text of one HTML element from every article of a single outlet.

    Rows of ``df`` whose '언론사' column equals ``media_outlet`` are visited in
    order. Each row's 'URL' is downloaded and parsed, and the first element
    matching ``tag_info`` is looked up. Pages that cannot be fetched, or that
    do not contain the element, are reported on stdout and left out of the
    result.

    Args:
        df: DataFrame with at least the columns '언론사' and 'URL'.
        media_outlet: Value of the '언론사' column selecting the rows to scrape.
        tag_info: Mapping with the keys 'tag_name' and 'tag_attrs'; both are
            handed to ``BeautifulSoup.find``.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the crawl indefinitely.
        delay: Seconds to sleep after every visited URL, successful or not.
        from_encoding: Encoding hint passed to ``BeautifulSoup``. Pass ``None``
            to let BeautifulSoup detect the encoding itself.

    Returns:
        DataFrame with the columns 'URL' and '시간': one row per page on which
        the element was found, '시간' holding the element's raw text.
    """
    outlet_rows = df[df['언론사'] == media_outlet]
    headers = {"User-Agent": "Mozilla/5.0"}  # loop-invariant, built once
    news_urls = []   # URLs whose element was found
    news_times = []  # raw element text, aligned with news_urls

    for i, row in outlet_rows.iterrows():
        url = row['URL']
        # NOTE(review): `i + 1` assumes integer index labels (e.g. the default
        # RangeIndex of the spreadsheet) -- confirm if the index ever changes.
        print(i + 1, ": ", url)
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
            news = BeautifulSoup(response.content, 'html.parser', from_encoding=from_encoding)
            news_text = news.find(tag_info['tag_name'], attrs=tag_info['tag_attrs'])
            if news_text is not None:
                news_urls.append(url)
                news_times.append(news_text.text)
            else:
                print("뉴스 텍스트를 가져올 수 없습니다.")
        except requests.exceptions.RequestException as e:
            # Best effort: report the failure and carry on with the next URL.
            print("웹 페이지를 가져오는 중 오류가 발생했습니다:", str(e))

        # Throttle after failures too, so an erroring host is not hit again
        # immediately by the next request.
        time.sleep(delay)

    return pd.DataFrame({'URL': news_urls, '시간': news_times})


tag_info_dict = {
    "KBS": {"tag_name": "em", "tag_attrs": {"class": "date"}},
    "MBC": {"tag_name": "span", "tag_attrs": {"class": "input"}},
    "OBS": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "SBS": {"tag_name": "div", "tag_attrs": {"class": "date_area"}},
    "YTN": {"tag_name": "span", "tag_attrs": {"class": "time"}},
    "강원도민일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "강원일보": {"tag_name": "span", "tag_attrs": {"class": "date"}},
    "경기일보": {"tag_name": "div", "tag_attrs": {"class": "article_date"}},
    "경남도민일보": {"tag_name": "ul", "tag_attrs": {"class": "no-bullet auto-marbtm-0 line-height-6"}},
    "경상일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "경인일보": {"tag_name": "span", "tag_attrs": {"class": "news-date"}},
    "경향신문": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "광주일보": {"tag_name": "div", "tag_attrs": {"class": "read_time"}},
    "광주매일신문": {"tag_name": "ul", "tag_attrs": {"class": "byline"}},
    "국민일보": {"tag_name": "span", "tag_attrs": {"class": "t11"}},
    "국제신문": {"tag_name": "span", "tag_attrs": {"class": "f_news_date"}},
    "내일신문": {"tag_name": "div", "tag_attrs": {"class": "date"}},
    "대전일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "대구일보": {"tag_name": "span", "tag_attrs": {"class": "date"}},
    "동아일보": {"tag_name": "span", "tag_attrs": {"class": "date01"}},
    "디지털타임스": {"tag_name": "span", "tag_attrs": {"class": "url_txt"}},
    "매일경제": {"tag_name": "li", "tag_attrs": {"class": "lasttime"}},
    "매일신문": {"tag_name": "div", "tag_attrs": {"class": "date"}},
    "머니투데이": {"tag_name": "ul", "tag_attrs": {"class": "info2"}},
    "무등일보": {"tag_name": "span", "tag_attrs": {"class": "txt_info"}},
    "문화일보": {"tag_name": "dl", "tag_attrs": {"class": "date"}},
    "부산일보": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "서울경제": {"tag_name": "span", "tag_attrs": {"class": "url_txt"}},
    "세계일보": {"tag_name": "p", "tag_attrs": {"class": "viewInfo"}},
    "아시아경제": {"tag_name": "div", "tag_attrs": {"class": "date_box"}},
    "아주경제": {"tag_name": "dd", "tag_attrs": {"class": "date"}},
    "영남일보": {"tag_name": "li", "tag_attrs": {"class": "article-bottom-input"}},
    "울산매일": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "전남일보": {"tag_name": "ul", "tag_attrs": {"class": "byline"}},
    "전북도민일보": {"tag_name": "i", "tag_attrs": {"class": "fa fa-clock-o fa-fw"}},
    "전북일보": {"tag_name": "i", "tag_attrs": {"class": "far fa-clock pr5"}},
    "전자신문": {"tag_name": "time", "tag_attrs": {"class": "date"}},
    "조선일보": {"tag_name": "span", "tag_attrs": {"class": "upDate | flex flex--align-items-end "}},
    "중도일보": {"tag_name": "ul", "tag_attrs": {"class": "view-term"}},
    "중부매일": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "중부일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "중앙일보": {"tag_name": "time", "tag_attrs": {"itemprop": "datePublished"}},
    "충청일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "충청투데이": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "파이낸셜뉴스": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "한겨레": {"tag_name": "p", "tag_attrs": {"class": "date-time"}},
    "한국경제": {"tag_name": "span", "tag_attrs": {"class": "txt-date"}},
    "한국일보": {"tag_name": "dl", "tag_attrs": {"class": "wrt-text"}},
    "한라일보": {"tag_name": "ul", "tag_attrs": {"class": "byline"}},
    "헤럴드경제": {"tag_name": "li", "tag_attrs": {"class": "article_date"}},
}
# Load the article list and scrape it in slices of `batch_size` rows.
# Within each slice every outlet configured in `tag_info_dict` is processed in
# turn; rows whose outlet has no entry there are never visited.
raw_df = pd.read_excel('./대한항공-뉴스데이터_2023.04_05_-시간채우기전(희동).xlsx')
batch_size = 100
news_data_list = []

# range(start, stop, step) yields the same slice starts as ceil(len / batch_size)
# batches would, and yields nothing at all for an empty sheet.
for start_index in range(0, len(raw_df), batch_size):
    batch_df = raw_df.iloc[start_index:start_index + batch_size]

    for media_outlet, tag_info in tag_info_dict.items():
        news_data = extract_news_info(batch_df, media_outlet, tag_info)
        # Most outlet/slice combinations produce no rows; keep only real results
        # so the final concat is not fed a long run of empty frames.
        if not news_data.empty:
            news_data_list.append(news_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the same columns when nothing was scraped.
if news_data_list:
    combined_news_data = pd.concat(news_data_list, ignore_index=True)
else:
    combined_news_data = pd.DataFrame(columns=['URL', '시간'])

11 :  https://news.kbs.co.kr/news/view.do?ncd=7640897&amp;ref=DA
13 :  https://news.kbs.co.kr/news/view.do?ncd=7641883&amp;ref=DA
35 :  https://news.kbs.co.kr/news/view.do?ncd=7642054&amp;ref=DA
86 :  https://news.kbs.co.kr/news/view.do?ncd=7642222&amp;ref=DA
61 :  http://www.obsnews.co.kr/news/articleView.html?idxno=1392651
88 :  https://news.sbs.co.kr/news/endPage.do?news_id=N1007140303&plink=ORI&cooper=ETC
2 :  https://www.ytn.co.kr/_ln/0107_202304012111237240
10 :  https://www.ytn.co.kr/_ln/0103_202304020351255839
12 :  https://www.ytn.co.kr/_ln/0103_202304020211082502
34 :  https://www.ytn.co.kr/_ln/0107_202304032213268100
58 :  https://www.ytn.co.kr/_ln/0103_202304031325221462
64 :  https://www.ytn.co.kr/_ln/0102_202304030953291548
72 :  https://www.ytn.co.kr/_ln/0107_202304042001376858
84 :  https://www.ytn.co.kr/_ln/0107_202304040456195495
92 :  https://www.ytn.co.kr/_ln/0115_202304051037411309
98 :  https://www.ytn.co.kr/_ln/0115_202304051144366160
99 :  https://www.ytn.co.kr/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


922 :  http://www.munhwa.com/news/view.html?no=2023051901071407084001
972 :  http://www.munhwa.com/news/view.html?no=2023051901039910226006
977 :  https://www.busan.com/view/busan/view.php?code=2023051910344819096
911 :  http://www.sedaily.com/NewsView/29PN2CIUS8
946 :  http://www.sedaily.com/NewsView/29PN2F3LKL
958 :  http://www.sedaily.com/NewsView/29PN1KA41R
1000 :  http://www.sedaily.com/NewsView/29PNFUVD5Z
910 :  http://www.segye.com/content/html/2023/05/19/20230519511994.html
917 :  http://www.segye.com/content/html/2023/05/19/20230519511828.html
925 :  http://www.segye.com/content/html/2023/05/19/20230519509489.html
930 :  http://www.segye.com/content/html/2023/05/19/20230519520442.html
937 :  http://www.segye.com/content/html/2023/05/19/20230519520207.html
938 :  http://www.segye.com/content/html/2023/05/19/20230519520205.html
950 :  http://www.segye.com/content/html/2023/05/19/20230519507543.html
964 :  http://www.segye.com/content/html/2023/05/18/20230518520848.html
971 :  ht

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1144 :  http://www.mk.co.kr/article/10745875
1181 :  http://www.mk.co.kr/article/10747115
뉴스 텍스트를 가져올 수 없습니다.
1188 :  http://www.mk.co.kr/article/10746883
뉴스 텍스트를 가져올 수 없습니다.
1191 :  http://www.mk.co.kr/article/10746959
뉴스 텍스트를 가져올 수 없습니다.
1195 :  http://www.mk.co.kr/article/10746956
뉴스 텍스트를 가져올 수 없습니다.
1129 :  https://news.imaeil.com/page/view/2023052509130271656
1154 :  https://news.imaeil.com/page/view/2023052610155021189
1163 :  https://news.imaeil.com/page/view/2023052818081864378
1108 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023052515241493435&type=2
1145 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023052612097054843&type=2
1149 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023052600123193695&type=2
1159 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023052611341149958&type=2
1184 :  http://news.moneytoday.co.kr/view/mtview.php?no=2023052615304278653&type=2
1101 :  http://www.munhwa.com/news/view.html?no=2023052401071405015001
1134 :  http://www.munh

In [4]:
# Bare expression as the cell's last line so the notebook renders the scraped frame.
combined_news_data

Unnamed: 0,URL,시간
0,https://news.kbs.co.kr/news/view.do?ncd=764089...,입력 2023.04.02 (12:02)
1,https://news.kbs.co.kr/news/view.do?ncd=764188...,입력 2023.04.03 (20:01)
2,https://news.kbs.co.kr/news/view.do?ncd=764205...,입력 2023.04.03 (22:02)
3,https://news.kbs.co.kr/news/view.do?ncd=764222...,입력 2023.04.04 (06:57)
4,http://www.obsnews.co.kr/news/articleView.html...,\n\n기자명 \n\t\t\t\t\t유성훈\t\t\t\t\n 입력 2023.04.0...
...,...,...
1098,https://www.ajunews.com/view/20230531090327092,입력 2023-05-31 09:07
1099,https://www.ajunews.com/view/20230530145055573,입력 2023-05-31 05:55
1100,http://www.fnnews.com/news/202305310921195203,\n파이낸셜뉴스입력 2023.05.31 09:31수정 2023.05.31 09:31\n
1101,http://biz.heraldcorp.com/view.php?ud=20230531...,2023.05.31 11:20\t\t\t\t\t\t\n\n\n\n\n\n


In [5]:
# Strip layout characters and label words from the scraped '시간' text.
# Tokens are removed one after another, in this order, as literal substrings.
noise_tokens = [
    '\n', '\t', '(', ')', '\r',
    '입력', '기자명', '지면', '승인', '댓글 0', '수정', 'VIEW', '등록', '파이낸셜뉴스',
]
for token in noise_tokens:
    # regex=False is explicit because the default differs between pandas
    # versions and '(' / ')' are regex metacharacters.
    combined_news_data['시간'] = combined_news_data['시간'].str.replace(token, '', regex=False)

In [8]:
# Persist the cleaned frame as CSV. 'utf-8-sig' prepends a BOM to the file;
# the frame's index is written as well (to_csv default).
combined_news_data.to_csv(
    path_or_buf='./김희동화이팅.csv',
    encoding='utf-8-sig',
)