본 코드는 토픽모델링 및 감성분석 기법을 활용한 언론 기사의 핵심 이슈 및 논조 분석 프로젝트에 필요한 데이터를 크롤링한 코드이다. 해당 프로젝트에 필요한 데이터는 뉴스 기사 텍스트 데이터로, Selenium을 이용하여 다음 뉴스에서 2010년부터 2024년까지의 뉴스 데이터 중 "정치적 올바름" 키워드를 포함한 뉴스 기사만을 추출하여 '순번', '제목', '본문', '신문사', '연도'를 파싱하고 데이터프레임 형태로 변환하여 csv 파일로 저장하였다. 2010년부터 2016년까지의 기사 수는 다른 연도의 1년치와 비슷하여 하나의 데이터프레임으로 병합하였다. 일반적으로 많이 사용하는 포털인 네이버 뉴스가 아닌 다음 뉴스 URL로 진행한 이유는, 네이버는 언론사 코드와 기사 번호를 포함한 복잡한 경로와 파라미터를 사용하는 반면 다음은 날짜 기반의 간결한 구조를 채택하고 있기 때문이다. 또한 본문까지 추출해야 하는 이번 크롤링의 경우, 신문사별로 링크가 구분되지 않고 통일된 다음 뉴스를 사용하는 것이 필요한 html 태그를 파싱하는 데도 간편했다. 하지만 네이버와 달리 다음 뉴스는 페이지네이션 방식이 무한스크롤이 아니어서 1페이지씩 넘겨야 한다는 특

In [None]:
pip install selenium

In [None]:
pip install pandas

In [None]:
pip install chromedriver-autoinstaller

크롤링 2010.01.01 ~ 2016.12.31

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages 1..max_pages of the
    Daum news search and extracts the numeric ID from every
    ``v.daum.net/v/<id>`` link found on each page.  Every page is visited;
    a page that yields only already-seen IDs does not end the crawl.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for pg in range(1, max_pages + 1):
            driver.get(f"{base}&p={pg}")
            time.sleep(1.0)  # longer wait than the other crawl segments

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1

            print(f"▶ p={pg}: 신규 {new}건, 누적 {len(ids)}건")
            # Deliberately no early stop here: keep paging even when a page
            # contributes nothing new.
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20100101000000",
    end_date="20161231235959",
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news and return the articles as a table.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2010-2016 window (arguments repeat the function defaults so
    # the run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20100101000000",
        end_date="20161231235959",
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # "utf-8-sig" writes a BOM at the start of the file.
    df.to_csv("daum_2010_2016_pc_full.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2010_2016_pc_full.csv")


크롤링 2017

In [None]:
def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages of the Daum news
    search and extracts the numeric ID from every ``v.daum.net/v/<id>`` link.
    Paging stops at ``max_pages`` or as soon as a page after the fifth
    contributes no new ID.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for pg in range(1, max_pages + 1):
            driver.get(f"{base}&p={pg}")
            time.sleep(0.5)  # give the result page time to render

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1
            print(f"▶ p={pg}: 신규 {new}건, 누적 {len(ids)}건")
            # Past page 5, a page without new IDs ends the crawl.
            if new == 0 and pg > 5:
                break
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20170101000000",
    end_date="20171231235959",
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news (default: 2017) into a DataFrame.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2017 window (arguments repeat the function defaults so the
    # run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20170101000000",
        end_date="20171231235959",
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # "utf-8-sig" writes a BOM at the start of the file.
    df.to_csv("daum_2017_pc_fast.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2017_pc_fast.csv")


크롤링 2018

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages of the Daum news
    search and extracts the numeric ID from every ``v.daum.net/v/<id>`` link.
    Paging stops at ``max_pages`` or as soon as a page after the fifth
    contributes no new ID.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for pg in range(1, max_pages + 1):
            driver.get(f"{base}&p={pg}")
            time.sleep(0.5)  # give the result page time to render

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1
            print(f"▶ p={pg}: 신규 {new}건, 누적 {len(ids)}건")
            # Past page 5, a page without new IDs ends the crawl.
            if new == 0 and pg > 5:
                break
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20180101000000",
    end_date="20181231235959",
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news (default: 2018) into a DataFrame.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2018 window (arguments repeat the function defaults so the
    # run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20180101000000",
        end_date="20181231235959",
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # "utf-8-sig" writes a BOM at the start of the file.
    df.to_csv("daum_2018_pc_fast.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2018_pc_fast.csv")


크롤링 2019

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages of the Daum news
    search and extracts the numeric ID from every ``v.daum.net/v/<id>`` link.
    Paging stops at ``max_pages`` or as soon as a page after the fifth
    contributes no new ID.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for p in range(1, max_pages + 1):
            driver.get(f"{base}&p={p}")
            time.sleep(0.5)  # give the result page time to render

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1
            print(f"▶ p={p}: 신규 {new}건, 누적 {len(ids)}건")
            # Past page 5, a page without new IDs ends the crawl.
            if new == 0 and p > 5:
                break
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20190101000000",
    end_date="20191231235959",
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news (default: 2019) into a DataFrame.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2019 window (arguments repeat the function defaults so the
    # run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20190101000000",
        end_date="20191231235959",
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # "utf-8-sig" writes a BOM at the start of the file.
    df.to_csv("daum_2019_pc_fast.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2019_pc_fast.csv")


크롤링 2020

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages of the Daum news
    search (with ``cluster=y&cluster_page=1`` in the query) and extracts the
    numeric ID from every ``v.daum.net/v/<id>`` link.  Paging stops at
    ``max_pages`` or as soon as a page after the fifth contributes no new ID.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        "&cluster=y&cluster_page=1"
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for p in range(1, max_pages + 1):
            driver.get(f"{base}&p={p}")
            time.sleep(0.5)  # give the result page time to render

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1
            print(f"▶ p={p}: 신규 {new}건, 누적 {len(ids)}건")
            # Past page 5, a page without new IDs ends the crawl.
            if new == 0 and p > 5:
                break
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20200101000000",
    end_date="20201231235959",
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news (default: 2020) into a DataFrame.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2020 window (arguments repeat the function defaults so the
    # run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20200101000000",
        end_date="20201231235959",
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # "utf-8-sig" writes a BOM at the start of the file.
    df.to_csv("daum_2020_pc_fast.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2020_pc_fast.csv")


크롤링 2021

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages of the Daum news
    search (with ``cluster=y&cluster_page=1`` in the query) and extracts the
    numeric ID from every ``v.daum.net/v/<id>`` link.  Paging stops at
    ``max_pages`` or as soon as a page after the fifth contributes no new ID.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        "&cluster=y&cluster_page=1"  # the part of the URL that was modified
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for p in range(1, max_pages + 1):
            driver.get(f"{base}&p={p}")
            time.sleep(0.5)  # give the result page time to render

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1
            print(f"▶ p={p}: 신규 {new}건, 누적 {len(ids)}건")
            # Past page 5, a page without new IDs ends the crawl.
            if new == 0 and p > 5:
                break
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    # (connect timeout, read timeout) in seconds.
    r = requests.get(url, headers=headers, timeout=(5,10))
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20210101000000",   # default window switched to 2021
    end_date="20211231235959",
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news (default: 2021) into a DataFrame.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2021 window (arguments repeat the function defaults so the
    # run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20210101000000",   # first second of 2021
        end_date="20211231235959",     # last second of 2021
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # Output file name carries the year; "utf-8-sig" writes a BOM at the
    # start of the file.
    df.to_csv("daum_2021_pc_fast.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2021_pc_fast.csv")


크롤링 2022

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages of the Daum news
    search (with ``cluster=y&cluster_page=1`` in the query) and extracts the
    numeric ID from every ``v.daum.net/v/<id>`` link.  Paging stops at
    ``max_pages`` or as soon as a page after the fifth contributes no new ID.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        "&cluster=y&cluster_page=1"  # cluster option added
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for p in range(1, max_pages + 1):
            driver.get(f"{base}&p={p}")
            time.sleep(0.5)  # give the result page time to render

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1
            print(f"▶ p={p}: 신규 {new}건, 누적 {len(ids)}건")
            # Past page 5, a page without new IDs ends the crawl.
            if new == 0 and p > 5:
                break
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20220101000000",   # first second of 2022
    end_date="20221231235959",     # last second of 2022
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news (default: 2022) into a DataFrame.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2022 window (arguments repeat the function defaults so the
    # run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20220101000000",   # first second of 2022
        end_date="20221231235959",     # last second of 2022
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # Output file name carries the year; "utf-8-sig" writes a BOM at the
    # start of the file.
    df.to_csv("daum_2022_pc_fast.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2022_pc_fast.csv")


크롤링 2023

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

def collect_ids(keyword, start_date, end_date, max_pages=69):
    """Collect unique Daum news article IDs from keyword search result pages.

    Opens a headless Chrome session, visits result pages of the Daum news
    search (with ``cluster=y&cluster_page=1`` in the query) and extracts the
    numeric ID from every ``v.daum.net/v/<id>`` link.  Paging stops at
    ``max_pages`` or as soon as a page after the fifth contributes no new ID.

    Args:
        keyword: Search phrase; it is URL-escaped before being put in the query.
        start_date: Period start, sent verbatim as the ``sd`` query parameter
            (callers in this file pass ``YYYYMMDDhhmmss`` strings).
        end_date: Period end, sent verbatim as the ``ed`` query parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of article ID strings, de-duplicated, in first-seen order.
    """
    from urllib.parse import quote_plus

    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&" or "#" that would otherwise corrupt the query.
    kw = quote_plus(keyword)
    base = (
        "https://search.daum.net/search"
        "?w=news&nil_search=btn&DA=STC&enc=utf8"
        "&cluster=y&cluster_page=1"  # cluster option included
        f"&q={kw}"
        f"&sd={start_date}&ed={end_date}"
        "&period=u&sort=recency"
    )
    id_pattern = re.compile(r'/v/(\d+)')

    seen, ids = set(), []
    try:
        for p in range(1, max_pages + 1):
            driver.get(f"{base}&p={p}")
            time.sleep(0.5)  # give the result page time to render

            new = 0
            for e in driver.find_elements(By.CSS_SELECTOR, 'a[href*="//v.daum.net/v/"]'):
                # get_attribute may return None; treat that as "no match".
                m = id_pattern.search(e.get_attribute('href') or '')
                if m and m.group(1) not in seen:
                    seen.add(m.group(1))
                    ids.append(m.group(1))
                    new += 1
            print(f"▶ p={p}: 신규 {new}건, 누적 {len(ids)}건")
            # Past page 5, a page without new IDs ends the crawl.
            if new == 0 and p > 5:
                break
    finally:
        # Shut the browser down even if a page load or lookup raised, so no
        # headless Chrome process is left behind.
        driver.quit()
    return ids

def parse_article(aid):
    """Fetch one Daum news article and extract title, body text, press and year.

    Args:
        aid: Article ID (the digits after ``/v/`` in a v.daum.net URL).

    Returns:
        Tuple ``(title, content, press, year)``.  ``content`` is the text of
        the body's ``p[dmcf-ptype="general"]`` paragraphs joined by newlines,
        with script/iframe/ins/a/figure elements removed first; ``year`` is
        an int.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the title/press meta tag, the article body, or every
            date source is missing from the page.
    """
    url = f"https://v.daum.net/v/{aid}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    def meta_content(prop):
        # Stripped ``content`` of <meta property=prop>; fails loudly instead
        # of with an opaque "'NoneType' object is not subscriptable".
        tag = soup.select_one(f'meta[property="{prop}"]')
        if tag is None or tag.get('content') is None:
            raise ValueError(f'meta tag "{prop}" not found on {url}')
        return tag['content'].strip()

    title = meta_content('og:title')
    press = meta_content('og:article:author')

    body = soup.select_one('div.article_view')
    if body is None:
        raise ValueError(f"article body (div.article_view) not found on {url}")
    for t in body.select('script, iframe, ins, a, figure'):
        t.decompose()
    content = "\n".join(p.get_text(strip=True) for p in body.select('p[dmcf-ptype="general"]'))

    # Prefer the og:regDate meta tag; fall back to the visible date span.
    meta = soup.select_one('meta[property="og:regDate"]')
    if meta and meta.get('content'):
        year = int(meta['content'][:4])
    else:
        span = soup.select_one('span.num_date')
        if span is None:
            raise ValueError(f"no publication date found on {url}")
        year = datetime.strptime(span.get_text(strip=True), '%Y. %m. %d. %H:%M').year

    return title, content, press, year

def crawl_daum_segment_fast(
    keyword="정치적 올바름",
    start_date="20230101000000",   # first second of 2023
    end_date="20231231235959",     # last second of 2023
    max_pages=69,
    target_count=1000
):
    """Crawl one date window of Daum news (default: 2023) into a DataFrame.

    IDs come from ``collect_ids`` and are capped at ``target_count``; each ID
    is then fetched with ``parse_article``.  An article whose parsing raises
    is reported on stdout and left out, so its '순번' number is absent from
    the result.

    Returns:
        pandas.DataFrame with columns '순번', '제목', '본문', '신문사', '연도'.
    """
    columns = ['순번', '제목', '본문', '신문사', '연도']

    # Phase 1: gather article IDs, keeping at most target_count of them.
    ids = collect_ids(keyword, start_date, end_date, max_pages)[:target_count]
    print(f"\n🔖 파싱 대상 ID: {len(ids)}건\n")

    # Phase 2: parse each article, pausing briefly between requests.
    rows = []
    for seq, aid in enumerate(ids, start=1):
        try:
            title, content, press, year = parse_article(aid)
        except Exception as err:
            print(f"⚠️ [{aid}] 파싱 실패:", err)
        else:
            rows.append(dict(zip(columns, (seq, title, content, press, year))))
        time.sleep(0.2)

    return pd.DataFrame(rows, columns=columns)

if __name__ == "__main__":
    # Crawl the 2023 window (arguments repeat the function defaults so the
    # run is self-describing), show the table and save it as CSV.
    df = crawl_daum_segment_fast(
        keyword="정치적 올바름",
        start_date="20230101000000",   # first second of 2023
        end_date="20231231235959",     # last second of 2023
        max_pages=69,
        target_count=1000,
    )
    print(df)
    # Output file name carries the year; "utf-8-sig" writes a BOM at the
    # start of the file.
    df.to_csv("daum_2023_pc_fast.csv", encoding="utf-8-sig", index=False)
    print(f"✅ 총 {len(df)}건 크롤링 완료 — daum_2023_pc_fast.csv")
