In [19]:
pip install --upgrade pandas sqlalchemy

Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting sqlalchemy
  Downloading SQLAlchemy-2.0.36-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.7 kB)
Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
Downloading SQLAlchemy-2.0.36-cp312-cp312-macosx_11_0_arm64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sqlalchemy, pandas
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 1.4.53
    Uninstalling SQLAlchemy-1.4.53:
      Successfully uninstalled SQLAlchemy-1.4.53
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source

## 1. 라이브러리 불러오기 및 DB Create

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import sqlite3
import time

# Open (or create) the SQLite file that will hold the scraped articles.
# `conn` and `cursor` are kept at notebook level because the save step
# further down reuses them.
conn = sqlite3.connect('news_data.db')
cursor = conn.cursor()

# Schema of the `news` table: one row per article, with an auto-incrementing
# integer key. IF NOT EXISTS makes re-running this cell harmless.
create_news_table_sql = '''
CREATE TABLE IF NOT EXISTS news (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT,
    title TEXT,
    press TEXT,
    author TEXT,
    content TEXT,
    image TEXT,
    url TEXT
)
'''
cursor.execute(create_news_table_sql)


## 2. Data Crawling 및 DB 저장

In [25]:

# Crawl the BigKinds "recent news" list: apply the national-dailies press filter,
# open each article's modal, and collect its fields into `news_data`.

# Start Chrome; webdriver_manager resolves/downloads the matching driver binary.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Scraped articles, one dict per article. Created before any page interaction so
# the list exists for the save step even if the crawl fails part-way.
news_data = []

current_page = 1
max_page = 1  # number of result pages to crawl; adjust as needed

try:
    driver.get('https://www.bigkinds.or.kr/v2/news/recentNews.do')
    time.sleep(5)  # give the page time to load

    # Open the press (publisher) filter panel.
    press_filter_button = driver.find_element(By.CSS_SELECTOR, 'button.btn-press.btn-toggle')
    press_filter_button.click()

    # Wait for the provider-group list, then for the '전국일간지' (national dailies) label.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'category_provider_group')))
    national_press_label = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'label[for="전국일간지"]'))
    )

    # Click through JavaScript instead of a native click.
    # NOTE(review): presumably the native click gets intercepted here — confirm.
    driver.execute_script("arguments[0].click();", national_press_label)

    # Close the filter panel.
    close_filter_button = driver.find_element(By.CSS_SELECTOR, 'button.close-filter-btn')
    close_filter_button.click()

    # Wait 10 seconds so the filtered list is applied.
    time.sleep(10)

    while current_page <= max_page:
        news_blocks = driver.find_elements(By.CSS_SELECTOR, 'div.news-inner')
        for block in news_blocks:
            # Up to three attempts per article; any failure restarts the attempt.
            attempts = 0
            while attempts < 3:
                try:
                    # Title and publisher come from the list entry itself.
                    title = block.find_element(By.CSS_SELECTOR, 'strong.title').text
                    publisher = block.find_element(By.CSS_SELECTOR, 'div.info a').text

                    # Look the link up once: read its href, then click it to open the article.
                    article_link = block.find_element(By.CSS_SELECTOR, 'a')
                    news_url = article_link.get_attribute('href')
                    article_link.click()

                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.news-view-body'))
                    )

                    # First <li> is used as the date, second as the reporter;
                    # fall back to placeholder text when either is missing.
                    date_and_reporter = driver.find_elements(By.CSS_SELECTOR, 'ul.info > li')
                    news_date = date_and_reporter[0].text if len(date_and_reporter) > 0 else '날짜 정보 없음'
                    reporter_name = date_and_reporter[1].text if len(date_and_reporter) > 1 else '기자 정보 없음'
                    news_content = driver.find_element(By.CSS_SELECTOR, 'div.news-view-body').text

                    # First image inside the article body, or a placeholder when there is none.
                    try:
                        image_element = driver.find_element(By.CSS_SELECTOR, 'div.news-view-body img')
                        image_src = image_element.get_attribute('src')
                    except NoSuchElementException:
                        image_src = '이미지 없음'

                    record = {
                        'Date': news_date,
                        'Title': title,
                        'Press': publisher,
                        'Author': reporter_name,
                        'Content': news_content,
                        'Image': image_src,
                        'URL': news_url,
                    }

                    # Close the article modal before moving on to the next entry.
                    WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.modal-footer > button'))
                    ).click()
                    time.sleep(2)

                    # Store the record only after the whole attempt succeeded, so a
                    # failure while closing the modal cannot append the same article
                    # again on the retry.
                    news_data.append(record)
                    break
                except Exception as e:
                    print(f"Error collecting data: {str(e)}. Retrying...") 
                    attempts += 1

        # Move to the next result page unless this was the last requested one.
        if current_page < max_page:
            next_page_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, f'a.page-link[data-page="{current_page + 1}"]'))
            )
            next_page_button.click()
            time.sleep(5)
        current_page += 1
finally:
    # Always shut the browser down, even when the crawl raises part-way.
    driver.quit()


# Write the scraped articles to the `news` table in one transaction.
# The inserts run inside the try block so that a failing insert is rolled back
# and the connection is still closed; all rows go through a single executemany.
try:
    rows = [
        (item['Date'], item['Title'], item['Press'], item['Author'],
         item['Content'], item['Image'], item['URL'])
        for item in news_data
    ]
    cursor.executemany('''
    INSERT INTO news (date, title, press, author, content, image, url)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ''', rows)
    conn.commit()
    print("데이터가 데이터베이스에 성공적으로 저장되었습니다.")
except Exception as e:
    # Discard any partially inserted rows and report the failure.
    conn.rollback()
    print(f"데이터베이스 저장 중 오류 발생: {str(e)}")
finally:
    # The connection is closed here; it must be reopened before saving again.
    conn.close()


데이터가 데이터베이스에 성공적으로 저장되었습니다.
