## Mas Link

In [None]:
# Upgrade ipywidgets and Jupyter, then enable the widgets notebook extension.
# Presumably needed so the tqdm.notebook progress bars used later can render — confirm.
!pip install ipywidgets --upgrade
!pip install jupyter --upgrade
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

In [None]:
import time
import random
import re
import os
import csv
import pandas as pd
from tqdm.notebook import tqdm
from datetime import datetime, timedelta

import urllib.request # URL requests
import urllib.parse # URL parsing

import requests # HTTP requests
from bs4 import BeautifulSoup as bs # HTML parsing

from selenium import webdriver # Chromedriver
from selenium.webdriver.common.by import By # HTML element 선택
from selenium.webdriver.common.keys import Keys # 키보드 키 입력 위함
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities # 크롬 옵션 설정
from selenium.common.exceptions import UnexpectedAlertPresentException # 예외 처리 위함
from selenium.common.exceptions import NoSuchElementException, InvalidSessionIdException

In [None]:
driver = webdriver.Chrome() # Create (initialise) the Chrome driver object
driver.implicitly_wait(3) # Implicit wait (3 seconds)

In [None]:
# Search keywords; the URL-collection loop below runs once per keyword
keyword = ['반찬', '식습관']

In [None]:
# Create the 'url' folder that receives one '<keyword>_url.csv' file per keyword.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs('url', exist_ok=True)

In [None]:
# Search each keyword and collect the result URLs together with their dates.
# One CSV (columns: keyword, url, date) is written per keyword into the 'url' folder.
for word in keyword:
    url_list = []
    date_list = []

    # Percent-encode the keyword so non-ASCII or reserved characters (e.g. '/', spaces)
    # always form a single valid URL path segment.
    driver.get('https://www.teamblind.com/kr/search/' + urllib.parse.quote(word, safe=''))

    # Click the sort control (the original comment calls it the "recommended order" button)
    driver.find_element(By.XPATH, '//*[@id="wrap"]/section/div/div/div[3]/div[2]/span/a').click()

    # Click the "latest first" option
    driver.find_element(By.XPATH, '//*[@id="search_sort"]/a[2]').click()

    # Scroll to the bottom repeatedly until the page height stops growing,
    # i.e. no further results are being lazy-loaded.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(1, 1.7))  # randomised pause while new results load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Collect the result links and their dates
    url_elements = driver.find_elements(By.CSS_SELECTOR, ".pv")  # .pv: title of a search result
    date_elements = driver.find_elements(By.CSS_SELECTOR, ".past")  # element holding the date

    # The two lists are queried independently and paired by position. If their sizes
    # differ, zip() silently truncates and dates may be attached to the wrong URL,
    # so make the mismatch visible instead of hiding it.
    if len(url_elements) != len(date_elements):
        print(
            f"Warning: '{word}' has {len(url_elements)} result links but "
            f"{len(date_elements)} dates; pairing by position may be misaligned"
        )

    for url_element, date_element in zip(url_elements, date_elements):
        url_list.append(url_element.get_attribute('href'))  # link target
        date_list.append(date_element.text)  # date text as displayed

    # Save as a DataFrame
    url_save = pd.DataFrame({
        'keyword': [word] * len(url_list),
        'url': url_list,
        'date': date_list,
    })

    file_path = os.path.join('url', f'{word}_url.csv')
    url_save.to_csv(file_path, index=False)

## Crawling

In [None]:
import time
import random
import re
import csv
import pandas as pd
import os
from tqdm.notebook import tqdm
from datetime import datetime

import urllib.request # URL requests
import urllib.parse # URL parsing

import requests # HTTP requests
from bs4 import BeautifulSoup as bs # HTML parsing

from selenium import webdriver # Chromedriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By # HTML element 선택
from selenium.webdriver.common.keys import Keys # 키보드 키 입력 위함
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities # 크롬 옵션 설정
from selenium.common.exceptions import UnexpectedAlertPresentException # 예외 처리 위함
from selenium.common.exceptions import (
    NoSuchElementException,
    InvalidSessionIdException,
    InvalidArgumentException,
    TimeoutException,
    WebDriverException,
    StaleElementReferenceException
)

In [None]:
# Accepted date patterns: any date in 2022-2023, or January-April of 2024.
# A trailing '.' after the day is tolerated.
pattern_2022_2023 = re.compile(r'^(2022|2023)\.\d{2}\.\d{2}\.?$')
pattern_2024 = re.compile(r'^2024\.(01|02|03|04)\.\d{2}\.?$')
# Year-less 'MM.DD' form; compiled once instead of on every call.
_pattern_month_day = re.compile(r'^\d{2}\.\d{2}$')


def check_and_format_date(date_text):
    """Return the date string if it lies in the accepted range, otherwise None.

    Args:
        date_text: Raw date text scraped from a search result. May carry the
            '작성시간' label. Non-string values (e.g. NaN from an empty CSV
            cell) are accepted and rejected with None rather than raising.

    Returns:
        The cleaned date string ('YYYY.MM.DD', optionally with a trailing '.')
        when it is in 2022-2023 or January-April 2024; 'MM.DD' input is
        prefixed with '2024.' first. None for everything else, including
        relative timestamps containing '분', '시간' or '일'.
    """
    # pandas passes NaN/float for empty or numeric-looking cells; those have no
    # .replace() and can never be a valid date string.
    if not isinstance(date_text, str):
        return None

    # Drop the '작성시간' label
    date_text = date_text.replace('작성시간', '').strip()

    # Ignore relative timestamps (minutes / hours / days ago)
    if any(unit in date_text for unit in ('분', '시간', '일')):
        return None

    # 'MM.DD' carries no year; it is assumed to belong to 2024
    if _pattern_month_day.match(date_text):
        date_text = f"2024.{date_text}"

    # Final range check
    if pattern_2022_2023.match(date_text) or pattern_2024.match(date_text):
        return date_text
    return None

In [None]:
# Keywords whose collected URL lists are pre-processed and crawled below
keyword = ['반찬', '식습관']

In [None]:
# Create the output folder for the date-filtered URL lists
# (exist_ok=True keeps the cell safe to re-run).
os.makedirs('url_data_processed', exist_ok=True)

# Load each keyword's collected URL CSV and keep only rows with an accepted date
for word in keyword:
    print('키워드:', word, '전처리 시작')

    # Load the collected CSV. Every column is read as text and empty cells stay
    # empty strings: otherwise pandas may parse 'MM.DD' dates as floats or turn
    # blanks into NaN, neither of which the date check can handle as text.
    url_file_path = os.path.join('url', f'{word}_url.csv')
    data = pd.read_csv(url_file_path, dtype=str, keep_default_na=False)

    # Validate / normalise the date; rejected dates become None
    data['date'] = data['date'].apply(check_and_format_date)

    # Drop rows whose date was rejected (outside the accepted range)
    data = data.dropna(subset=['date'])

    # Save the filtered rows; the path is built the same way the crawling step reads it
    processed_file_path = os.path.join('url_data_processed', f'{word}_url_data_processed.csv')
    data.to_csv(processed_file_path, index=False)

In [None]:
driver = webdriver.Chrome() # Create (initialise) the Chrome driver object
driver.implicitly_wait(3) # Implicit wait (3 seconds)

url = 'https://www.teamblind.com/kr/'
driver.get(url) # Open the URL

# After running up to this cell, log in manually in the driver window (verification code required)!

In [None]:
# Create the 'data' folder that receives one '<keyword>_data.csv' file per keyword.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs('data', exist_ok=True)

In [None]:
# Crawl every filtered post URL per keyword and save board names, title, date,
# body and comments to 'data/<keyword>_data.csv'.


def _read_comment_texts(driver, attempts=3):
    """Return the text of every '.cmt-txt' element, re-querying if the DOM changes mid-read.

    A StaleElementReferenceException means the comment list was re-rendered while
    being read, so the whole list is fetched again instead of continuing with
    stale handles. Returns an empty list if every attempt fails.
    """
    for _ in range(attempts):
        try:
            return [element.text for element in driver.find_elements(By.CSS_SELECTOR, '.cmt-txt')]
        except StaleElementReferenceException:
            time.sleep(0.5)
    return []


# The implicit wait is a session-wide setting, so set it once rather than per URL.
driver.implicitly_wait(3)

for word in keyword:
    print('키워드:', word, '크롤링 시작')

    # Load the filtered URL list for this keyword
    processed_file_path = os.path.join('url_data_processed', f'{word}_url_data_processed.csv')
    url_list = pd.read_csv(processed_file_path)

    # One dict per successfully crawled post
    rows = []

    # Iterate URL and date together so each post gets its own scalar date
    # (a boolean-mask lookup would yield a Series, and several rows for duplicate URLs).
    for url, formatted_date in tqdm(zip(url_list['url'], url_list['date']), total=len(url_list)):
        try:
            driver.get(url)
            time.sleep(random.uniform(1, 1.7))  # randomised pause between page loads

            # Board name (top-level category)
            pre_board_text = driver.find_element(
                By.XPATH, '//*[@id="wrap"]/section/div/div[2]/div[1]/h1/a[1]').text

            # Board sub-category; not every post has one
            try:
                post_board_text = driver.find_element(
                    By.XPATH, '//*[@id="wrap"]/section/div/div[2]/div[1]/h1/a[2]').text
            except NoSuchElementException:
                post_board_text = None

            # Title
            title_text = driver.find_element(
                By.XPATH, '//*[@id="wrap"]/section/div/div[2]/div[1]/h2').text

            # Body
            content_text = driver.find_element(By.XPATH, '//*[@id="contentArea"]').text

            # Click the comments button until it can no longer be found or clicked.
            # find_element raises when the element is missing, which is what ends the loop.
            # NOTE(review): the presence check uses div[2] but the click uses div[1];
            # both XPaths are kept exactly as they were — confirm against the live page.
            # NOTE(review): there is no click cap here (the reply loop below has one).
            try:
                driver.find_element(By.XPATH, '//*[@id="wrap"]/section/div/div[2]/div[4]/button')
                while True:
                    driver.find_element(
                        By.XPATH, '//*[@id="wrap"]/section/div/div[1]/div[4]/button').click()
                    time.sleep(0.5)
            except WebDriverException:
                # Best effort: a missing or unclickable button just means nothing more to expand.
                pass

            # Click "show more replies" at most 20 times, stopping early once the
            # button is gone or cannot be clicked.
            try:
                for _ in range(20):
                    driver.find_element(By.CSS_SELECTOR, '.btn-reply').click()
                    time.sleep(0.5)
            except WebDriverException:
                pass

            # Concatenate all comments, skipping the placeholder for author-deleted comments
            review_text = ''.join(
                ' ' + comment
                for comment in _read_comment_texts(driver)
                if comment != '작성자가 삭제한 댓글입니다.'
            )

            # Record the post only after every field was read, so columns stay aligned
            rows.append({
                '키워드': word,
                '게시판_대분류': pre_board_text,
                '게시판_소분류': post_board_text,
                '제목': title_text,
                '날짜': formatted_date,
                '본문': content_text,
                '댓글': review_text,
            })

        except WebDriverException as e:
            # WebDriverException is the base class of the session / argument /
            # missing-element errors handled here; skip the post and report why.
            print(f"Invalid URL: {url} ({type(e).__name__})")
            continue

    # Build the DataFrame (explicit columns keep the header even when nothing was crawled)
    data = pd.DataFrame(
        rows,
        columns=['키워드', '게시판_대분류', '게시판_소분류', '제목', '날짜', '본문', '댓글'],
    )

    file_path = os.path.join('data', f'{word}_data.csv')
    data.to_csv(file_path, index=False)