In [None]:
# -*- coding: utf-8 -*-
import csv
import os
import re
import time
from datetime import datetime, timedelta
from getpass import getpass
from io import BytesIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Text file holding one already-crawled article URL per line.
CRAWLED_LINKS_FILE = 'crawled_links.txt'

# Directory that receives the downloaded images.
os.makedirs('images', exist_ok=True)

# Restore the URLs recorded by earlier runs; start empty when no file exists yet.
try:
    links_file = open(CRAWLED_LINKS_FILE, 'r', encoding='utf-8')
except FileNotFoundError:
    crawled_links = set()
else:
    with links_file:
        crawled_links = set(links_file.read().splitlines())

# Start a visible Chrome session driven by a webdriver-manager-installed driver.
options = webdriver.ChromeOptions()
# Chrome expects the window size as "width,height"; the "1200x800" form is
# not a valid value for this switch.
options.add_argument('--window-size=1200,800')
# options.add_argument('headless')  # left disabled while debugging
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Log in.
# The account name is kept in `user_id` so the builtin `id` is not shadowed,
# and the password is read with getpass so it is not echoed to the screen.
user_id = input('ID: ')
pw = getpass('PW: ')
driver.get('https://account.everytime.kr/login')
# Wait for the login form before typing into it.
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.NAME, 'id')))
driver.find_element(By.NAME, 'id').send_keys(user_id)
driver.find_element(By.NAME, 'password').send_keys(pw)
driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]').click()

# Date range input.
def _prompt_date(label):
    """Ask for a date until a valid ``YYYY-MM-DD`` value is entered.

    Args:
        label: Text shown in front of the ``(YYYY-MM-DD)`` format hint.

    Returns:
        A ``(text, parsed)`` tuple: the entered string with surrounding
        whitespace removed, and the matching ``datetime``.
    """
    while True:
        text = input(f'{label} (YYYY-MM-DD): ').strip()
        try:
            return text, datetime.strptime(text, "%Y-%m-%d")
        except ValueError:
            # Re-prompt instead of aborting the session on a typo.
            print(f"Invalid date {text!r}; expected YYYY-MM-DD.")


start_date, start_datetime = _prompt_date('Start Date')
end_date, end_datetime = _prompt_date('End Date')

# 자연어 날짜 파싱 함수
def _is_real_date(year, month, day):
    """Return True when year/month/day form an existing calendar date."""
    try:
        datetime(year, month, day)
    except ValueError:
        return False
    return True


def parse_natural_date(text):
    """Extract a deadline from free-form Korean text.

    The rules are tried in this order and the first one that applies wins:

    1. ``상시`` -> the literal string ``'상시'``.
    2. ``오늘`` / ``내일`` / ``모레`` -> today plus 0 / 1 / 2 days.
    3. ``N시간 후`` or ``N시간 뒤`` -> the current time plus N hours.
    4. ``N시까지`` -> today at N o'clock. ``오후`` anywhere in the text shifts
       hours below 12 into the afternoon and ``오전`` maps 12 to 0; without
       either marker N is used as written. An hour above 23 falls back to 18.
    5. A date with a four-digit year (``2025.05.10``, ``2025-05-10``,
       ``2025년 5월 10일`` ...).
    6. A month/day pair joined by ``.``, ``/`` or ``월`` (``5/10``,
       ``5월 10일``). The current year is used unless the month is earlier
       than the current month, in which case next year is used.

    Rules 5 and 6 skip candidates that are not real calendar dates, and the
    month/day rule requires an explicit separator so that plain digit runs
    such as phone numbers are not read as dates.

    Args:
        text: Text to search.

    Returns:
        ``'상시'``, a ``YYYY-MM-DD`` string, a ``YYYY-MM-DD HH:MM`` string,
        or ``''`` when nothing was recognised.
    """
    now = datetime.now()

    if '상시' in text:
        return '상시'
    for keyword, days_ahead in (('오늘', 0), ('내일', 1), ('모레', 2)):
        if keyword in text:
            return (now + timedelta(days=days_ahead)).strftime('%Y-%m-%d')

    found = re.search(r'(\d+)\s*시간\s*(?:후|뒤)', text)
    if found:
        due = now + timedelta(hours=int(found.group(1)))
        return due.strftime('%Y-%m-%d %H:%M')

    found = re.search(r'(?<!\d)(\d{1,2})시\s*까지', text)
    if found:
        hour = int(found.group(1))
        if '오후' in text and hour < 12:
            hour += 12
        elif '오전' in text and hour == 12:
            hour = 0
        if hour > 23:
            hour = 18  # not a clock hour; use the default end-of-day hour
        return now.replace(hour=hour, minute=0).strftime('%Y-%m-%d %H:%M')

    # Full dates are checked before month/day pairs; otherwise the first
    # digits of the year would be consumed as a month.
    full_date = r'(?<!\d)(\d{4})[.\-/\s년]+(\d{1,2})[.\-/\s월]+(\d{1,2})(?!\d)'
    for found in re.finditer(full_date, text):
        year, month, day = map(int, found.groups())
        if _is_real_date(year, month, day):
            return f"{year:04d}-{month:02d}-{day:02d}"

    month_day = r'(?<!\d)(\d{1,2})\s*[./월]\s*(\d{1,2})(?!\d)'
    for found in re.finditer(month_day, text):
        month, day = map(int, found.groups())
        year = now.year if month >= now.month else now.year + 1
        if _is_real_date(year, month, day):
            return f"{year:04d}-{month:02d}-{day:02d}"

    return ''

# 게시글 분석

def analyze_post(title, content, post_date):
    """Classify a post and pull out its deadline and contact number.

    Args:
        title: Post title.
        content: Post body text.
        post_date: Posting date string; used as the deadline unless a
            recruiting post yields a parsed one.

    Returns:
        A ``(category, deadline, phone)`` tuple. ``category`` is ``"모집"``
        when either the title or the body contains that word and ``"홍보"``
        otherwise. ``deadline`` is the result of ``parse_natural_date`` on
        the body for recruiting posts when it is non-empty, else
        ``post_date``. ``phone`` is the first hyphenated ``01X-XXXX-XXXX``
        style number in the body, or ``""``.
    """
    is_recruiting = any("모집" in field for field in (title, content))
    category = "모집" if is_recruiting else "홍보"

    deadline = post_date
    if is_recruiting:
        # An empty parse result keeps the posting date as the deadline.
        deadline = parse_natural_date(content) or post_date

    number = re.search(r'01[0-9]-\d{3,4}-\d{4}', content)
    phone = number.group() if number else ""
    return category, deadline, phone

# 이미지 다운로드

def download_image(img_url):
    """Download an image, resize it, and store it under ``images/``.

    Images wider than tall are resized to 240x160; all others to 210x297.
    The result is saved as a ``.jpg`` file.

    Args:
        img_url: URL of the image to fetch.

    Returns:
        The saved file name (without the ``images/`` directory), or ``""``
        when the download or conversion fails. Failures are reported on
        stdout and never raised, so one bad image does not abort a post.
    """
    try:
        # Bounded wait so one unresponsive image host cannot hang the crawl.
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        width, height = img.size
        target_size = (240, 160) if width > height else (210, 297)
        # JPEG cannot store alpha or palette modes; convert those first.
        if img.mode not in ('RGB', 'L', 'CMYK'):
            img = img.convert('RGB')
        img = img.resize(target_size)
        # Nanosecond timestamp: a per-second stamp would make images fetched
        # within the same second overwrite each other.
        filename = f"img_{time.time_ns()}.jpg"
        path = os.path.join("images", filename)
        img.save(path)
        return filename
    except Exception as e:
        print(f"[!] 이미지 다운로드 실패: {img_url} ({e})")
        return ""

# 크롤링 및 저장

def write_csv(row):
    """Append ``row`` as one CSV record to ``everytime_output.csv``.

    Args:
        row: Iterable of field values for a single record.
    """
    with open('everytime_output.csv', mode='a', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerow(row)

def crawl_article(url):
    """Scrape one article page and record it.

    Loads ``url`` in the shared ``driver``, reads the title, body, time and
    first image from the page's ``<article>`` element, appends a row via
    ``write_csv``, then appends the URL to ``CRAWLED_LINKS_FILE`` and to
    ``crawled_links``. URLs already in ``crawled_links`` are skipped.

    A missing ``<h2>``, ``<p>`` or ``<time>`` yields an empty string for that
    field rather than failing the whole post (presumably some boards have no
    title — verify against the site). Failures are printed, never raised,
    and leave the URL unrecorded so it is retried on a later run.

    Args:
        url: Absolute URL of the article page.
    """
    if url in crawled_links:
        return

    def text_of(tag):
        # BeautifulSoup returns None for absent elements.
        return tag.text.strip() if tag is not None else ''

    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'article')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article = soup.find('article')
        title = text_of(article.h2)
        content = text_of(article.find('p'))
        time_text = text_of(article.find('time'))
        # Relative stamps ("N분 전", "방금") are replaced by the current time;
        # anything else is stored exactly as shown on the page.
        if '분' in time_text or '방금' in time_text:
            post_date = datetime.now().strftime('%Y-%m-%d %H:%M')
        else:
            post_date = time_text
        img_tag = article.find('img')
        img_src = img_tag.get('src') if img_tag else None
        image_file = download_image(img_src) if img_src else ''
        category, deadline, phone = analyze_post(title, content, post_date)
        write_csv([title, content, post_date, category, deadline, phone, image_file])
        # Record the URL only after the row was written.
        with open(CRAWLED_LINKS_FILE, 'a', encoding='utf-8') as f:
            f.write(url + '\n')
        crawled_links.add(url)
        print(f"[✓] {url}")
    except TimeoutException:
        # The timeout message is only an empty text plus a driver stack
        # trace, so print a short reason instead.
        print(f"[!] 실패: {url} (article 요소 대기 시간 초과)")
    except Exception as e:
        print(f"[!] 실패: {url} ({e})")

# 반복 실행 루프

def run():
    """Walk the board's list pages and crawl every article linked from them.

    Pages ``1`` to ``999`` of the configured board are visited in order. The
    pass stops at the first page on which no article link appears within
    five seconds. Other page-level errors are printed and that page is
    skipped.
    """
    school_domain = "kumoh"           # <- change this for your school
    board_id = "418897"               # <- change this for the target board
    base_url = f"https://{school_domain}.everytime.kr/{board_id}/p/"
    for page in range(1, 1000):
        page_url = base_url + str(page)
        try:
            driver.get(page_url)
            # Wait for the links themselves and restrict them to the article
            # list container, so links elsewhere on the page are not picked up.
            # NOTE(review): assumes list entries sit inside `.articles` — confirm.
            links = WebDriverWait(driver, 5).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.articles a.article'))
            )
            # Read every href before crawling: crawl_article() navigates the
            # same driver away, which invalidates these element handles.
            hrefs = [link.get_attribute('href') for link in links]
        except TimeoutException:
            print(f"[i] {page}페이지에 게시글 목록이 없어 순회를 종료합니다")
            break
        except Exception as e:
            # `Exception` (not a bare except) so Ctrl-C still stops the crawl.
            print(f"[!] 목록 페이지 실패: {page_url} ({e})")
            continue
        for href in hrefs:
            if href:
                crawl_article(href)

# Crawl repeatedly, pausing ten minutes between passes, until interrupted.
try:
    while True:
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] 크롤링 시작")
        run()
        print("10분 대기 중...")
        time.sleep(600)
except KeyboardInterrupt:
    print("크롤링 중단")
finally:
    # Always close the browser so no Chrome/chromedriver process is left behind.
    driver.quit()


[2025-05-06 22:17:56] 크롤링 시작
[!] 실패: https://everytime.kr/lecture/view/1671184 (Message: 
Stacktrace:
	GetHandleVerifier [0x00BDD363+60275]
	GetHandleVerifier [0x00BDD3A4+60340]
	(No symbol) [0x00A106F3]
	(No symbol) [0x00A58690]
	(No symbol) [0x00A58A2B]
	(No symbol) [0x00AA0EE2]
	(No symbol) [0x00A7D0D4]
	(No symbol) [0x00A9E6EB]
	(No symbol) [0x00A7CE86]
	(No symbol) [0x00A4C623]
	(No symbol) [0x00A4D474]
	GetHandleVerifier [0x00E28FE3+2467827]
	GetHandleVerifier [0x00E245E6+2448886]
	GetHandleVerifier [0x00E3F80C+2560028]
	GetHandleVerifier [0x00BF3DF5+153093]
	GetHandleVerifier [0x00BFA3BD+179149]
	GetHandleVerifier [0x00BE4BB8+91080]
	GetHandleVerifier [0x00BE4D60+91504]
	GetHandleVerifier [0x00BCFA10+4640]
	BaseThreadInitThunk [0x759A7BA9+25]
	RtlInitializeExceptionChain [0x77E5C2EB+107]
	RtlClearBits [0x77E5C26F+191]
)
[!] 실패: https://everytime.kr/lecture/view/1671184 (Message: 
Stacktrace:
	GetHandleVerifier [0x00BDD363+60275]
	GetHandleVerifier [0x00BDD3A4+60340]
	(No symbol)