In [None]:
# -*- coding: utf-8 -*-
import os
import re
import time
import csv
import sys
import uuid
import requests
import pymysql
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv

# Load environment variables (DB and Everytime credentials are read via os.getenv below).
load_dotenv()
# NOTE(review): argv is overwritten so that every cleanup flag checked further
# down is always set, regardless of how the script was started — presumably a
# notebook convenience; confirm before running this as a long-lived service.
sys.argv = ['notebook', '--clear-csv', '--clear-images', '--reset', '--clear-links']

# State files: keys of already-crawled posts, and the start of the next crawl window.
CRAWLED_LINKS_FILE = 'crawled_links.txt'
START_TIME_FILE = 'start_time.txt'
# Timestamp format used when persisting/printing crawl times.
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
LOG_DIR = 'logs'
LOG_FILE = os.path.join(LOG_DIR, 'crawl_log.log')
# Make sure the log and image output directories exist before anything writes to them.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs('images', exist_ok=True)

def load_official_club_info():
    """Scrape the university's club listing page into a lookup table.

    Each club is registered under two keys: its name exactly as shown on the
    page, and a simplified form (non-word characters removed, upper-cased)
    whose entry also records the original spelling under ``"original"``.

    Returns:
        dict: ``{key: {"category": str[, "original": str]}}``. An empty dict
        is returned when the page cannot be fetched or parsed.
    """
    url = "https://www.kumoh.ac.kr/ko/sub04_01_02.do"
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without a timeout an unresponsive server would hang the crawler forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Do not parse an HTTP error page as if it were the club list.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        club_dict = {}

        for area in soup.select("div.contents-area"):
            category_tag = area.select_one("h4.title0401")
            table = area.select_one("div.table-type01")
            if not category_tag or not table:
                continue
            category = category_tag.text.strip()
            for row in table.select("tbody tr"):
                cols = row.select("td")
                if not cols:
                    continue
                name = cols[0].text.strip()
                # An empty key would be a substring of every post text and so
                # "match" every post; skip blank cells entirely.
                if not name:
                    continue
                club_dict[name] = {"category": category}
                simplified = re.sub(r'[^\w]', '', name).upper()
                if simplified:
                    club_dict[simplified] = {"category": category, "original": name}
        return club_dict
    except Exception as e:
        # Best-effort: callers treat an empty dict as "no official club data".
        print(f"[ERROR] 동아리 정보 로딩 실패: {e}")
        return {}

# Optional cleanup steps, each switched on by a flag in sys.argv.

# --clear-csv: remove the previous CSV export, if there is one.
if '--clear-csv' in sys.argv:
    if os.path.exists('everytime_output.csv'):
        os.remove('everytime_output.csv')
        print("🧹 everytime_output.csv 초기화 완료")

# --clear-images: delete every regular file inside images/ (sub-directories are kept).
if '--clear-images' in sys.argv:
    stale_paths = [os.path.join('images', entry) for entry in os.listdir('images')]
    for stale_path in stale_paths:
        if os.path.isfile(stale_path):
            os.remove(stale_path)
    print("🧹 images 폴더 초기화 완료")

# --reset: forget the saved crawl start time.
if '--reset' in sys.argv:
    if os.path.exists(START_TIME_FILE):
        os.remove(START_TIME_FILE)
        print("🧹 start_time.txt 초기화 완료")

# --clear-links: forget which posts were already crawled.
if '--clear-links' in sys.argv:
    if os.path.exists(CRAWLED_LINKS_FILE):
        os.remove(CRAWLED_LINKS_FILE)
        print("🧹 crawled_links.txt 초기화 완료")

def log(message):
    """Append a timestamped copy of *message* to LOG_FILE and print the raw message."""
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    with open(LOG_FILE, mode='a', encoding='utf-8') as log_file:
        log_file.write(f"[{stamp}] {message}\n")
    print(message)

def get_club_id_by_name(cursor, club_name):
    """Look up the id of the first club whose name contains *club_name*.

    Args:
        cursor: Database cursor accepting ``%s`` placeholders (a pymysql
            cursor in this file).
        club_name: Club name, or a fragment of it, to search for.

    Returns:
        The ``clubId`` of the first matching row, or ``None`` when
        *club_name* is empty or no row matches.
    """
    if not club_name:
        return None
    # Escape LIKE metacharacters so "%" and "_" inside a club name are matched
    # literally instead of acting as wildcards. Backslash is MySQL's default
    # LIKE escape character (assumes NO_BACKSLASH_ESCAPES is not enabled).
    escaped = (
        club_name.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    cursor.execute("SELECT clubId FROM Club WHERE clubName LIKE %s", (f"%{escaped}%",))
    row = cursor.fetchone()
    return row[0] if row else None

def write_to_db_by_category(row):
    """Store one analysed post in MySQL.

    Recruiting posts (``row["category"] == "모집"``) become a ``ClubPromotion``
    row; everything else becomes a ``Post`` plus its ``ClubPost`` link and one
    ``Poster`` row per saved image. All statements for a post run in a single
    transaction, so a failure part-way through leaves no partial rows.

    Args:
        row: dict with at least ``club_name``, ``category``, ``title``,
            ``content`` and ``deadline``; optional ``target``, ``dues``,
            ``interview`` and ``images`` (list of stored image names).

    Returns:
        None. Errors are logged, never raised; posts whose club cannot be
        resolved are skipped.
    """
    conn = None
    try:
        conn = pymysql.connect(
            host=os.getenv("DB_HOST"),
            # Fall back to MySQL's default port when DB_PORT is not set.
            port=int(os.getenv("DB_PORT") or 3306),
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASSWORD"),
            database=os.getenv("DB_NAME"),
            charset='utf8mb4',
            # Commit explicitly below so multi-table inserts are all-or-nothing.
            autocommit=False
        )
        with conn.cursor() as cursor:
            club_id = get_club_id_by_name(cursor, row["club_name"])
            if not club_id:
                log(f"[SKIP] club_name 매칭 실패: {row['club_name']}")
                return

            if row["category"] == "모집":
                sql = """
                    INSERT INTO ClubPromotion (clubId, target, dues, interview, endDate, isRecruiting)
                    VALUES (%s, %s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (
                    club_id,
                    row.get("target", "무관"),
                    row.get("dues", 0),
                    row.get("interview", 0),
                    # "상시" (always open) is stored as a NULL end date.
                    row["deadline"] if row["deadline"] != "상시" else None,
                    1
                ))
                saved_message = f"✅ ClubPromotion 저장: {row['title'][:20]}"
            else:
                post_num = uuid.uuid4().hex
                cursor.execute("""
                    INSERT INTO Post (postNum, title, content, type, file, `like`, fixaction, date)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """, (
                    post_num,
                    row["title"],
                    row["content"],
                    4,  # post type is fixed to "promotional poster"
                    None,
                    0,
                    0,
                    datetime.now()
                ))
                cursor.execute("""
                    INSERT INTO ClubPost (postNum, clubId)
                    VALUES (%s, %s)
                """, (post_num, club_id))
                for img_path in row.get("images", []):
                    # Failed downloads are reported as "" — do not store them.
                    if not img_path:
                        continue
                    cursor.execute("""
                        INSERT INTO Poster (postNum, img)
                        VALUES (%s, %s)
                    """, (post_num, img_path))
                saved_message = f"✅ Post + ClubPost + Poster 저장: {row['title'][:20]}"
        conn.commit()
        log(saved_message)
    except Exception as e:
        # Uncommitted work is discarded when the connection is closed below.
        log(f"[DB ERROR] 저장 실패: {e}")
    finally:
        # Always release the connection, including on the early-return and
        # error paths.
        if conn is not None and conn.open:
            conn.close()

def load_start_time():
    """Return the datetime at which the crawl window starts.

    The value is read from START_TIME_FILE when that file exists and holds a
    timestamp in DATETIME_FORMAT. Otherwise the user is prompted for a
    ``YYYY-MM-DD`` date (re-prompting until it parses), and the answer is
    saved through ``save_start_time`` before being returned.

    Returns:
        datetime: start of the crawl window.
    """
    if os.path.exists(START_TIME_FILE):
        log("📁 start_time.txt 파일 발견 → 불러옴")
        with open(START_TIME_FILE, 'r') as f:
            raw = f.read().strip()
        try:
            return datetime.strptime(raw, DATETIME_FORMAT)
        except ValueError:
            # A truncated/hand-edited state file should not crash the crawler;
            # fall back to asking the user.
            log("[!] start_time.txt 형식 오류 → 사용자 입력 필요")
    else:
        log("🆕 start_time.txt 없음 → 사용자 입력 필요")

    while True:
        start_date = input('Start Date (YYYY-MM-DD): ')
        try:
            dt = datetime.strptime(start_date.strip(), "%Y-%m-%d")
        except ValueError:
            log("[!] 날짜 형식이 올바르지 않습니다 (YYYY-MM-DD)")
            continue
        save_start_time(dt)
        return dt

def save_start_time(dt):
    """Overwrite START_TIME_FILE with *dt* rendered in DATETIME_FORMAT."""
    with open(START_TIME_FILE, mode='w') as state_file:
        serialized = dt.strftime(DATETIME_FORMAT)
        state_file.write(serialized)

def login_everytime(driver):
    """Log in to Everytime with the credentials from the environment.

    Reads ``EVERYTIME_ID`` / ``EVERYTIME_PW``, submits the login form and
    keeps retrying until no alert appears and the browser has left the login
    URL.

    Args:
        driver: Selenium WebDriver supporting ``execute_cdp_cmd`` (Chrome).

    Returns:
        None. Returns early, without logging in, when either credential is
        missing.
    """
    user_id = os.getenv("EVERYTIME_ID")
    password = os.getenv("EVERYTIME_PW")

    if not user_id or not password:
        log("❌ .env에 EVERYTIME_ID 또는 EVERYTIME_PW가 설정되지 않았습니다.")
        return

    # Hide navigator.webdriver on every new document to get past the site's
    # automation detection.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        """
    })

    while True:
        driver.get("https://account.everytime.kr/login")
        log("🌐 Everytime 로그인 페이지 로딩 중...")

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.NAME, 'id'))
            ).send_keys(user_id)
            driver.find_element(By.NAME, 'password').send_keys(password)
        except Exception as e:
            log(f"[ERROR] 로그인 입력창 로딩 실패: {e}")
            # Short pause so a persistently broken page is not hammered in a
            # tight loop.
            time.sleep(2)
            continue

        try:
            # Click on empty space before submitting (part of the automation workaround).
            ActionChains(driver).move_to_element(driver.find_element(By.CSS_SELECTOR, "body")).click().perform()
            log("🖱 빈 공간 클릭 완료 (자동화 우회)")
        except Exception as e:
            log(f"[!] 빈 공간 클릭 실패: {e}")
        time.sleep(2)
        log("🔑 로그인 시도 중...")
        driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]').click()
        time.sleep(2)

        try:
            WebDriverWait(driver, 3).until(EC.alert_is_present())
        except TimeoutException:
            # No alert appeared: decide by whether we are still on the login page.
            if "login" in driver.current_url:
                log("⚠️ 로그인 실패: 경고창은 없지만 로그인 상태 아님 → 재시도")
                continue

            log("✅ 로그인 성공 확인됨")
            break

        # An alert means the login was rejected; dismiss it and try again.
        alert = driver.switch_to.alert
        log(f"⚠️ 경고창 감지: {alert.text}")
        alert.accept()
        log("🔄 로그인 재시도 중...")
        driver.refresh()


def write_csv(row):
    """Append *row* to everytime_output.csv, writing the header first if the file is new."""
    output_path = 'everytime_output.csv'
    header = ['작성일시', '제목', '내용', '카테고리', '마감일', '전화번호', 'URL', '동아리명', '이미지들', '동아리분야']
    needs_header = not os.path.exists(output_path)
    with open(output_path, 'a', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        pending = [header, row] if needs_header else [row]
        writer.writerows(pending)

def download_image(img_url):
    """Download an image, shrink it to fit 720x1080 and save it as a JPEG.

    Args:
        img_url: Absolute URL of the image.

    Returns:
        str: File name (inside ``images/``) of the saved JPEG, or ``""`` when
        the URL is filtered out as site chrome or anything fails.
    """
    try:
        if any(kw in img_url for kw in ["nav.logo", "favicon", "cf-fpi.everytime.kr/0.png"]):
            log(f"[SKIP] 불필요 이미지 필터링됨: {img_url}")
            return ""

        response = requests.get(img_url, timeout=10)
        # Fail early on 4xx/5xx instead of trying to decode an error page.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))

        # Convert every mode other than plain RGB/greyscale (RGBA, P, LA,
        # CMYK, ...) so that saving as JPEG cannot fail on the mode.
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")

        # Shrink to fit within 720x1080, keeping the aspect ratio; never enlarge.
        max_width, max_height = 720, 1080
        orig_w, orig_h = img.size
        ratio = min(max_width / orig_w, max_height / orig_h, 1.0)
        # Clamp to 1px so extreme aspect ratios cannot round a side down to 0.
        new_size = (max(1, int(orig_w * ratio)), max(1, int(orig_h * ratio)))
        if new_size != img.size:
            img = img.resize(new_size, Image.LANCZOS)

        filename = f"img_{uuid.uuid4().hex}.jpg"
        path = os.path.join("images", filename)

        img.save(path, format="JPEG", quality=95, optimize=True, progressive=True)

        log(f"🖼 이미지 저장 완료: {filename} ({new_size[0]}x{new_size[1]})")
        return filename

    except Exception as e:
        # Best-effort: a broken image must not stop the crawl.
        log(f"[!] 이미지 저장 실패: {img_url} → {e}")
        return ""

def parse_post_time(text):
    """Convert a post timestamp label from the board into a datetime.

    Recognised forms: ``방금`` (just now), ``N분 전`` (N minutes ago),
    ``MM/DD HH:MM`` and ``MM/DD``. Labels carry no year, so the current year
    is assumed; a November/December date that would lie in the future is
    taken to belong to the previous year.

    Args:
        text: Label text as shown next to a post.

    Returns:
        datetime | None: The parsed time, or ``None`` when the label is not
        recognised or does not describe a real calendar date.
    """
    now = datetime.now()
    if '방금' in text:
        return now
    if '분 전' in text:
        minutes = re.search(r'(\d+)', text)
        return now - timedelta(minutes=int(minutes.group(1))) if minutes else None

    # "MM/DD HH:MM" (anything after the time is ignored) or exactly "MM/DD".
    stamp = re.match(r'(\d{2})/(\d{2})(?: (\d{2}):(\d{2})|$)', text)
    if not stamp:
        return None
    month, day = int(stamp.group(1)), int(stamp.group(2))
    hour = int(stamp.group(3) or 0)
    minute = int(stamp.group(4) or 0)
    try:
        dt = datetime(now.year, month, day, hour, minute)
    except ValueError:
        # e.g. "13/45" or "02/30": report as unparseable instead of raising,
        # so one odd label cannot abort the caller's loop.
        return None
    if dt > now and month in (11, 12):
        dt = dt.replace(year=now.year - 1)
    return dt

def extract_deadline(content, post_time):
    """Find a recruiting deadline mentioned in a post body.

    Phone numbers are removed first so their digits are not read as dates.
    A date range such as ``3.1 ~ 3.15`` wins (its end is the deadline);
    otherwise the first valid single date is used, trying the formats
    ``YYYY년 M월 D일``, ``M월 D일``, ``M/D`` and ``MM.DD`` in that order.

    Dates written without a year are placed in the posting year, or in the
    following year when that day would fall before the posting day. Dates
    written with an explicit year are returned exactly as written.

    Args:
        content: Post body text.
        post_time: datetime at which the post was written.

    Returns:
        str | None: Deadline as ``YYYY-MM-DD``, or ``None`` when no valid
        date is found.
    """
    content = re.sub(r'01[0-9][-\.\s]?[0-9]{3,4}[-\.\s]?[0-9]{4}', '', content)
    post_day = post_time.date()

    def resolve(month, day, year=None):
        """Build the deadline datetime, or return None for an impossible date."""
        try:
            if year is not None:
                return datetime(year, month, day)
            candidate = datetime(post_time.year, month, day)
            # Compare calendar days only: a deadline on the posting day itself
            # is still in the future and must not be pushed to next year.
            if candidate.date() < post_day:
                candidate = datetime(post_time.year + 1, month, day)
            return candidate
        except ValueError:
            return None

    range_match = re.search(
        r'(\d{1,2})[./](\d{1,2})\s*[~\-]\s*(\d{1,2})[./](\d{1,2})', content
    )
    if range_match:
        _, _, end_month, end_day = map(int, range_match.groups())
        deadline = resolve(end_month, end_day)
        if deadline is not None:
            return deadline.strftime("%Y-%m-%d")

    patterns = [
        r'(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일',
        r'(\d{1,2})월\s*(\d{1,2})일',
        r'(\d{1,2})/(\d{1,2})',
        r'(?<!\d)(0[1-9]|1[0-2])\.(0[1-9]|[12][0-9]|3[01])(?!\d)'
    ]
    for pattern in patterns:
        for parts in re.findall(pattern, content):
            numbers = [int(part) for part in parts]
            if len(numbers) == 3:
                year, month, day = numbers
                deadline = resolve(month, day, year)
            else:
                month, day = numbers
                deadline = resolve(month, day)
            if deadline is not None:
                return deadline.strftime("%Y-%m-%d")
    return None

def analyze_post(title, content, post_time, official_clubs):
    """Classify a post and pull out its contact, club and deadline details.

    Args:
        title: Post title.
        content: Post body.
        post_time: datetime at which the post was written.
        official_clubs: Mapping of club key to an info dict holding
            ``"category"`` and, for simplified keys, ``"original"``.

    Returns:
        tuple: ``(category, deadline, phone, club_name, club_category)``.
        ``category`` is ``"모집"`` (recruiting) or ``"홍보"`` (promotion).
        ``deadline`` is ``""`` for promotion posts and otherwise whatever
        ``extract_deadline`` returns. ``phone``, ``club_name`` and
        ``club_category`` are ``""`` when nothing is found.
    """
    # A fair ("박람회") post is promotion even if it also mentions recruiting.
    if "박람회" in title or "박람회" in content:
        category = "홍보"
    elif "모집" in title or "모집" in content:
        category = "모집"
    else:
        category = "홍보"

    phone_match = re.search(r'01[0-9][-]\d{3,4}-\d{4}', content)
    phone = phone_match.group() if phone_match else ""

    combined_text = title + " " + content
    # Same normalisation as the simplified club keys: no punctuation, upper-case.
    simplified_text = re.sub(r'[^\w]', '', combined_text).upper()

    club_name = ""
    club_category = ""
    for key, info in official_clubs.items():
        # An empty key is a substring of every text and would match every post.
        if not key:
            continue
        if key in combined_text or key in simplified_text:
            club_name = info.get("original", key)
            club_category = info["category"]
            break

    # Only recruiting posts carry a deadline; skip the extraction otherwise.
    deadline = extract_deadline(content, post_time) if category == "모집" else ""

    return category, deadline, phone, club_name, club_category

def extract_images_excluding_profile(soup):
    """Collect content image URLs from every <article>, leaving out profile pictures.

    Within each article the first image that has a source is always skipped,
    and so is any later image that carries the ``picture`` class or points at
    the default profile image. Remaining sources are made absolute and kept
    only if they end in a known image extension.

    Args:
        soup: Parsed page (BeautifulSoup).

    Returns:
        list[str]: Absolute image URLs in document order.
    """
    default_profile = 'https://cf-fpi.everytime.kr/0.png'
    collected = []
    for article in soup.select('article'):
        first_pending = True
        for tag in article.select('img'):
            src = tag.get('src') or tag.get('data-src') or tag.get('lazy-src')
            if not src:
                continue

            looks_like_profile = (
                first_pending
                or 'picture' in tag.get('class', [])
                or src == default_profile
            )
            if looks_like_profile:
                log(f"[SKIP] 프로필 이미지 제외됨: {src}")
                first_pending = False
                continue

            # Turn protocol-relative and site-relative sources into absolute URLs.
            if src.startswith('//'):
                src = 'https:' + src
            elif src.startswith('/'):
                src = 'https://everytime.kr' + src

            has_image_ext = re.search(r'\.(png|jpg|jpeg|gif|webp)(\?|$)', src, re.IGNORECASE)
            if has_image_ext:
                collected.append(src)
    return collected

def _find_similar_title(title, semester, crawled_links, threshold=0.85):
    """Return the similarity to the first stored same-semester title above *threshold*, else None."""
    for key in crawled_links:
        if not key.startswith(semester):
            continue
        existing_title = key.split('|', 1)[-1]
        try:
            tfidf = TfidfVectorizer().fit_transform([title, existing_title])
        except ValueError:
            # Raised when neither title yields a token (empty vocabulary);
            # such a pair cannot be compared, so it is not a duplicate.
            continue
        sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        if sim > threshold:
            return sim
    return None


def _remember_crawled(crawled_links, *keys):
    """Add *keys* to the in-memory set and append the new ones to CRAWLED_LINKS_FILE."""
    fresh = [key for key in keys if key not in crawled_links]
    if not fresh:
        return
    crawled_links.update(fresh)
    with open(CRAWLED_LINKS_FILE, 'a', encoding='utf-8') as f:
        f.writelines(f"{key}\n" for key in fresh)


def _crawl_post(driver, full_url, post_time, official_clubs, crawled_links):
    """Open one post, skip it if it is a duplicate, otherwise download images and store it."""
    driver.get(full_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'article')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    post = soup.find("article")
    title = post.h2.text.strip()
    content = post.find('p').get_text(separator='\n').strip()
    category, deadline, phone, club_name, club_category = analyze_post(title, content, post_time, official_clubs)
    if '피험자' in title or '피험자' in content:
        log(f"[SKIP] 피험자 모집 감지: {title[:20]}... ({club_name})")
        return

    semester = f"{post_time.year}-S{1 if post_time.month <= 6 else 2}"
    # Collapse whitespace so the key always fits on one line of the links file.
    title_key = f"{semester}|{' '.join(title.split())}"
    if title_key in crawled_links:
        log(f"[SKIP] 제목 완전 일치 중복 감지: {title[:20]}... ({club_name})")
        return

    sim = _find_similar_title(title, semester, crawled_links)
    if sim is not None:
        log(f"[SKIP] 유사 제목 중복 감지 (cos {sim:.2f}): {title[:20]}... ({club_name})")
        return

    img_urls = extract_images_excluding_profile(soup)
    saved_imgs = [download_image(img_url) for img_url in img_urls]

    write_to_db_by_category({
        "title": title,
        "content": content,
        "category": category,
        "deadline": deadline if deadline else "상시",
        "phone": phone,
        "url": full_url,
        "club_name": club_name,
        "images": saved_imgs
    })

    _remember_crawled(crawled_links, full_url, title_key)
    log(f"📄 {post_time.strftime('%m/%d')} - {title} [{category}] ({club_name}) 저장")


def run(driver, start_datetime, end_datetime):
    """Crawl the board page by page and store every new post in the time window.

    Posts whose listing time satisfies ``start_datetime <= t < end_datetime``
    are opened, analysed, de-duplicated (by URL, by exact title within the
    semester, and by TF-IDF title similarity) and written to the database.
    Keys of stored posts are kept in CRAWLED_LINKS_FILE so later runs skip
    them. Crawling stops at the first page that has no post inside the window.

    Args:
        driver: Logged-in Selenium WebDriver.
        start_datetime: Inclusive start of the window.
        end_datetime: Exclusive end of the window.
    """
    official_clubs = load_official_club_info()
    board_id = "418897"
    base_url = f"https://everytime.kr/{board_id}/p/"
    crawled_links = set()
    if os.path.exists(CRAWLED_LINKS_FILE):
        with open(CRAWLED_LINKS_FILE, 'r', encoding='utf-8') as f:
            crawled_links = set(f.read().splitlines())

    for page in range(1, 1000):
        try:
            url = base_url + str(page)
            log(f"[DEBUG] ▶ 페이지 이동: {url}")
            driver.get(url)
            time.sleep(2)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'articles')))
            listing = BeautifulSoup(driver.page_source, 'html.parser')
            articles = listing.select("article")
        except Exception as e:
            log(f"[!] 페이지 {page} 처리 실패: {e}")
            continue

        in_range_count = 0
        for article in articles:
            full_url = None
            # Isolate failures per post so one broken post does not discard
            # the remaining posts on the page.
            try:
                link_tag = article.find("a", href=True)
                if not link_tag:
                    continue
                href = link_tag['href']
                if not re.search(r"/v/\d+", href):
                    continue
                time_tag = link_tag.find("time")
                if not time_tag:
                    continue
                post_time = parse_post_time(time_tag.text.strip())
                if not post_time or not (start_datetime <= post_time < end_datetime):
                    continue
                # Count in-window posts, not saved ones: a page full of
                # duplicates must not end the crawl early.
                in_range_count += 1
                full_url = f"https://everytime.kr{href}"
                if full_url in crawled_links:
                    continue
                _crawl_post(driver, full_url, post_time, official_clubs, crawled_links)
            except Exception as e:
                log(f"[!] 게시글 처리 실패: {full_url or '(목록 항목)'} → {e}")

        if in_range_count == 0:
            log("🛑 현재 페이지 모든 게시글이 범위 밖 → 크롤링 종료")
            break

def main():
    """Run the crawler forever: one crawl pass, then a one-hour sleep.

    Each pass covers the window from the saved start time up to "now". After
    a successful pass the end of that window is saved as the next start time.
    The browser is always shut down, including when login or crawling raises,
    so failed passes do not leave Chrome processes behind.
    """
    while True:
        start_datetime = load_start_time()
        end_datetime = datetime.now()
        options = webdriver.ChromeOptions()
        options.add_argument('window-size=1200x800')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        try:
            login_everytime(driver)
            log(f"🌐 크롤링 시작: {end_datetime.strftime(DATETIME_FORMAT)}")
            log(f"📆 크롤링 범위: {start_datetime.strftime(DATETIME_FORMAT)} ~ {end_datetime.strftime(DATETIME_FORMAT)}")
            run(driver, start_datetime, end_datetime)
            # Only advance the window once the pass finished without raising.
            save_start_time(end_datetime)
        finally:
            driver.quit()
        log("⏳ 1시간 대기 중...\n")
        time.sleep(3600)


if __name__ == '__main__':
    main()

🧹 images 폴더 초기화 완료
🧹 start_time.txt 초기화 완료
🆕 start_time.txt 없음 → 사용자 입력 필요
