In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import re
import concurrent.futures
from tqdm import tqdm

# Pool of browser User-Agent strings; get_headers() picks one at random
# for every outgoing request.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Mobile/15E148 Safari/604.1'
]

# Base name of the output CSV; save_to_csv() writes "<BASE_FILENAME>.csv" and
# main() looks for files with this prefix when resuming.
BASE_FILENAME = 'alt'
# List-page URL; extract_post_info() appends the page/list_num/sort_type
# query parameters, and it is also sent as the Referer header.
BASE_URL = 'https://gall.dcinside.com/mgallery/board/lists/?id=coin'
# NOTE(review): not referenced anywhere in the code visible here.
VIEW_URL = 'https://gall.dcinside.com/mgallery/board/view/'
# main() stops starting new page chunks once this many posts are collected.
TOTAL_POSTS = 100000
# main() rewrites the CSV each time this many new posts have been added.
SAVE_INTERVAL = 5000
# Post ids (as strings) already stored or already selected for crawling;
# extract_post_info() skips any row whose id is in this set.
post_id_cache = set()
# Running count of collected posts (seeded from an existing CSV on resume).
collected_posts = 0
# Inclusive range of list pages that main() walks through.
start_page = 15
end_page = 570
# Thread-pool size used by main() for fetching post detail pages.
MAX_WORKERS = 10
# Number of attempts each fetch function makes before giving up.
MAX_RETRIES = 3
# Bounds (seconds) of the random pause produced by random_delay().
MIN_DELAY = 0.1
MAX_DELAY = 0.5
# One requests.Session shared by all fetches.
session = requests.Session()

def random_delay():
    """Pause for a random time drawn uniformly from [MIN_DELAY, MAX_DELAY] seconds."""
    pause_seconds = random.uniform(MIN_DELAY, MAX_DELAY)
    time.sleep(pause_seconds)

def get_headers():
    """Build the HTTP headers for one request.

    Returns:
        A new dict of browser-like headers. ``User-Agent`` is chosen at
        random from ``user_agents`` on every call; ``Referer`` is ``BASE_URL``.
    """
    headers = {}
    headers['User-Agent'] = random.choice(user_agents)
    headers['Referer'] = BASE_URL
    # The remaining headers are fixed for every request.
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    headers['Accept-Language'] = 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7'
    headers['Connection'] = 'keep-alive'
    headers['Cache-Control'] = 'max-age=0'
    return headers

def save_to_csv(dataframe, count):
    """Write *dataframe* to ``<BASE_FILENAME>.csv`` and print a confirmation.

    Args:
        dataframe: The pandas DataFrame to write (without its index).
        count: Accepted for the caller's convenience; not used here.
    """
    filename = f"{BASE_FILENAME}.csv"
    # The same file is overwritten on every call.
    dataframe.to_csv(
        filename,
        encoding='utf-8-sig',
        index=False,
    )
    print(f"{filename} 파일에 {len(dataframe)}개 게시물 저장 완료")

def crawl_post_detail(post_info):
    """Fetch one post page and pull out its body text and counters.

    Args:
        post_info: A ``(post_url, post_id)`` pair.

    Returns:
        A dict with keys ``post_id``, ``내용``, ``조회수`` and ``댓글갯수``
        (the two counters as digit-only strings) on the first successful
        attempt, or ``None`` when every one of the ``MAX_RETRIES`` attempts
        ends in a non-200 response or an exception.
    """
    post_url, post_id = post_info

    def _digits_of(node, label):
        # A missing node counts as "0"; otherwise drop the label and then
        # everything that is not a digit.
        raw = node.text.replace(label, '').strip() if node else "0"
        return re.sub(r'[^0-9]', '', raw)

    for try_no in range(MAX_RETRIES):
        try:
            # Short random back-off before every retry (not the first try).
            if try_no > 0:
                time.sleep(random.uniform(0.5, 1.0))
            response = session.get(post_url, headers=get_headers(), timeout=5)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                body_node = soup.select_one('div.write_div')
                body_text = body_node.text.strip() if body_node else "내용 없음"
                views = _digits_of(soup.select_one('span.gall_count'), '조회 ')
                comments = _digits_of(soup.select_one('span.gall_reply_num'), '댓글 ')
                return {
                    'post_id': post_id,
                    '내용': body_text,
                    '조회수': views,
                    '댓글갯수': comments
                }
        except Exception:
            # Any failure simply uses up this attempt.
            continue
    return None

def calculate_sampling_interval():
    """Return the sampling interval, which is the fixed value 7."""
    interval = 7
    return interval

def extract_post_info(page):
    """Fetch one list page and collect basic info for each new post on it.

    Rows marked with the ``notice`` class, rows whose id is already in
    ``post_id_cache``, and rows without a detail link are skipped.

    Args:
        page: Page number placed in the list URL's ``page`` parameter.

    Returns:
        A list of dicts with keys ``post_id``, ``title``, ``date`` and
        ``url``. An empty list is returned when every one of the
        ``MAX_RETRIES`` attempts fails.
    """
    for attempt_no in range(MAX_RETRIES):
        try:
            request_headers = get_headers()
            list_url = f"{BASE_URL}&page={page}&list_num=100&sort_type=N"
            response = session.get(list_url, headers=request_headers, timeout=5)
            if response.status_code != 200:
                continue
            rows = BeautifulSoup(response.text, 'html.parser').select('tr.ub-content')
            found = []
            for row in rows:
                if 'notice' in row.get('class', []):
                    continue
                id_cell = row.select_one('td.gall_num')
                post_id = id_cell.text.strip() if id_cell else "0"
                if post_id in post_id_cache:
                    continue
                link = row.select_one('td.gall_tit a')
                title = link.text.strip() if link else "제목 없음"
                date_cell = row.select_one('td.gall_date')
                date = date_cell.text.strip() if date_cell else ""
                # Rows without a usable href are dropped.
                if not (link and link.get('href')):
                    continue
                detail_url = link['href']
                # Relative links are resolved against the site root.
                if not detail_url.startswith('http'):
                    detail_url = "https://gall.dcinside.com" + detail_url
                if not detail_url:
                    continue
                found.append({
                    'post_id': post_id,
                    'title': title,
                    'date': date,
                    'url': detail_url
                })
            return found
        except Exception:
            # Wait before the next attempt; after the last one fall through
            # to the empty result.
            if attempt_no != MAX_RETRIES - 1:
                time.sleep(random.uniform(1.0, 2.0))
    return []

def main():
    """Crawl list pages in chunks, sample posts, fetch details and save to CSV.

    Workflow:
      1. If a ``<BASE_FILENAME>*.csv`` file exists in the current directory,
         load the last one (by sorted name) and resume from it: its rows
         seed the result list and its ``번호`` column seeds ``post_id_cache``.
      2. Walk ``start_page``..``end_page`` five pages at a time, fetching the
         list pages in parallel.
      3. Keep every ``sampling_interval``-th listed post, fetch the selected
         posts' detail pages in parallel, and append one row per success.
      4. Rewrite the CSV each time ``SAVE_INTERVAL`` new rows accumulate,
         and once more on exit if unsaved rows remain.

    Updates the module-level ``collected_posts`` and ``post_id_cache``.
    A ``KeyboardInterrupt`` stops the crawl; the final save and the summary
    still run.
    """
    global collected_posts, post_id_cache
    pages_per_chunk = 5
    all_data = []
    existing_files = [f for f in os.listdir('.') if f.startswith(BASE_FILENAME) and f.endswith('.csv')]
    if existing_files:
        latest_file = sorted(existing_files)[-1]
        temp_df = pd.read_csv(latest_file, encoding='utf-8-sig')
        collected_posts = len(temp_df)
        all_data = temp_df.to_dict('records')
        # Ids are compared as strings against the scraped list-page text.
        post_id_cache.update(str(post_id) for post_id in temp_df['번호'])
    sampling_interval = calculate_sampling_interval()
    print(f"샘플링 간격: {sampling_interval}개마다 1개 수집")
    # Rows accumulated since the last checkpoint boundary.
    current_batch_size = collected_posts % SAVE_INTERVAL
    page_ranges = list(range(start_page, end_page + 1))
    start_time = time.time()
    posts_at_start = collected_posts
    try:
        with tqdm(total=TOTAL_POSTS, initial=collected_posts, desc="게시물 크롤링 진행") as pbar:
            for page_chunk_idx in range(0, len(page_ranges), pages_per_chunk):
                if collected_posts >= TOTAL_POSTS:
                    break
                current_pages = page_ranges[page_chunk_idx:page_chunk_idx + pages_per_chunk]

                # Fetch this chunk's list pages in parallel.
                all_posts_info = []
                with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
                    results = list(executor.map(extract_post_info, current_pages))
                for posts_in_page in results:
                    all_posts_info.extend(posts_in_page)

                # Keep every sampling_interval-th post. Selected ids are
                # cached immediately so later pages skip them.
                selected_posts = []
                for idx, post_info in enumerate(all_posts_info):
                    if (collected_posts + idx) % sampling_interval == 0:
                        selected_posts.append(post_info)
                        post_id_cache.add(post_info['post_id'])

                # Fetch the selected posts' detail pages in parallel;
                # failed fetches return None and are dropped.
                post_details_to_crawl = [(post['url'], post['post_id']) for post in selected_posts]
                crawled_details = []
                with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                    future_to_post = {executor.submit(crawl_post_detail, post_detail): post_detail for post_detail in post_details_to_crawl}
                    for future in concurrent.futures.as_completed(future_to_post):
                        result = future.result()
                        if result:
                            crawled_details.append(result)

                # Index the selected posts by id once instead of scanning
                # the list per detail. setdefault keeps the first post for
                # a repeated id, matching a first-match linear search.
                selected_by_id = {}
                for post in selected_posts:
                    selected_by_id.setdefault(post['post_id'], post)

                for detail in crawled_details:
                    matching_post = selected_by_id.get(detail['post_id'])
                    if matching_post:
                        new_row = {
                            '번호': matching_post['post_id'],
                            '날짜': matching_post['date'],
                            '제목': matching_post['title'],
                            '내용': detail['내용'],
                            '조회수': detail['조회수'],
                            '댓글갯수': detail['댓글갯수']
                        }
                        all_data.append(new_row)
                        collected_posts += 1
                        current_batch_size += 1
                        pbar.update(1)
                        if current_batch_size >= SAVE_INTERVAL:
                            df = pd.DataFrame(all_data)
                            save_to_csv(df, collected_posts)
                            current_batch_size = 0

                # Report throughput for this run (resumed rows excluded).
                current_time = time.time()
                elapsed_time = current_time - start_time
                if elapsed_time > 0:
                    posts_collected = collected_posts - posts_at_start
                    posts_per_minute = (posts_collected / elapsed_time) * 60
                    print(f"현재 크롤링 속도: {posts_per_minute:.2f} 게시물/분 (총 {posts_collected}개 수집, {elapsed_time:.1f}초 소요)")
                # Fixed pause between page chunks.
                time.sleep(0.5)
    except KeyboardInterrupt:
        print("사용자에 의해 크롤링이 중단되었습니다.")
    finally:
        # Flush rows collected since the last checkpoint.
        if all_data and current_batch_size > 0:
            df = pd.DataFrame(all_data)
            save_to_csv(df, collected_posts)
        # Fixed: the attribute name here previously contained a stray
        # non-ASCII character, raising AttributeError on every exit.
        total_time = time.time() - start_time
        posts_collected = collected_posts - posts_at_start
        if total_time > 0:
            posts_per_minute = (posts_collected / total_time) * 60
            print(f"평균 크롤링 속도: {posts_per_minute:.2f} 게시물/분")
            print(f"총 소요 시간: {total_time/60:.2f}분 ({total_time:.1f}초)")
        print(f"크롤링 완료: 총 {collected_posts}개 게시물 수집")

# Start the crawl only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
