In [1]:
import os
import re
import json
import time
import glob
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# [0] 드라이버 세팅
def setup_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
    return webdriver.Chrome(options=opts)

# [1] 탭 클릭
def click_tab(driver, tab_text):
    try:
        a = driver.find_element(By.XPATH, f"//ul[contains(@class,'prd_detail_tab')]//a[text()='{tab_text}']")
        a.click()
        time.sleep(1)
        return True
    except:
        for a in driver.find_elements(By.CSS_SELECTOR, ".prd_detail_tab a"):
            if tab_text in a.text:
                a.click()
                time.sleep(1)
                return True
    return False

# [2] 미디어 파싱
def parse_product_media(html):
    soup = BeautifulSoup(html, "html.parser")
    media_urls = []
    for tag in soup.select("div.detail_area *"):
        src = tag.get("src") or tag.get("srcset")
        if src and src.startswith("https://"):
            media_urls.append(src)
    return media_urls

# [3] 제품 상세 파싱
def parse_product_detail(html):
    soup = BeautifulSoup(html, "html.parser")
    name = soup.select_one("p.prd_name")
    brand = soup.select_one("p.prd_brand a")
    old_price = soup.select_one("span.price-1 strike")
    sale_price = soup.select_one("span.price-2 strong")
    img = soup.select_one("#mainImg")
    ing, origin = "", ""

    artc = soup.find("div", id="artcInfo")
    if artc:
        for dl in artc.select("dl.detail_info_list"):
            dt, dd = dl.find("dt"), dl.find("dd")
            if dt and dd:
                title = dt.get_text(strip=True)
                value = dd.get_text(" ", strip=True)
                if "모든 성분" in title:
                    ing = value
                elif "제조국" in title:
                    origin = value

    options = []
    for li in soup.select("ul.sel_option_list > li[optgoodsinfo]"):
        txt = li.select_one("span.txt").get_text(strip=True)
        code, no = li["optgoodsinfo"].split(":")
        lgc = li.find("input", {"name": "gdasLgcGoodsNo"})["value"]
        options.append({"옵션명": txt, "상품코드": code, "아이템번호": no, "lgcGoodsNo": lgc})

    media_list = parse_product_media(html)

    return {
        "상품명": name.get_text(strip=True) if name else "",
        "브랜드": brand.get_text(strip=True) if brand else "",
        "정가": old_price.get_text(strip=True) if old_price else "",
        "할인가": sale_price.get_text(strip=True) if sale_price else "",
        "이미지": img["src"] if img else "",
        "성분": ing, "제조국": origin,
        "옵션개수": len(options),
        "옵션리스트": options,
        "상세미디어목록": media_list
    }

# [4] 리뷰 파싱
def parse_reviews(html, max_count=10):
    soup = BeautifulSoup(html, "html.parser")
    reviews = []
    poll_data = {}

    ul = soup.find("ul", id="gdasList")
    if not ul:
        return reviews

    for li in ul.find_all("li", recursive=False)[:max_count]:
        reviewer = li.select_one("p.info_user a.id")
        rating = li.select_one("span.point")
        score_span = li.select_one("div.score_area span.point")
        date = li.select_one("span.date")
        opt = li.select_one("p.item_option")
        txt = li.select_one("div.txt_inner")
        rec = li.select_one(".recom_area span.num")

        review_data = {
            "리뷰어": reviewer.get_text(strip=True) if reviewer else "",
            "평점": re.search(r"([\d\.]+)점", rating.get_text(strip=True)).group(1) if rating else "",
            "최대평점": re.search(r"(\d+)점만점에", score_span.get_text(strip=True)).group(1) if score_span else "",
            "날짜": date.get_text(strip=True) if date else "",
            "옵션": opt.get_text(strip=True).replace("[옵션]", "") if opt else "",
            "본문": txt.get_text(" ", strip=True) if txt else "",
            "추천수": rec.get_text(strip=True) if rec else "0",
            "태그": [s.get_text(strip=True) for s in li.select(".review_tag span")],
            "사용자 피부 정보": [s.get_text(strip=True) for s in li.select("p.tag span")]
        }

        # [설문 블록 정보 추출] (예: 발색력 아주 만족해요 등)
        poll_dl_tags = li.select("div.poll_sample dl.poll_type1")
        for dl in poll_dl_tags:
            dt_tag = dl.select_one("dt span")
            dd_tag = dl.select_one("dd span")
            if dt_tag and dd_tag:
                title = dt_tag.get_text(strip=True)
                value = dd_tag.get_text(strip=True)
                review_data[title] = value

        reviews.append(review_data)
    return reviews

# [5] 상품 + 리뷰 + 설문 크롤링
def crawl_products_and_reviews(urls, max_review_count=1000, headless=True):
    driver = setup_driver(headless)
    product_data, review_data = [], []

    for idx, url in enumerate(urls, 1):
        print(f"[{idx}/{len(urls)}] 크롤링 중: {url}")
        driver.get(url)
        time.sleep(1)

        prod_info = {}
        if click_tab(driver, "구매정보"):
            prod_info = parse_product_detail(driver.page_source)

        if click_tab(driver, "리뷰"):
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # 피부타입 분포 (발색력, 지속력 등)
            for dl in soup.select("dl.poll_type2.type3"):
                dt = dl.select_one("dt span")
                dd = dl.select_one("dd")
                if dt and dd:
                    title = dt.get_text(strip=True)
                    dist = {}
                    for li in dd.select("li"):
                        label = li.select_one("span")
                        percent = li.select_one("em")
                        if label and percent:
                            dist[label.get_text(strip=True)] = percent.get_text(strip=True)
                    if dist:
                        prod_info[title] = json.dumps({title: dist}, ensure_ascii=False)

            total_reviews, current_page = 0, 1
            while total_reviews < max_review_count:
                time.sleep(1)
                reviews = parse_reviews(driver.page_source, max_count=10)
                if not reviews:
                    break
                for rv in reviews:
                    review_data.append({"상품명": prod_info.get("상품명", ""), "리뷰순번": total_reviews + 1, **rv})
                    total_reviews += 1
                    if total_reviews >= max_review_count:
                        break
                try:
                    next_button = driver.find_element(By.CSS_SELECTOR, f'div.pageing a[data-page-no="{current_page + 1}"]')
                    driver.execute_script("arguments[0].click();", next_button)
                    current_page += 1
                except:
                    break

        product_data.append(prod_info)

    driver.quit()

    product_df = pd.DataFrame(product_data)
    review_df = pd.DataFrame(review_data)

    product_df["상세미디어목록"] = product_df["상세미디어목록"].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else "[]")
    product_df["옵션리스트"] = product_df["옵션리스트"].apply(lambda x: json.dumps(x, ensure_ascii=False) if x else "[]")
    review_df["태그"] = review_df["태그"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
    review_df["사용자 피부 정보"] = review_df["사용자 피부 정보"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)

    
    return product_df, review_df

# [6] 실행
if __name__ == "__main__":
    folder_path = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/선크림 카테고리"
    save_dir = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/선크림"
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*_상품URL목록.csv")))

    for csv_path in csv_files:
        try:
            category_name = os.path.basename(csv_path).split("_상품URL")[0]
            print(f"\n▶ 크롤링 시작: {category_name}")
            df_urls = pd.read_csv(csv_path)
            urls = df_urls["url"].dropna().unique().tolist()[:2]
            product_df, review_df = crawl_products_and_reviews(urls, max_review_count=5, headless=True)

            product_df.to_csv(f"{save_dir}/{category_name}_상품정보.csv", index=False, encoding="utf-8-sig")
            review_df.to_csv(f"{save_dir}/{category_name}_리뷰정보.csv", index=False, encoding="utf-8-sig")
            print(f"완료: {category_name} | 상품 {len(product_df)}개, 리뷰 {len(review_df)}개")

        except Exception as e:
            print(f"에러 발생 ({csv_path}): {e}")


▶ 크롤링 시작: 선크림_선스틱
[1/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000150624
[2/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000202346
완료: 선크림_선스틱 | 상품 2개, 리뷰 10개

▶ 크롤링 시작: 선크림_선스프레이-선패치
[1/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000175125
[2/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000224981
완료: 선크림_선스프레이-선패치 | 상품 2개, 리뷰 10개

▶ 크롤링 시작: 선크림_선쿠션
[1/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000167182
[2/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000188223
완료: 선크림_선쿠션 | 상품 2개, 리뷰 5개

▶ 크롤링 시작: 선크림_태닝-애프터선
[1/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000198631
[2/2] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000168093
완료: 선크림_태닝-애프터선 | 상품 2개, 리뷰 10개


In [11]:
import os
import re
import json
import time
import glob
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

folder_path = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리"
save_dir = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업"
#os.makedirs(save_dir, exist_ok=True)


# [0] 드라이버 세팅
def setup_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
    return webdriver.Chrome(options=opts)

# [1] 탭 클릭
def click_tab(driver, tab_text):
    try:
        a = driver.find_element(By.XPATH, f"//ul[contains(@class,'prd_detail_tab')]//a[text()='{tab_text}']")
        a.click()
        time.sleep(1)
        return True
    except:
        for a in driver.find_elements(By.CSS_SELECTOR, ".prd_detail_tab a"):
            if tab_text in a.text:
                a.click()
                time.sleep(1)
                return True
    return False

# [2] 미디어 파싱
def parse_product_media(html):
    soup = BeautifulSoup(html, "html.parser")
    media_urls = []
    for tag in soup.select("div.detail_area *"):
        src = tag.get("src") or tag.get("srcset")
        if src and src.startswith("https://"):
            media_urls.append(src)
    return media_urls

# [3] 제품 상세 파싱
def parse_product_detail(html):
    soup = BeautifulSoup(html, "html.parser")
    name = soup.select_one("p.prd_name")
    brand = soup.select_one("p.prd_brand a")
    old_price = soup.select_one("span.price-1 strike")
    sale_price = soup.select_one("span.price-2 strong")
    img = soup.select_one("#mainImg")
    ing, origin = "", ""

    artc = soup.find("div", id="artcInfo")
    if artc:
        for dl in artc.select("dl.detail_info_list"):
            dt, dd = dl.find("dt"), dl.find("dd")
            if dt and dd:
                title = dt.get_text(strip=True)
                value = dd.get_text(" ", strip=True)
                if "모든 성분" in title:
                    ing = value
                elif "제조국" in title:
                    origin = value

    options = []
    for li in soup.select("ul.sel_option_list > li[optgoodsinfo]"):
        txt = li.select_one("span.txt").get_text(strip=True)
        code, no = li["optgoodsinfo"].split(":")
        lgc = li.find("input", {"name": "gdasLgcGoodsNo"})["value"]
        options.append({"옵션명": txt, "상품코드": code, "아이템번호": no, "lgcGoodsNo": lgc})

    media_list = parse_product_media(html)

    return {
        "상품명": name.get_text(strip=True) if name else "",
        "브랜드": brand.get_text(strip=True) if brand else "",
        "정가": old_price.get_text(strip=True) if old_price else "",
        "할인가": sale_price.get_text(strip=True) if sale_price else "",
        "이미지": img["src"] if img else "",
        "성분": ing, "제조국": origin,
        "옵션개수": len(options),
        "옵션리스트": options,
        "상세미디어목록": media_list
    }

# [4] 리뷰 파싱
def parse_reviews(html, max_count=10):
    soup = BeautifulSoup(html, "html.parser")
    reviews = []
    poll_data = {}

    ul = soup.find("ul", id="gdasList")
    if not ul:
        return reviews

    for li in ul.find_all("li", recursive=False)[:max_count]:
        reviewer = li.select_one("p.info_user a.id")

        rating_tag = li.select_one("span.point")
        rating, max_score = "", "5.0"

        if rating_tag and 'style' in rating_tag.attrs:
            match = re.search(r'width:(\d+)%', rating_tag['style'])
            if match:
                percent = int(match.group(1))
                rating = str(round((percent / 100) * float(max_score), 1))

        #text = rating_tag.get_text(strip=True) if rating_tag else ""
        #match = re.search(r"(\d+)점만점에\s*(\d+)점", text)
        #max_score = match.group(1) if match else ""

        rating = match.group(2) if match else ""
        score_span = li.select_one("div.score_area span.point")
        date = li.select_one("span.date")
        opt = li.select_one("p.item_option")
        txt = li.select_one("div.txt_inner")
        rec = li.select_one(".recom_area span.num")


        review_data = {
            "리뷰어": reviewer.get_text(strip=True) if reviewer else "",
            "평점": rating,
            "최대평점": max_score,
            "날짜": date.get_text(strip=True) if date else "",
            "옵션": opt.get_text(strip=True).replace("[옵션]", "") if opt else "",
            "본문": txt.get_text(" ", strip=True) if txt else "",
            "추천수": rec.get_text(strip=True) if rec else "0",
            "태그": [s.get_text(strip=True) for s in li.select(".review_tag span")],
            "사용자 피부 정보": [s.get_text(strip=True) for s in li.select("p.tag span")]
        }

        # [설문 블록 정보 추출] (예: 발색력 아주 만족해요 등)
        poll_dl_tags = li.select("div.poll_sample dl.poll_type1")
        for dl in poll_dl_tags:
            dt_tag = dl.select_one("dt span")
            dd_tag = dl.select_one("dd span")
            if dt_tag and dd_tag:
                title = dt_tag.get_text(strip=True)
                value = dd_tag.get_text(strip=True)
                review_data[title] = value

        reviews.append(review_data)
    return reviews

# [5] 상품 + 리뷰 + 설문 크롤링
def crawl_products_and_reviews(urls, max_review_count=1000, headless=True):
    driver = setup_driver(headless)
    product_data, review_data = [], []

    for idx, url in enumerate(urls, 1):
        print(f"[{idx}/{len(urls)}] 크롤링 중: {url}")
        driver.get(url)
        time.sleep(1)

        prod_info = {}
        if click_tab(driver, "구매정보"):
            prod_info = parse_product_detail(driver.page_source)

        if click_tab(driver, "리뷰"):
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # 피부타입 분포 (발색력, 지속력 등)
            for dl in soup.select("dl.poll_type2.type3"):
                dt = dl.select_one("dt span")
                dd = dl.select_one("dd")
                if dt and dd:
                    title = dt.get_text(strip=True)
                    dist = {}
                    for li in dd.select("li"):
                        label = li.select_one("span")
                        percent = li.select_one("em")
                        if label and percent:
                            dist[label.get_text(strip=True)] = percent.get_text(strip=True)
                    if dist:
                        prod_info[title] = json.dumps({title: dist}, ensure_ascii=False)

            total_reviews, current_page = 0, 1
            while total_reviews < max_review_count:
                time.sleep(1)
                reviews = parse_reviews(driver.page_source, max_count=10)
                if not reviews:
                    break
                for rv in reviews:
                    review_data.append({"상품명": prod_info.get("상품명", ""), "리뷰순번": total_reviews + 1, **rv})
                    total_reviews += 1
                    if total_reviews >= max_review_count:
                        break
                try:
                    next_button = driver.find_element(By.CSS_SELECTOR, f'div.pageing a[data-page-no="{current_page + 1}"]')
                    driver.execute_script("arguments[0].click();", next_button)
                    current_page += 1
                except:
                    break

        product_data.append(prod_info)

    driver.quit()

    product_df = pd.DataFrame(product_data)
    review_df = pd.DataFrame(review_data)

    product_df["상세미디어목록"] = product_df["상세미디어목록"].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else "[]")
    product_df["옵션리스트"] = product_df["옵션리스트"].apply(lambda x: json.dumps(x, ensure_ascii=False) if x else "[]")
    review_df["태그"] = review_df["태그"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
    review_df["사용자 피부 정보"] = review_df["사용자 피부 정보"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)

    
    return product_df, review_df

# [6] 실행 (에러 발생 시 이어서 수행)
csv_files = sorted(glob.glob(os.path.join(folder_path, "*_상품URL목록.csv")))

for csv_path in csv_files:
    try:
        category_full = os.path.splitext(os.path.basename(csv_path))[0].replace("_상품URL목록", "")
        print(f"\n▶ 카테고리 시작: {category_full}")

        df_urls = pd.read_csv(csv_path)
        urls = df_urls["url"].dropna().unique().tolist()[:1]

        url_batch_size = 120
        total_batches = len(urls) // url_batch_size + (1 if len(urls) % url_batch_size > 0 else 0)

        for i in range(0, len(urls), url_batch_size):
            url_batch = urls[i:i + url_batch_size]
            url_batch_no = (i // url_batch_size) + 1

            # 이미 저장된 파일이 있다면 건너뜀
            product_path = f"{save_dir}/{category_full}_{url_batch_no}_상품정보.csv"
            review_path = f"{save_dir}/{category_full}_{url_batch_no}_리뷰정보.csv"
            if os.path.exists(product_path) and os.path.exists(review_path):
                print(f"  이미 완료된 Batch {url_batch_no}, 건너뜀.")
                continue

            print(f"  └─ URL Batch {url_batch_no} ({len(url_batch)}개) 크롤링 중...")
            product_df, review_df = crawl_products_and_reviews(url_batch, max_review_count=5, headless=True)

            product_df.to_csv(product_path, index=False, encoding="utf-8-sig")
            review_df.to_csv(review_path, index=False, encoding="utf-8-sig")

            print(f"  저장 완료: {category_full}_{url_batch_no}_상품정보 & 리뷰정보")

    except Exception as e:
        print(f"에러 발생 ({csv_path}): {e}")


▶ 카테고리 시작: 메이크업_립메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000209460
에러 발생 (/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리/메이크업_립메이크업_상품URL목록.csv): no such group

▶ 카테고리 시작: 메이크업_베이스메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000208681
에러 발생 (/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리/메이크업_베이스메이크업_상품URL목록.csv): no such group

▶ 카테고리 시작: 메이크업_아이메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000142375
에러 발생 (/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리/메이크업_아이메이크업_상품URL목록.csv): no such group


In [12]:
import os
import re
import json
import time
import glob
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

folder_path = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리"
save_dir = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업"
os.makedirs(save_dir, exist_ok=True)


# [0] 드라이버 세팅
def setup_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
    return webdriver.Chrome(options=opts)

# [1] 탭 클릭
def click_tab(driver, tab_text):
    try:
        a = driver.find_element(By.XPATH, f"//ul[contains(@class,'prd_detail_tab')]//a[text()='{tab_text}']")
        a.click()
        time.sleep(1)
        return True
    except:
        for a in driver.find_elements(By.CSS_SELECTOR, ".prd_detail_tab a"):
            if tab_text in a.text:
                a.click()
                time.sleep(1)
                return True
    return False

# [2] 미디어 파싱
def parse_product_media(html):
    soup = BeautifulSoup(html, "html.parser")
    media_urls = []
    for tag in soup.select("div.detail_area *"):
        src = tag.get("src") or tag.get("srcset")
        if src and src.startswith("https://"):
            media_urls.append(src)
    return media_urls

# [3] 제품 상세 파싱
def parse_product_detail(html):
    soup = BeautifulSoup(html, "html.parser")
    name = soup.select_one("p.prd_name")
    brand = soup.select_one("p.prd_brand a")
    old_price = soup.select_one("span.price-1 strike")
    sale_price = soup.select_one("span.price-2 strong")
    img = soup.select_one("#mainImg")
    ing, origin = "", ""

    artc = soup.find("div", id="artcInfo")
    if artc:
        for dl in artc.select("dl.detail_info_list"):
            dt, dd = dl.find("dt"), dl.find("dd")
            if dt and dd:
                title = dt.get_text(strip=True)
                value = dd.get_text(" ", strip=True)
                if "모든 성분" in title:
                    ing = value
                elif "제조국" in title:
                    origin = value

    options = []
    for li in soup.select("ul.sel_option_list > li[optgoodsinfo]"):
        txt = li.select_one("span.txt").get_text(strip=True)
        code, no = li["optgoodsinfo"].split(":")
        lgc = li.find("input", {"name": "gdasLgcGoodsNo"})["value"]
        options.append({"옵션명": txt, "상품코드": code, "아이템번호": no, "lgcGoodsNo": lgc})

    media_list = parse_product_media(html)

    return {
        "상품명": name.get_text(strip=True) if name else "",
        "브랜드": brand.get_text(strip=True) if brand else "",
        "정가": old_price.get_text(strip=True) if old_price else "",
        "할인가": sale_price.get_text(strip=True) if sale_price else "",
        "이미지": img["src"] if img else "",
        "성분": ing, "제조국": origin,
        "옵션개수": len(options),
        "옵션리스트": options,
        "상세미디어목록": media_list
    }

# [4] 리뷰 파싱
def parse_reviews(html, max_count=10):
    soup = BeautifulSoup(html, "html.parser")
    reviews = []
    poll_data = {}

    ul = soup.find("ul", id="gdasList")
    if not ul:
        return reviews

    for li in ul.find_all("li", recursive=False)[:max_count]:
        reviewer = li.select_one("p.info_user a.id")
        rating = li.select_one("span.point")
        score_span = li.select_one("div.score_area span.point")
        date = li.select_one("span.date")
        opt = li.select_one("p.item_option")
        txt = li.select_one("div.txt_inner")
        rec = li.select_one(".recom_area span.num")

        review_data = {
            "리뷰어": reviewer.get_text(strip=True) if reviewer else "",
            "평점": re.search(r"([\d\.]+)점", rating.get_text(strip=True)).group(1) if rating else "",
            "최대평점": re.search(r"(\d+)점만점에", score_span.get_text(strip=True)).group(1) if score_span else "",
            "날짜": date.get_text(strip=True) if date else "",
            "옵션": opt.get_text(strip=True).replace("[옵션]", "") if opt else "",
            "본문": txt.get_text(" ", strip=True) if txt else "",
            "추천수": rec.get_text(strip=True) if rec else "0",
            "태그": [s.get_text(strip=True) for s in li.select(".review_tag span")],
            "사용자 피부 정보": [s.get_text(strip=True) for s in li.select("p.tag span")]
        }

        # [설문 블록 정보 추출] (예: 발색력 아주 만족해요 등)
        poll_dl_tags = li.select("div.poll_sample dl.poll_type1")
        for dl in poll_dl_tags:
            dt_tag = dl.select_one("dt span")
            dd_tag = dl.select_one("dd span")
            if dt_tag and dd_tag:
                title = dt_tag.get_text(strip=True)
                value = dd_tag.get_text(strip=True)
                review_data[title] = value

        reviews.append(review_data)
    return reviews

# [5] 상품 + 리뷰 + 설문 크롤링
def crawl_products_and_reviews(urls, max_review_count=1000, headless=True):
    driver = setup_driver(headless)
    product_data, review_data = [], []

    for idx, url in enumerate(urls, 1):
        print(f"[{idx}/{len(urls)}] 크롤링 중: {url}")
        driver.get(url)
        time.sleep(1)

        prod_info = {}
        if click_tab(driver, "구매정보"):
            prod_info = parse_product_detail(driver.page_source)

        if click_tab(driver, "리뷰"):
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # 피부타입 분포 (발색력, 지속력 등)
            for dl in soup.select("dl.poll_type2.type3"):
                dt = dl.select_one("dt span")
                dd = dl.select_one("dd")
                if dt and dd:
                    title = dt.get_text(strip=True)
                    dist = {}
                    for li in dd.select("li"):
                        label = li.select_one("span")
                        percent = li.select_one("em")
                        if label and percent:
                            dist[label.get_text(strip=True)] = percent.get_text(strip=True)
                    if dist:
                        prod_info[title] = json.dumps({title: dist}, ensure_ascii=False)

            total_reviews, current_page = 0, 1
            while total_reviews < max_review_count:
                time.sleep(1)
                reviews = parse_reviews(driver.page_source, max_count=10)
                if not reviews:
                    break
                for rv in reviews:
                    review_data.append({"상품명": prod_info.get("상품명", ""), "리뷰순번": total_reviews + 1, **rv})
                    total_reviews += 1
                    if total_reviews >= max_review_count:
                        break
                try:
                    next_button = driver.find_element(By.CSS_SELECTOR, f'div.pageing a[data-page-no="{current_page + 1}"]')
                    driver.execute_script("arguments[0].click();", next_button)
                    current_page += 1
                except:
                    break

        product_data.append(prod_info)

    driver.quit()

    product_df = pd.DataFrame(product_data)
    review_df = pd.DataFrame(review_data)

    product_df["상세미디어목록"] = product_df["상세미디어목록"].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else "[]")
    product_df["옵션리스트"] = product_df["옵션리스트"].apply(lambda x: json.dumps(x, ensure_ascii=False) if x else "[]")
    review_df["태그"] = review_df["태그"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
    review_df["사용자 피부 정보"] = review_df["사용자 피부 정보"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)

    
    return product_df, review_df

# [6] 실행 (에러 발생 시 이어서 수행)
csv_files = sorted(glob.glob(os.path.join(folder_path, "*_상품URL목록.csv")))

for csv_path in csv_files:
    try:
        category_full = os.path.splitext(os.path.basename(csv_path))[0].replace("_상품URL목록", "")
        print(f"\n▶ 카테고리 시작: {category_full}")

        df_urls = pd.read_csv(csv_path)
        urls = df_urls["url"].dropna().unique().tolist()[:1]

        url_batch_size = 120
        total_batches = len(urls) // url_batch_size + (1 if len(urls) % url_batch_size > 0 else 0)

        for i in range(0, len(urls), url_batch_size):
            url_batch = urls[i:i + url_batch_size]
            url_batch_no = (i // url_batch_size) + 1

            # 이미 저장된 파일이 있다면 건너뜀
            product_path = f"{save_dir}/{category_full}_{url_batch_no}_상품정보.csv"
            review_path = f"{save_dir}/{category_full}_{url_batch_no}_리뷰정보.csv"
            if os.path.exists(product_path) and os.path.exists(review_path):
                print(f"  이미 완료된 Batch {url_batch_no}, 건너뜀.")
                continue

            print(f"  └─ URL Batch {url_batch_no} ({len(url_batch)}개) 크롤링 중...")
            product_df, review_df = crawl_products_and_reviews(url_batch, max_review_count=5, headless=True)

            product_df.to_csv(product_path, index=False, encoding="utf-8-sig")
            review_df.to_csv(review_path, index=False, encoding="utf-8-sig")

            print(f"  저장 완료: {category_full}_{url_batch_no}_상품정보 & 리뷰정보")

    except Exception as e:
        print(f"에러 발생 ({csv_path}): {e}")



▶ 카테고리 시작: 메이크업_립메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000209460
  저장 완료: 메이크업_립메이크업_1_상품정보 & 리뷰정보

▶ 카테고리 시작: 메이크업_베이스메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000208681
  저장 완료: 메이크업_베이스메이크업_1_상품정보 & 리뷰정보

▶ 카테고리 시작: 메이크업_아이메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000142375
  저장 완료: 메이크업_아이메이크업_1_상품정보 & 리뷰정보


---

In [13]:
import os
import re
import json
import time
import glob
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

folder_path = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리"
save_dir = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업"
os.makedirs(save_dir, exist_ok=True)


# [0] 드라이버 세팅
def setup_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
    return webdriver.Chrome(options=opts)

# [1] 탭 클릭
def click_tab(driver, tab_text):
    try:
        a = driver.find_element(By.XPATH, f"//ul[contains(@class,'prd_detail_tab')]//a[text()='{tab_text}']")
        a.click()
        time.sleep(1)
        return True
    except:
        for a in driver.find_elements(By.CSS_SELECTOR, ".prd_detail_tab a"):
            if tab_text in a.text:
                a.click()
                time.sleep(1)
                return True
    return False

# [2] 미디어 파싱
def parse_product_media(html):
    soup = BeautifulSoup(html, "html.parser")
    media_urls = []
    for tag in soup.select("div.detail_area *"):
        src = tag.get("src") or tag.get("srcset")
        if src and src.startswith("https://"):
            media_urls.append(src)
    return media_urls

# [3] 제품 상세 파싱
def parse_product_detail(html):
    soup = BeautifulSoup(html, "html.parser")
    name = soup.select_one("p.prd_name")
    brand = soup.select_one("p.prd_brand a")
    old_price = soup.select_one("span.price-1 strike")
    sale_price = soup.select_one("span.price-2 strong")
    img = soup.select_one("#mainImg")
    ing, origin = "", ""

    artc = soup.find("div", id="artcInfo")
    if artc:
        for dl in artc.select("dl.detail_info_list"):
            dt, dd = dl.find("dt"), dl.find("dd")
            if dt and dd:
                title = dt.get_text(strip=True)
                value = dd.get_text(" ", strip=True)
                if "모든 성분" in title:
                    ing = value
                elif "제조국" in title:
                    origin = value

    options = []
    for li in soup.select("ul.sel_option_list > li[optgoodsinfo]"):
        txt = li.select_one("span.txt").get_text(strip=True)
        code, no = li["optgoodsinfo"].split(":")
        lgc = li.find("input", {"name": "gdasLgcGoodsNo"})["value"]
        options.append({"옵션명": txt, "상품코드": code, "아이템번호": no, "lgcGoodsNo": lgc})

    media_list = parse_product_media(html)

    return {
        "상품명": name.get_text(strip=True) if name else "",
        "브랜드": brand.get_text(strip=True) if brand else "",
        "정가": old_price.get_text(strip=True) if old_price else "",
        "할인가": sale_price.get_text(strip=True) if sale_price else "",
        "이미지": img["src"] if img else "",
        "성분": ing, "제조국": origin,
        "옵션개수": len(options),
        "옵션리스트": options,
        "상세미디어목록": media_list
    }

# [4] 리뷰 파싱
def parse_reviews(html, max_count=10):
    soup = BeautifulSoup(html, "html.parser")
    reviews = []
    poll_data = {}

    ul = soup.find("ul", id="gdasList")
    if not ul:
        return reviews

    for li in ul.find_all("li", recursive=False)[:max_count]:
        reviewer = li.select_one("p.info_user a.id")
        rating = li.select_one("span.point")
        score_span = li.select_one("div.score_area span.point")
        date = li.select_one("span.date")
        opt = li.select_one("p.item_option")
        txt = li.select_one("div.txt_inner")
        rec = li.select_one(".recom_area span.num")

        review_data = {
            "리뷰어": reviewer.get_text(strip=True) if reviewer else "",
            "평점": re.search(r"([\d\.]+)점", rating.get_text(strip=True)).group(1) if rating else "",
            "최대평점": re.search(r"(\d+)점만점에", score_span.get_text(strip=True)).group(1) if score_span else "",
            "날짜": date.get_text(strip=True) if date else "",
            "옵션": opt.get_text(strip=True).replace("[옵션]", "") if opt else "",
            "본문": txt.get_text(" ", strip=True) if txt else "",
            "추천수": rec.get_text(strip=True) if rec else "0",
            "태그": [s.get_text(strip=True) for s in li.select(".review_tag span")],
            "사용자 피부 정보": [s.get_text(strip=True) for s in li.select("p.tag span")]
        }

        # [설문 블록 정보 추출] (예: 발색력 아주 만족해요 등)
        poll_dl_tags = li.select("div.poll_sample dl.poll_type1")
        for dl in poll_dl_tags:
            dt_tag = dl.select_one("dt span")
            dd_tag = dl.select_one("dd span")
            if dt_tag and dd_tag:
                title = dt_tag.get_text(strip=True)
                value = dd_tag.get_text(strip=True)
                review_data[title] = value

        reviews.append(review_data)
    return reviews

# [5] 상품 + 리뷰 + 설문 크롤링
def crawl_products_and_reviews(urls, max_review_count=1000, headless=True):
    driver = setup_driver(headless)
    product_data, review_data = [], []

    for idx, url in enumerate(urls, 1):
        print(f"[{idx}/{len(urls)}] 크롤링 중: {url}")
        driver.get(url)
        time.sleep(1)

        prod_info = {}
        if click_tab(driver, "구매정보"):
            prod_info = parse_product_detail(driver.page_source)

        if click_tab(driver, "리뷰"):
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "html.parser")

        # ============================================# 
            # 피부타입 분포 (발색력, 지속력 등)
            # [✔] 상단 요약 설문 블록 추출 (예: 발색력, 가루날림 등)
            poll_summary_div = soup.select_one("div.poll_all.clrfix")
            if poll_summary_div:
                for dl in poll_summary_div.select("dl.poll_type2"):
                    dt_tag = dl.select_one("dt span")
                    dd_tag = dl.select_one("dd")
                    if dt_tag and dd_tag:
                        title = dt_tag.get_text(strip=True)
                        dist = {}
                        for li in dd_tag.select("li"):
                            label = li.select_one("span")
                            percent = li.select_one("em")
                            if label and percent:
                                dist[label.get_text(strip=True)] = percent.get_text(strip=True)
                        if dist:
                            prod_info[title] = json.dumps({title: dist}, ensure_ascii=False)

            total_reviews, current_page = 0, 1
            while total_reviews < max_review_count:
                time.sleep(1)
                reviews = parse_reviews(driver.page_source, max_count=10)
                if not reviews:
                    break
                for rv in reviews:
                    review_data.append({"상품명": prod_info.get("상품명", ""), "리뷰순번": total_reviews + 1, **rv})
                    total_reviews += 1
                    if total_reviews >= max_review_count:
                        break
                try:
                    next_button = driver.find_element(By.CSS_SELECTOR, f'div.pageing a[data-page-no="{current_page + 1}"]')
                    driver.execute_script("arguments[0].click();", next_button)
                    current_page += 1
                except:
                    break

        product_data.append(prod_info)

    driver.quit()

    product_df = pd.DataFrame(product_data)
    review_df = pd.DataFrame(review_data)

    product_df["상세미디어목록"] = product_df["상세미디어목록"].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else "[]")
    product_df["옵션리스트"] = product_df["옵션리스트"].apply(lambda x: json.dumps(x, ensure_ascii=False) if x else "[]")
    review_df["태그"] = review_df["태그"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
    review_df["사용자 피부 정보"] = review_df["사용자 피부 정보"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)

    
    return product_df, review_df

# [6] 실행 (에러 발생 시 이어서 수행)
csv_files = sorted(glob.glob(os.path.join(folder_path, "*_상품URL목록.csv")))

for csv_path in csv_files:
    try:
        category_full = os.path.splitext(os.path.basename(csv_path))[0].replace("_상품URL목록", "")
        print(f"\n▶ 카테고리 시작: {category_full}")

        df_urls = pd.read_csv(csv_path)
        urls = df_urls["url"].dropna().unique().tolist()[:1]

        url_batch_size = 120
        total_batches = len(urls) // url_batch_size + (1 if len(urls) % url_batch_size > 0 else 0)

        for i in range(0, len(urls), url_batch_size):
            url_batch = urls[i:i + url_batch_size]
            url_batch_no = (i // url_batch_size) + 1

            # 이미 저장된 파일이 있다면 건너뜀
            product_path = f"{save_dir}/{category_full}_{url_batch_no}_상품정보.csv"
            review_path = f"{save_dir}/{category_full}_{url_batch_no}_리뷰정보.csv"
            if os.path.exists(product_path) and os.path.exists(review_path):
                print(f"  이미 완료된 Batch {url_batch_no}, 건너뜀.")
                continue

            print(f"  └─ URL Batch {url_batch_no} ({len(url_batch)}개) 크롤링 중...")
            product_df, review_df = crawl_products_and_reviews(url_batch, max_review_count=5, headless=True)

            product_df.to_csv(product_path, index=False, encoding="utf-8-sig")
            review_df.to_csv(review_path, index=False, encoding="utf-8-sig")

            print(f"  저장 완료: {category_full}_{url_batch_no}_상품정보 & 리뷰정보")

    except Exception as e:
        print(f"에러 발생 ({csv_path}): {e}")



▶ 카테고리 시작: 메이크업_립메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000209460
  저장 완료: 메이크업_립메이크업_1_상품정보 & 리뷰정보

▶ 카테고리 시작: 메이크업_베이스메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000208681
  저장 완료: 메이크업_베이스메이크업_1_상품정보 & 리뷰정보

▶ 카테고리 시작: 메이크업_아이메이크업
  └─ URL Batch 1 (1개) 크롤링 중...
[1/1] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000142375
  저장 완료: 메이크업_아이메이크업_1_상품정보 & 리뷰정보


# 별점 문제

In [21]:
import os
import re
import json
import time
import glob
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

folder_path = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리"
save_dir = "/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업"
os.makedirs(save_dir, exist_ok=True)


# [0] 드라이버 세팅
def setup_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
    return webdriver.Chrome(options=opts)

# [1] 탭 클릭
def click_tab(driver, tab_text):
    try:
        a = driver.find_element(By.XPATH, f"//ul[contains(@class,'prd_detail_tab')]//a[text()='{tab_text}']")
        a.click()
        time.sleep(1)
        return True
    except:
        for a in driver.find_elements(By.CSS_SELECTOR, ".prd_detail_tab a"):
            if tab_text in a.text:
                a.click()
                time.sleep(1)
                return True
    return False

# [2] 미디어 파싱
def parse_product_media(html):
    soup = BeautifulSoup(html, "html.parser")
    media_urls = []
    for tag in soup.select("div.detail_area *"):
        src = tag.get("src") or tag.get("srcset")
        if src and src.startswith("https://"):
            media_urls.append(src)
    return media_urls

# [3] 제품 상세 파싱
def parse_product_detail(html):
    soup = BeautifulSoup(html, "html.parser")
    name = soup.select_one("p.prd_name")
    brand = soup.select_one("p.prd_brand a")
    old_price = soup.select_one("span.price-1 strike")
    sale_price = soup.select_one("span.price-2 strong")
    img = soup.select_one("#mainImg")
    ing, origin = "", ""

    artc = soup.find("div", id="artcInfo")
    if artc:
        for dl in artc.select("dl.detail_info_list"):
            dt, dd = dl.find("dt"), dl.find("dd")
            if dt and dd:
                title = dt.get_text(strip=True)
                value = dd.get_text(" ", strip=True)
                if "모든 성분" in title:
                    ing = value
                elif "제조국" in title:
                    origin = value

    options = []
    for li in soup.select("ul.sel_option_list > li[optgoodsinfo]"):
        txt = li.select_one("span.txt").get_text(strip=True)
        code, no = li["optgoodsinfo"].split(":")
        lgc = li.find("input", {"name": "gdasLgcGoodsNo"})["value"]
        options.append({"옵션명": txt, "상품코드": code, "아이템번호": no, "lgcGoodsNo": lgc})

    media_list = parse_product_media(html)

    return {
        "상품명": name.get_text(strip=True) if name else "",
        "브랜드": brand.get_text(strip=True) if brand else "",
        "정가": old_price.get_text(strip=True) if old_price else "",
        "할인가": sale_price.get_text(strip=True) if sale_price else "",
        "이미지": img["src"] if img else "",
        "성분": ing, "제조국": origin,
        "옵션개수": len(options),
        "옵션리스트": options,
        "상세미디어목록": media_list
    }

# [4] 리뷰 파싱
def parse_reviews(html, max_count=10):
    soup = BeautifulSoup(html, "html.parser")
    reviews = []
    poll_data = {}

    ul = soup.find("ul", id="gdasList")
    if not ul:
        return reviews

    for li in ul.find_all("li", recursive=False)[:max_count]:
        reviewer = li.select_one("p.info_user a.id")
        rating = li.select_one("span.point")
        score_span = li.select_one("div.score_area span.point")
        date = li.select_one("span.date")
        opt = li.select_one("p.item_option")
        txt = li.select_one("div.txt_inner")
        rec = li.select_one(".recom_area span.num")

        rating_tag = li.select_one("span.point")
        rating_text = rating_tag.get_text(strip=True) if rating_tag else ""

        # 기본값 설정
        rating = ""
        max_score = ""

        # 평점 텍스트에서 정규식 추출
        match = re.search(r"(\d+)점만점에\s*(\d+)점", rating_text)
        if match:
            max_score = match.group(1)
            rating = match.group(2)


        review_data = {
            "리뷰어": reviewer.get_text(strip=True) if reviewer else "",
            "평점": rating,
            "최대평점": max_score,
            "날짜": date.get_text(strip=True) if date else "",
            "옵션": opt.get_text(strip=True).replace("[옵션]", "") if opt else "",
            "본문": txt.get_text(" ", strip=True) if txt else "",
            "추천수": rec.get_text(strip=True) if rec else "0",
            "태그": [s.get_text(strip=True) for s in li.select(".review_tag span")],
            "사용자 피부 정보": [s.get_text(strip=True) for s in li.select("p.tag span")]
        }

        # [설문 블록 정보 추출] (예: 발색력 아주 만족해요 등)
        poll_dl_tags = li.select("div.poll_sample dl.poll_type1")
        for dl in poll_dl_tags:
            dt_tag = dl.select_one("dt span")
            dd_tag = dl.select_one("dd span")
            if dt_tag and dd_tag:
                title = dt_tag.get_text(strip=True)
                value = dd_tag.get_text(strip=True)
                review_data[title] = value

        reviews.append(review_data)
    return reviews

# [5] 상품 + 리뷰 + 설문 크롤링
def crawl_products_and_reviews(urls, max_review_count=1000, headless=True):
    driver = setup_driver(headless)
    product_data, review_data = [], []

    for idx, url in enumerate(urls, 1):
        print(f"[{idx}/{len(urls)}] 크롤링 중: {url}")
        driver.get(url)
        time.sleep(1)

        prod_info = {}
        if click_tab(driver, "구매정보"):
            prod_info = parse_product_detail(driver.page_source)

        if click_tab(driver, "리뷰"):
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "html.parser")

        # ============================================# 
            # 피부타입 분포 (발색력, 지속력 등)
            # [✔] 상단 요약 설문 블록 추출 (예: 발색력, 가루날림 등)
            poll_summary_div = soup.select_one("div.poll_all.clrfix")
            if poll_summary_div:
                for dl in poll_summary_div.select("dl.poll_type2"):
                    dt_tag = dl.select_one("dt span")
                    dd_tag = dl.select_one("dd")
                    if dt_tag and dd_tag:
                        title = dt_tag.get_text(strip=True)
                        dist = {}
                        for li in dd_tag.select("li"):
                            label = li.select_one("span")
                            percent = li.select_one("em")
                            if label and percent:
                                dist[label.get_text(strip=True)] = percent.get_text(strip=True)
                        if dist:
                            prod_info[title] = json.dumps({title: dist}, ensure_ascii=False)

            total_reviews, current_page = 0, 1
            while total_reviews < max_review_count:
                time.sleep(1)
                reviews = parse_reviews(driver.page_source, max_count=10)
                if not reviews:
                    break
                for rv in reviews:
                    review_data.append({"상품명": prod_info.get("상품명", ""), "리뷰순번": total_reviews + 1, **rv})
                    total_reviews += 1
                    if total_reviews >= max_review_count:
                        break
                try:
                    next_button = driver.find_element(By.CSS_SELECTOR, f'div.pageing a[data-page-no="{current_page + 1}"]')
                    driver.execute_script("arguments[0].click();", next_button)
                    current_page += 1
                except:
                    break

        product_data.append(prod_info)

    driver.quit()

    product_df = pd.DataFrame(product_data)
    review_df = pd.DataFrame(review_data)

    product_df["상세미디어목록"] = product_df["상세미디어목록"].apply(lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else "[]")
    product_df["옵션리스트"] = product_df["옵션리스트"].apply(lambda x: json.dumps(x, ensure_ascii=False) if x else "[]")
    review_df["태그"] = review_df["태그"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
    review_df["사용자 피부 정보"] = review_df["사용자 피부 정보"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)

    
    return product_df, review_df

# [6] 실행 (에러 발생 시 이어서 수행)
csv_files = sorted(glob.glob(os.path.join(folder_path, "*_상품URL목록.csv")))

for csv_path in csv_files:
    try:
        category_full = os.path.splitext(os.path.basename(csv_path))[0].replace("_상품URL목록", "")
        print(f"\n▶ 카테고리 시작: {category_full}")

        df_urls = pd.read_csv(csv_path)
        urls = df_urls["url"].dropna().unique().tolist()

        url_batch_size = 100
        total_batches = len(urls) // url_batch_size + (1 if len(urls) % url_batch_size > 0 else 0)

        for i in range(0, len(urls), url_batch_size):
            url_batch = urls[i:i + url_batch_size]
            url_batch_no = (i // url_batch_size) + 1

            # 이미 저장된 파일이 있다면 건너뜀
            product_path = f"{save_dir}/{category_full}_{url_batch_no}_상품정보.csv"
            review_path = f"{save_dir}/{category_full}_{url_batch_no}_리뷰정보.csv"
            if os.path.exists(product_path) and os.path.exists(review_path):
                print(f"  이미 완료된 Batch {url_batch_no}, 건너뜀.")
                continue

            print(f"  └─ URL Batch {url_batch_no} ({len(url_batch)}개) 크롤링 중...")
            product_df, review_df = crawl_products_and_reviews(url_batch, max_review_count=5, headless=True)

            product_df.to_csv(product_path, index=False, encoding="utf-8-sig")
            review_df.to_csv(review_path, index=False, encoding="utf-8-sig")

            print(f"  저장 완료: {category_full}_{url_batch_no}_상품정보 & 리뷰정보")

    except Exception as e:
        print(f"에러 발생 ({csv_path}): {e}")



▶ 카테고리 시작: 메이크업_립메이크업
  └─ URL Batch 1 (3개) 크롤링 중...
[1/3] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000209460
에러 발생 (/Users/dayeon/dayeoncode/kite/한경토스_강의자료/최종프로젝트/메이크업 카테고리/메이크업_립메이크업_상품URL목록.csv): ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

▶ 카테고리 시작: 메이크업_베이스메이크업
  └─ URL Batch 1 (3개) 크롤링 중...
[1/3] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000208681
[2/3] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000162000
[3/3] 크롤링 중: https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000186913


KeyboardInterrupt: 