In [2]:
import time
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json

# Number of products requested per category listing: get_items() interpolates
# this value into the `pageSize` query parameter of the category goods URL.
ITEMS_COUNT = 100

In [3]:
# HTTP headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/124.0.0.0 Safari/537.36'
    ),
}

# Passed as `timeout=` to requests.get(); requests reads a 2-tuple as
# (connect timeout, read timeout) in seconds.
timeout_settings = (60, 60)

In [4]:
def get_response(url, headers):
    """Fetch ``url`` with a GET request and parse the body with BeautifulSoup.

    Args:
        url: Address to request.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``lxml`` parser.
    """
    # Issue the GET request with the module-level timeout configuration.
    page = requests.get(url, headers=headers, timeout=timeout_settings)
    parsed = BeautifulSoup(page.text, 'lxml')

    # Pause half a second after every request before returning.
    time.sleep(0.5)
    return parsed


In [5]:
def get_small_categories():
    """Collect the ``dispCtgNo`` of the small categories from the category API.

    The category-list endpoint is fetched, its text decoded as JSON, and the
    large categories at index 0 and 1 are walked. Inside each, iteration over
    both the mid-category list and every small-category list starts at
    index 1, so the first entry of each list is skipped.

    Returns:
        list: ``dispCtgNo`` values in the order they are encountered.
    """
    url = 'https://pcw.thehandsome.com/api/display/1/ko/category/categoryList?dispMediaCd=10&dispCtgNo=mainDispCtgNo'.replace('dispMediaCd=10&', '')

    raw_json = get_response(url, headers).string
    large_categories = json.loads(raw_json)['payload']['lrgCtgList']

    # Labels ("women", "men") for the two large categories that are visited.
    # NOTE(review): presumably index 0 is women and index 1 is men in the API
    # response - confirm; only the count of labels affects the traversal.
    gender_type = ['여성', '남성']

    # NOTE(review): the skipped first entries are presumably aggregate "all"
    # categories - TODO confirm against the API payload.
    return [
        small['dispCtgNo']
        for large_idx in range(len(gender_type))
        for mid in large_categories[large_idx]['midCtgList'][1:]
        for small in mid['smlCtgList'][1:]
    ]


In [6]:
def get_item_info(goods):
    """Extract the product id and review count from one goods record.

    Args:
        goods: Mapping holding at least the ``goodsNo`` (product id) and
            ``goodsRevCnt`` (number of reviews) keys.

    Returns:
        list: ``[goodsNo, goodsRevCnt]``.

    Raises:
        KeyError: If either key is absent from ``goods``.
    """
    # Product id first, review count second - the order callers unpack.
    wanted_fields = ('goodsNo', 'goodsRevCnt')
    return [goods[field] for field in wanted_fields]

In [7]:
def get_items(num):
    """List the products of one display category with their review counts.

    Requests the first page of the category goods listing, asking for
    ``ITEMS_COUNT`` items, and converts every entry of the returned
    ``goodsList`` with ``get_item_info``. Each converted entry is printed
    together with its running 1-based position.

    Args:
        num: Display category number placed in the ``dispCtgNo`` parameter.

    Returns:
        list: One ``[goodsNo, goodsRevCnt]`` list per product on the page.
    """
    url = f'https://www.thehandsome.com/api/display/1/ko/category/categoryGoodsList?dispMediaCd=10&sortGbn=20&pageSize={ITEMS_COUNT}&pageNo=1&norOutletGbCd=J&dispCtgNo={num}&productListLayout=4&theditedYn=N'

    raw_json = get_response(url, headers).string
    page_goods = json.loads(raw_json)['payload']['goodsList']

    collected = []
    for position, goods in enumerate(page_goods, start=1):
        record = get_item_info(goods)
        collected.append(record)
        # Progress line: running count followed by the extracted record.
        print(position, record)

    return collected

In [8]:
def _profile_value(review, index):
    """Return ``mbrPrfleValNm`` of the ``index``-th reviewer profile entry, or None."""
    try:
        return review['revPrfleList'][index]['mbrPrfleValNm']
    except (KeyError, IndexError, TypeError):
        # Profile list missing, null, too short, or the entry is malformed.
        # Only these lookup failures are tolerated; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return None


def get_one_review(review):
    """Flatten one raw review record into the scraper's output schema.

    Args:
        review: Mapping for a single review as found in the review API's
            ``revAllList``. The keys ``goodsNo``, ``revNo``, ``revScrVal``,
            ``revWrtDtm``, ``loginId``, ``revCont``, ``goodsClorNm``,
            ``goodsSzNm`` and ``shopNm`` are required; ``revPrfleList`` is
            optional.

    Returns:
        dict: Keys ``product_id``, ``review_id``, ``rating``,
        ``written_date``, ``user_id``, ``body``, ``product_sku`` (a dict with
        ``color`` and ``size``), ``import_source``, ``user_height`` and
        ``user_weight``. The last two are ``None`` when the matching profile
        entry is unavailable.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # Product id
    goodsNo = review['goodsNo']

    # Review id
    review_id = review['revNo']

    # Star rating
    rating = review['revScrVal']

    # Date written
    written_date = review['revWrtDtm']

    # Reviewer's login id
    user_id = review['loginId']

    # Review text
    revCont = review['revCont']

    # Purchased SKU: colour and size
    product_sku = {
        'color': review['goodsClorNm'],
        'size': review['goodsSzNm'],
    }

    # Import source (taken from the shop name field)
    import_source = review['shopNm']

    # Height: first reviewer-profile entry
    height = _profile_value(review, 0)

    # Usual size: second reviewer-profile entry
    nor_size = _profile_value(review, 1)

    return {
        'product_id': goodsNo,
        'review_id': review_id,
        'rating': rating,
        'written_date': written_date,
        'user_id': user_id,
        'body': revCont,
        'product_sku': product_sku,
        'import_source': import_source,
        'user_height': height,
        # NOTE(review): this value is described as the reviewer's usual size,
        # yet it is stored under 'user_weight'. The key is kept unchanged so
        # consumers of the output are unaffected - confirm the intended name.
        'user_weight': nor_size,
    }

In [9]:
def get_review_data(goodsNo, goodsRevCnt):
    """Download and flatten the reviews of a single product.

    One request is made to the review endpoint, sorted by latest, with the
    page size set to ``goodsRevCnt`` so that a single page is requested for
    all reviews. Every entry of ``payload.revAllList`` is converted with
    ``get_one_review``.

    The function is best-effort: if the response text is not valid JSON, the
    expected keys are missing, or a review entry is malformed, the failure is
    printed and whatever was converted before the failure is returned.

    Args:
        goodsNo: Product id placed in the URL path.
        goodsRevCnt: Review count used as the ``pageSize`` query parameter.

    Returns:
        list: Review dicts produced by ``get_one_review`` (possibly empty).
    """
    url = f'https://www.thehandsome.com/api/goods/1/ko/goods/{goodsNo}/reviews?sortTypeCd=latest&revGbCd=&pageSize={goodsRevCnt}&pageNo=1'

    one_goods_reviews = []

    soup = get_response(url, headers)
    raw_json = soup.string

    try:
        reviews = json.loads(raw_json)['payload']['revAllList']

        for review in reviews:
            one_goods_reviews.append(get_one_review(review))
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover a
        # missing or null payload and malformed review entries. Report the
        # problem instead of hiding it, but keep the scrape running.
        print(f'{goodsNo} 리뷰 파싱 실패: {exc!r}')

    print(f'{goodsNo} 상품 리뷰 수: {len(one_goods_reviews)}', one_goods_reviews)

    return one_goods_reviews

    

In [10]:
def main():
    """Run the whole scrape: categories, then products, then reviews.

    Every small category returned by ``get_small_categories`` is listed with
    ``get_items``; the reviews of every listed product are then fetched with
    ``get_review_data``.

    Returns:
        list: All review dicts gathered across every product, in fetch order.
    """
    all_info = []
    for num in get_small_categories():
        all_info.extend(get_items(num))

    all_reviews = []
    for goodsNo, goodsRevCnt in all_info:
        all_reviews.extend(get_review_data(goodsNo, goodsRevCnt))

    # Hand the collected reviews back to the caller; without this the result
    # of the entire scrape is dropped when the function exits.
    return all_reviews


In [11]:
# Start the scrape only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

1 ['CM2E3WJC158WP1', 10]
2 ['SY2E1WOT718W', 6]
3 ['TM2E1WJC823WP6', 14]
4 ['MN2E4WJC467W', 8]
5 ['MN2E5WOT584W', 6]
6 ['LC2E4WJC965W', 2]
7 ['MN2E5WJC482W', 0]
8 ['SJ2E4WJC232W', 0]
9 ['SY2E4WJC515W', 1]
10 ['TM2E3NOT950WP1', 0]
11 ['CM2E5KOT539W', 1]
12 ['TM2E4WOTB61WM1', 0]
13 ['LC2E4WJC967W', 0]
14 ['LC2E4WOT466W', 10]
15 ['TM2E3WJC842W', 40]
16 ['LC2E3WOT449W', 13]
17 ['SJ2E1WOTQ04W', 50]
18 ['SY2E1NOT718W', 82]
19 ['SY2E3WOT614W', 3]
20 ['CM2E3WOT126W', 10]
21 ['MN2E5WOT581W', 0]
22 ['MN2E3WJC449W', 4]
23 ['TM2E1WJC823WP3', 2]
24 ['TM2E4WJC863WP1', 3]
25 ['CM2E1NOT103WP1', 20]
26 ['SY2E4WOT619W', 1]
27 ['TM2E1WJC823WP8', 1]
28 ['CM2E4WOT138W', 6]
29 ['IL2E4WJC576W', 1]
30 ['MN2E3WJC445W', 1]
31 ['MN2E3WOT544W', 2]
32 ['MN2E4WJC465W', 4]
33 ['MN2E4WOT563WP1', 4]
34 ['MN2E5WJC481W', 0]
35 ['SJ2E4WOT784W', 0]
36 ['SY2E3KOT410W', 0]
37 ['SY2E3WJS562WP1', 0]
38 ['SY2E3WOT612W', 7]
39 ['TM2E1NOT928W', 20]
40 ['TM2E1WJC821WP5', 1]
41 ['TM2E1WJC825W', 14]
42 ['TM2E3NOT950W', 35]
43 ['TM2E