In [1]:
import time
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json

# Number of products requested per category: get_items interpolates this value
# into the `pageSize` query parameter of the category listing URL.
ITEMS_COUNT = 100

In [2]:
# HTTP headers that get_items and get_review_data hand to get_response, which
# forwards them to requests.get. Only a browser-like User-Agent is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    
}

# Used as the `timeout` argument of requests.get inside get_response.
# requests treats a 2-tuple as (connect timeout, read timeout), in seconds.
timeout_settings = (60, 60)

In [3]:
# Category numbers that main() iterates over. Each one is passed to get_items,
# which places it in the `dispCtgNo` query parameter of the listing URL.
middle_category_nums = ['367', '596', '2074', '595', '980', '981', 
                        '1477', '657', '1004', '2080', '1003', '1002',
                        '2224']

In [4]:
def get_response(url, headers):
    """Fetch ``url`` with a GET request and return the body parsed by BeautifulSoup.

    Args:
        url: Address to request.
        headers: HTTP headers sent along with the request.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.

    The module-level ``timeout_settings`` is used as the request timeout, and
    the function sleeps for 0.5 seconds after each request before returning.
    """
    # Issue the GET request.
    http_reply = requests.get(url, headers=headers, timeout=timeout_settings)
    parsed = BeautifulSoup(http_reply.text, 'lxml')

    # Pause so that consecutive calls are spaced out.
    time.sleep(0.5)
    return parsed


In [5]:
def get_item_info(goods):
    """Pick the product id and the review count out of one product record.

    Args:
        goods: Record indexable by the keys ``'goodsNo'`` and ``'goodsRevCnt'``.

    Returns:
        A two-element list ``[goodsNo, goodsRevCnt]``.

    Raises:
        KeyError: If ``goods`` is a dict lacking either key.
    """
    wanted_keys = ('goodsNo', 'goodsRevCnt')  # product id, review count
    return [goods[key] for key in wanted_keys]

In [6]:
def get_items(num):
    """Collect ``[goodsNo, goodsRevCnt]`` pairs for one category.

    Requests page 1 of the category listing endpoint with ``pageSize`` set to
    the module-level ``ITEMS_COUNT``, reads the parsed document's ``.string``
    as JSON, and walks ``payload.goodsList``.

    Args:
        num: Category number placed in the ``dispCtgNo`` query parameter.

    Returns:
        A list with one ``get_item_info`` result per listed product.

    Each product is also printed together with its 1-based running count.
    """
    url = f'https://www.thehandsome.com/api/display/1/ko/category/categoryGoodsList?dispMediaCd=10&sortGbn=20&pageSize={ITEMS_COUNT}&pageNo=1&norOutletGbCd=J&dispCtgNo={num}&productListLayout=4&theditedYn=N'

    # The endpoint answers with JSON; it is recovered from the soup's text node.
    listing = json.loads(get_response(url, headers).string)

    collected = []
    for position, goods in enumerate(listing['payload']['goodsList'], start=1):
        item = get_item_info(goods)
        collected.append(item)
        print(position, item)

    return collected

In [7]:
def _profile_value(review, index):
    """Return ``mbrPrfleValNm`` of the ``index``-th ``revPrfleList`` entry, or ``None``."""
    try:
        return review['revPrfleList'][index]['mbrPrfleValNm']
    except (KeyError, IndexError, TypeError):
        # Missing key, list shorter than expected, or a null / non-subscriptable
        # value. Anything else (e.g. KeyboardInterrupt) must propagate.
        return None


def get_one_review(review):
    """Flatten one raw review record into the dict layout used by this notebook.

    Args:
        review: Raw review record. The keys ``goodsNo``, ``revNo``,
            ``revScrVal``, ``revWrtDtm``, ``loginId``, ``revCont``,
            ``goodsClorNm``, ``goodsSzNm`` and ``shopNm`` are required;
            ``revPrfleList`` is optional.

    Returns:
        A dict with the keys ``product_id``, ``review_id``, ``rating``,
        ``written_date``, ``user_id``, ``body``, ``product_sku`` (a dict with
        ``color`` and ``size``), ``import_source``, ``user_height`` and
        ``user_weight``. The last two are ``None`` when the corresponding
        profile entry is unavailable.

    Raises:
        KeyError: If one of the required keys is missing from a dict record.
    """
    return {
        'product_id': review['goodsNo'],        # product id
        'review_id': review['revNo'],           # review id
        'rating': review['revScrVal'],          # star count
        'written_date': review['revWrtDtm'],    # date written
        'user_id': review['loginId'],           # user id
        'body': review['revCont'],              # review text
        # Purchased SKU: colour and size of the bought item.
        'product_sku': {
            'color': review['goodsClorNm'],
            'size': review['goodsSzNm'],
        },
        'import_source': review['shopNm'],
        # Presumably the first profile entry is the reviewer's height and the
        # second their usual size; this is positional, so verify against the API.
        'user_height': _profile_value(review, 0),
        # NOTE(review): this holds the second profile entry ("usual size" per
        # the original comment), not a weight. The key name is kept so that
        # existing consumers of this dict keep working.
        'user_weight': _profile_value(review, 1),
    }

In [8]:
def get_review_data(goodsNo, goodsRevCnt):
    """Fetch the reviews of one product and convert them with ``get_one_review``.

    All reviews are requested on a single page by using ``goodsRevCnt`` as the
    ``pageSize`` query parameter, sorted by latest.

    Args:
        goodsNo: Product id inserted into the review endpoint URL.
        goodsRevCnt: Review count reported for the product. When it is falsy
            (``0`` or ``None``) no request is made.

    Returns:
        A list of review dicts. It is empty when the product has no reviews or
        the response could not be parsed, and may be partial if a record fails
        to convert midway.

    The function prints the number of collected reviews together with the
    reviews themselves, plus a diagnostic line when parsing fails.
    """
    one_goods_reviews = []

    # A product without reviews needs neither the request nor the pause that
    # get_response adds; asking for pageSize=0 cannot return any record.
    if goodsRevCnt:
        url = f'https://www.thehandsome.com/api/goods/1/ko/goods/{goodsNo}/reviews?sortTypeCd=latest&revGbCd=&pageSize={goodsRevCnt}&pageNo=1'
        soup = get_response(url, headers)

        try:
            payload = json.loads(soup.string)['payload']
            for review in payload['revAllList']:
                one_goods_reviews.append(get_one_review(review))
        except (ValueError, KeyError, TypeError) as exc:
            # Non-JSON body (ValueError / TypeError when .string is None) or an
            # unexpected payload shape. Keep whatever was collected so far, but
            # report the failure instead of hiding it; KeyboardInterrupt and
            # other unrelated exceptions are deliberately not caught.
            print(f'{goodsNo} review parsing failed: {exc!r}')

    print(f'{goodsNo} 상품 리뷰 수: {len(one_goods_reviews)}', one_goods_reviews)

    return one_goods_reviews

    

In [9]:
def main():
    """Scrape the reviews of every product in ``middle_category_nums``.

    First collects ``[goodsNo, goodsRevCnt]`` pairs for each category through
    ``get_items``, then fetches the reviews of each product through
    ``get_review_data``.

    Returns:
        A flat list containing the review dicts of all products, in the order
        they were fetched. Returning it lets the caller keep the scraped data,
        which would otherwise be dropped when the function ends.
    """
    all_info = []
    for num in middle_category_nums:
        all_info.extend(get_items(num))

    all_reviews = []
    for goodsNo, goodsRevCnt in all_info:
        all_reviews.extend(get_review_data(goodsNo, goodsRevCnt))

    return all_reviews


In [10]:
# Run the full scrape when this code is executed as the main module
# (this includes running the cell inside a notebook kernel).
if __name__ == '__main__':
    main()  

1 ['TM2E5KCDC81W', 30]
2 ['SY2E3WJMTE1WM2', 67]
3 ['CM2E3WJC158WP1', 9]
4 ['TM2E4KCD261W', 24]
5 ['LC2E3WOT445WM2', 6]
6 ['LC2E3KCD654W', 22]
7 ['SY2E1WOT718W', 6]
8 ['CM2E3KCD526W', 82]
9 ['TM2E1WJC823WP6', 14]
10 ['SY2E0WJM652WM2', 6]
11 ['MN2E4WJC467W', 8]
12 ['MN2E5WOT584W', 6]
13 ['CM2E4KOT531WM1', 27]
14 ['CM2E5KCD939W', 13]
15 ['CM2E5KOT545W', 9]
16 ['SY2E3WJMTE1W', 46]
17 ['IL2E4KOT244W', 0]
18 ['IL2E4KVT341N', 21]
19 ['LC2E4WJC965W', 2]
20 ['MN2E5WJC482W', 0]
21 ['SJ2E4WJC232W', 0]
22 ['SY2E4WJC515W', 1]
23 ['TM2E3NOT950WP1', 0]
24 ['TM2E4KCDB61W', 3]
25 ['CM2E3KCD522W', 33]
26 ['CM2E4KCD013W', 27]
27 ['CM2E4KCD527W', 9]
28 ['CM2E5KOT539W', 1]
29 ['LC2E4KCD666W', 1]
30 ['OB2E4KCD737W', 3]
31 ['SJ2E4KOTQ49W', 7]
32 ['SY2E3WJMTE1WM4', 1]
33 ['TM2E4WOTB61WM1', 0]
34 ['CM2E4WOT123W', 12]
35 ['CM2E5KOT543W', 31]
36 ['LC2E4WJC967W', 0]
37 ['LC2E4WOT466W', 10]
38 ['O22E4WJM596W', 5]
39 ['TM2E3KCD241W', 26]
40 ['TM2E3WJC842W', 40]
41 ['CM2E3KCD927W', 11]
42 ['CM2E4WOT128W', 2]
43 ['CS

KeyboardInterrupt: 