In [1]:
import time
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests.exceptions import Timeout
import xml.etree.ElementTree as ET
import json

ITEMS_COUNT = 100

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    
}

timeout_settings = (60, 60)

In [3]:
middle_category_nums = ['367', '596', '2074', '595', '980', '981', 
                        '1477', '657', '1004', '2080', '1003', '1002',
                        '2224']

In [4]:
def get_response(url, headers):

    # GET 요청
    response = requests.get(url, headers=headers, timeout=timeout_settings)
    soup = BeautifulSoup(response.text, 'lxml')
    time.sleep(0.5)

    return soup


In [5]:
def get_item_info(goods):
    # 상품 id
    goodsNo = goods['goodsNo']

    # 브랜드 명
    brandNm = goods['brandNm']

    # 정가
    norPrc = goods['norPrc']

    # 할인가
    salePrc = goods['salePrc']

    # 이미지    
    image_urls = []      
    try:    
        for i in range(len(goods['colorInfo'])):
            for j in range(len(goods['colorInfo'][i]['colorContInfo'])):
                colorContInfo = goods['colorInfo'][i]['colorContInfo'][j]['dispGoodsContUrl']
                image_urls.append(colorContInfo)
    except:
        pass

    # 색상 정보
    colors = []
    try:
        for color in goods['colorInfo']:
            colors.append(color['optnNm'])
    except:
        pass

    # 사이즈 정보
    sizes = []
    for i in range(len(goods['colorInfo'][0]['colorSizeInfo'])):
        size_info = goods['colorInfo'][0]['colorSizeInfo'][i]['erpSzCd']
        sizes.append(size_info)

    # 리뷰 갯수
    goodsRevCnt = goods['goodsRevCnt']

    data = {
        'goodsNo': goodsNo,
        'brandNm': brandNm,
        'norPrce': norPrc,
        'salePrc': salePrc,
        'image_urls': image_urls,
        'colors': colors,
        'sizes': sizes,
        'goodsRevCnt': goodsRevCnt
    }

    return data

In [6]:
def get_items(num):
    url = f'https://www.thehandsome.com/api/display/1/ko/category/categoryGoodsList?dispMediaCd=10&sortGbn=20&pageSize={ITEMS_COUNT}&pageNo=1&norOutletGbCd=J&dispCtgNo={num}&productListLayout=4&theditedYn=N'

    goods_info = []

    soup = get_response(url, headers)
    info = soup.string
    goods_in_page = json.loads(info)['payload']['goodsList']

    for goods in goods_in_page:
        
        goods_data = get_item_info(goods)
        goods_info.append(goods_data)
        print(len(goods_info), goods_data)


    return goods_info

In [7]:
def get_one_review(review):
    
    # 상품 id
    goodsNo = review['goodsNo']

    # 내용
    revCont = review['revCont']

    # 키
    try:
        height = review['revPrfleList'][0]['mbrPrfleValNm']
    except:
        height = None
    
    # 평소 사이즈
    try:
        nor_size = review['revPrfleList'][1]['mbrPrfleValNm']
    except:
        nor_size = None

    # 체형 정보
    form_info = {'height': height,
                 'nor_size': nor_size}

    data = {
        'goodsNo': goodsNo,
        'revCont': revCont,
        'form_info': form_info
    }

    return data

In [23]:
def get_review_data(goodsNo, goodsRevCnt):
    url = f'https://www.thehandsome.com/api/goods/1/ko/goods/{goodsNo}/reviews?sortTypeCd=latest&revGbCd=&pageSize={goodsRevCnt}&pageNo=1'

    one_goods_reviews = []

    soup = get_response(url, headers)
    reviews = soup.string

    try:
        reviews = json.loads(reviews)['payload']['revAllList']

        for review in reviews:
            one_review = get_one_review(review)
            one_goods_reviews.append(one_review)
    except:
        pass
    
    print(f'{goodsNo} 상품 리뷰 수: {len(one_goods_reviews)}')


    return one_goods_reviews

    

In [24]:
def main():
    all_info = []
    for num in middle_category_nums:
        middle_category_info = get_items(num)
        all_info += middle_category_info
    
    goodsNos = []

    for info in all_info:
        goodsNos.append([info['goodsNo'], info['goodsRevCnt']])

    all_reviews = []

    for goodsNo, goodsRevCnt in goodsNos:
        one_goods_reviews = get_review_data(goodsNo, goodsRevCnt)
        all_reviews += one_goods_reviews

    df_all_info = pd.DataFrame(all_info)
    df_all_reviews = pd.DataFrame(all_reviews)

    df_all_info.to_csv('../all_info.csv', index=False)
    df_all_reviews.to_csv('../all_reviews.csv', index=False)

In [25]:
if __name__ == '__main__':
    main()  

1 {'goodsNo': 'SY2E3WJMTE1WM2', 'brandNm': 'SYSTEM', 'norPrce': 375000, 'salePrc': 375000, 'image_urls': ['/SY/2E/SS/SY2E3WJMTE1WM2_BK_C01.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W01.jpg?rs=684X1032', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W02.jpg?rs=684X1032', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W03.jpg?rs=684X1032', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W04.jpg?rs=684X1032', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W05.jpg?rs=684X1032', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W06.jpg?rs=684X1032', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W07.jpg?rs=684X1032', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_S01.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_T01.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_T02.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W01.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W02.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W03.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W04.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W05.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W06.jpg', '/SY/2E/SS/SY2E3WJMTE1WM2_BK_W07.jpg'], 'colors': ['BLACK'], 'sizes': ['82'], 'goodsRevCnt': 41}
2 {'goodsNo': 'TG2E3KCD049HDG', 'brandNm': 'TOM GR

JSONDecodeError: Expecting ',' delimiter: line 1 column 2593 (char 2592)