In [68]:
import json
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Number of products requested per best-seller query (sent as the API's "pageSize").
ITEM_COUNTS = 100

In [69]:
# HTTP headers passed to every requests.post call in this notebook.
# The display API key can be supplied through the WCONCEPT_DISPLAY_API_KEY
# environment variable so that a key does not have to be edited into the notebook.
# NOTE(review): the fallback literal is the key that used to be hardcoded here;
# if it turns out to be a private credential, rotate it and drop the fallback.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    'Display-Api-Key': os.environ.get(
        'WCONCEPT_DISPLAY_API_KEY',
        'VWmkUPgs6g2fviPZ5JQFQ3pERP4tIXv/J2jppLqSRBk=',
    ),
}

In [70]:
# Depth-2 ("middle") category codes to crawl: the twelve consecutive codes
# 10101201 .. 10101212, kept as strings because they are sent as "depth2Code".
middle_category_nums = [str(10101200 + offset) for offset in range(1, 13)]

In [71]:
def get_item_cds(middle_category_num, gender, timeout=10):
    """Fetch the item codes of the daily best-seller list for one middle category.

    Args:
        middle_category_num: Depth-2 category code, sent as ``depth2Code``.
        gender: ``'men'`` requests the men's list; any other value requests
            the women's list.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list holding the ``itemCd`` of every entry in the response's
        ``data.content`` array.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://api-display.wconcept.co.kr/display/api/v2/best/products'

    genderType = 'men' if gender == 'men' else 'women'

    data = {
        "custNo": "",
        "dateType": "daily",
        # NOTE(review): "domain" stays 'WOMEN' even when genderType is 'men' --
        # confirm this is what the API expects.
        "domain": 'WOMEN',
        "genderType": genderType,
        "depth1Code": "10101",
        "depth2Code": middle_category_num,
        "pageNo": 1,
        "pageSize": ITEM_COUNTS,
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    response.raise_for_status()

    # Decode the JSON body directly. Routing it through an HTML parser can
    # alter or drop the text when the payload contains markup-like characters.
    info = response.json()['data']['content']

    return [item['itemCd'] for item in info]

In [72]:
def get_item_payloads(item_cd, timeout=10):
    """Look up the category metadata that the review endpoint needs for one product.

    Args:
        item_cd: Item code, sent as the ``itemcds`` form field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``[itemCd, mediumCd, categoryCd, itemTypeCd]`` read from the first
        product in the response; the two category fields come from that
        product's first ``category`` entry.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        IndexError, KeyError: If the response has no product, no category
            entry, or lacks one of the expected fields.
    """
    url = 'https://www.wconcept.co.kr/Ajax/GetProductsInfo'

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url, headers=headers, data={'itemcds': item_cd},
                             timeout=timeout)
    response.raise_for_status()

    # Decode the JSON body directly instead of passing it through an HTML
    # parser, which can alter or drop text that looks like markup.
    info = response.json()[0]

    item_cd = info['itemCd']
    first_category = info['category'][0]
    item_payloads = [
        item_cd,
        first_category['mediumCd'],
        first_category['categoryCd'],
        info['itemTypeCd'],
    ]

    print(item_payloads)

    return item_payloads

In [85]:
def get_one_review(i, soup, itemCd):
    """Extract the i-th review of a parsed review-list page as a flat dict.

    Args:
        i: Zero-based position of the review on the page.
        soup: Parsed review-list page; only its ``select`` method is used.
        itemCd: Item code stored alongside the review.

    Returns:
        A dict with the keys ``itemCd``, ``option``, ``cust_size_info``,
        ``size``, ``color``, ``texture``, ``user_id``, ``time``, ``content``,
        ``rating`` and ``favorite``. ``option`` and ``cust_size_info`` are
        ``None`` when missing; ``size``/``color``/``texture`` are all ``None``
        unless all three evaluation values are present.

    Raises:
        IndexError: If the page has no i-th info block, user id, timestamp,
            review text, star grade or like button.
    """
    # Block holding the purchase option and the customer's size information.
    review_info = soup.select('div.pdt_review_info')[i]
    option_lines = review_info.select('div.pdt_review_option > p')

    # Purchase option: the text after the first colon of the first line.
    # Only "element missing" errors are treated as "no value"; anything else
    # (including KeyboardInterrupt) propagates instead of being swallowed.
    try:
        option = option_lines[0].text.strip().split(':')[1].strip()
    except (IndexError, AttributeError):
        option = None

    # Customer size information: the text after the first colon of the second line.
    try:
        cust_size_info = option_lines[1].text.strip().split(':')[1].strip()
    except (IndexError, AttributeError):
        cust_size_info = None

    # Size, color and material evaluations.
    # NOTE(review): the evaluation lists are indexed page-wide, so if some
    # review has no evaluation list the i-th list may belong to a different
    # review -- verify against the page markup.
    try:
        sku = soup.select('ul.product_review_evaluation')[i]
        grades = sku.select('ul.product_review_evaluation > li > div > em')
        size, color, texture = grades[0].text, grades[1].text, grades[2].text
    except (IndexError, AttributeError):
        size, color, texture = None, None, None

    # User id
    user_id = soup.select('p.product_review_info_right > em')[i].text

    # Time the review was written (named so it does not shadow the time module).
    written_at = soup.select('p.product_review_info_right > span')[i].text

    # Review text
    content = soup.select('p.pdt_review_text')[i].text.strip()

    # Rating: the first number found in the star element's markup divided by 20.
    # NOTE(review): assumes that number is the style width percentage
    # (100 -> 5 stars) -- confirm against the page markup.
    rating_pct = soup.select('div.star-grade > strong[style]')[i]
    rating = int(re.findall(r'\d+', str(rating_pct))[0]) // 20

    # Number of likes (kept as the button's raw text).
    favorite = soup.select('button.like.btn_review_recommend')[i].text

    return {
        'itemCd': itemCd,
        'option': option,
        'cust_size_info': cust_size_info,
        'size': size,
        'color': color,
        'texture': texture,
        'user_id': user_id,
        'time': written_at,
        'content': content,
        'rating': rating,
        'favorite': favorite,
    }

In [86]:
def get_reviews(item_payload, timeout=10):
    """Collect every review of one product by walking the paginated review list.

    Pages are requested starting at index 1 until a page contains no
    ``p.pdt_review_text`` element.

    Args:
        item_payload: ``[itemCd, mediumCd, categoryCd, itemTypeCd]``, the
            list produced by ``get_item_payloads``.
        timeout: Seconds to wait for the server on each page request.

    Returns:
        A list of review dicts (one per ``get_one_review`` call); empty when
        the first page has no reviews.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = 'https://www.wconcept.co.kr/Ajax/ProductReViewList'
    one_goods_reviews = []
    page_index = 1

    while True:
        data = {
            'itemcd': item_payload[0],
            'pageIndex': page_index,
            'order': 1,
            'IsPrdCombinOpt': 'N',
            'mediumcd': item_payload[1],
            'categorycd': item_payload[2],
            'itemtypecd': item_payload[3],
        }

        # A timeout keeps one stalled connection from hanging the whole crawl.
        # NOTE(review): the HTTP status is deliberately not checked here; a
        # non-2xx reply is parsed like any page and ends the loop if it holds
        # no reviews -- confirm how the server answers past the last page.
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')

        review_count = len(soup.select('p.pdt_review_text'))
        if review_count == 0:
            # A page without review texts marks the end of the list.
            break

        for j in range(review_count):
            review = get_one_review(j, soup, item_payload[0])
            one_goods_reviews.append(review)
            print(review)

        page_index += 1

    return one_goods_reviews
        

In [87]:
def main():
    """Crawl the reviews of the best-seller products of every configured category.

    Item codes are gathered for each gender and middle category, each code is
    resolved to its review payload, and the reviews of every product are
    downloaded.

    Returns:
        A list with the review dicts of all crawled products. (Previously
        this list was built and then discarded.)
    """
    item_cds_list = []
    for gender in ['men', 'women']:
        for middle_category_num in middle_category_nums:
            item_cds_list += get_item_cds(middle_category_num, gender)

    # The same product can show up in several best lists; keep the first
    # occurrence only so it is not crawled (and its reviews stored) twice.
    item_cds_list = list(dict.fromkeys(item_cds_list))
    print(item_cds_list)

    item_payloads_list = [get_item_payloads(item_cd) for item_cd in item_cds_list]

    all_reviews = []
    for item_payloads in item_payloads_list:
        # Extending with an empty list is a no-op, so products without
        # reviews need no special case.
        all_reviews += get_reviews(item_payloads)

    return all_reviews
            

In [88]:
# Start the crawl when this cell (or the exported script) is run directly.
if __name__ == '__main__':
    main()

['301515278', '300605081', '303279261', '305788464', '305780130', '305776033', '300515072', '303355452', '305777670', '302280251', '302746077', '302138248', '301705854', '301269588', '302480752', '304473041', '305691740', '305727041', '301712935', '305811864', '305773508', '305773506', '303665161', '302251100', '305691723', '305785974', '302247616', '305694537', '305785964', '305727031', '301876557', '304473119', '305408927', '305408925', '303452844', '301047578', '305352957', '305743625', '302256748', '303355449', '305782065', '305821486', '303788016', '305771126', '301840142', '304204656', '301857136', '305430643', '302329396', '302496378', '305716777', '305283795', '305283705', '305283667', '302329389', '302745862', '305686510', '305741222', '305682443', '301656756', '301286692', '302746672', '302800679', '305773539', '305719357', '305733541', '303545024', '303583334', '302751987', '302793560', '305736864', '303187502', '301419452', '303437256', '302725849', '302906184', '302717027'

ConnectionError: HTTPSConnectionPool(host='www.wconcept.co.kr', port=443): Max retries exceeded with url: /Ajax/ProductReViewList (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001F4C13F4950>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))