In [1]:
import time
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json
import re

In [2]:
# Number of product links get_items() gathers for a single category before it stops.
ITEMS_COUNT = 100

In [3]:
# Category codes; each one is interpolated into the listing URL by get_items().
# NOTE(review): '002006', '002007' and '002008' each appear twice in this list —
# confirm whether the later occurrences were meant to be other codes.
middle_category_nums = ['001006', '001004', '001005', '001010', '001002', '001003',
                        '001001', '001011', '001013', '001008', '002022', '002001',
                        '002002', '002025', '002017', '002003', '002020', '002019',
                        '002023', '002018', '002004', '002008', '002007', '002024',
                        '002009', '002013', '002012', '002016', '002021', '002014',
                        '002006', '002015', '003002', '003007', '003008', '003004',
                        '003009', '003005', '003010', '003011', '003006', '002006',
                        '002007', '002008', '022001', '022002', '022003']

In [4]:
# Number of entries in the category list (a repeated code is counted every time it appears).
len(middle_category_nums)

47

In [5]:
# User-Agent header passed to get_response() by the crawl functions
# (a desktop Chrome on Windows string).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}

In [6]:
def get_response(url, headers, timeout=10):
    """Fetch ``url`` with HTTP GET and return the body as a parsed HTML tree.

    Args:
        url: Address to request.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. ``requests``
            has no default timeout, so without one a stalled connection
            blocks the crawl forever.

    Returns:
        BeautifulSoup: The response text parsed with the ``lxml`` parser.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or an
            HTTP 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)

    # Pause after every request so consecutive calls are spaced out.
    time.sleep(0.5)

    # Fail loudly on an error status instead of parsing the error page as content.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'lxml')


In [7]:
def get_items(num, items_count=None):
    """Collect product links from the listing pages of one category.

    Listing pages are requested one after another (sorted by ``sale_high``,
    90 items per page) until enough links have been gathered or a page
    contains no products.

    Args:
        num: Category code interpolated into the listing URL.
        items_count: Maximum number of links to return. Defaults to the
            module-level ``ITEMS_COUNT``.

    Returns:
        list[str]: Up to ``items_count`` product URLs, in listing order. The
        list is shorter when the category has fewer products than requested.
    """
    limit = ITEMS_COUNT if items_count is None else items_count

    product_links = []
    page = 1
    while len(product_links) < limit:
        url = f'https://www.musinsa.com/categories/item/{num}?d_cat_cd={num}&brand=&list_kind=small&sort=sale_high&sub_sort=1d&page={page}&display_cnt=90&exclusive_yn=&sale_goods=&timesale_yn=&ex_soldout=&plusDeliveryYn=&kids=&color=&price1=&price2=&shoeSizeOption=&tags=&campaign_id=&includeKeywords=&measure='
        soup = get_response(url, headers)

        products = soup.select('a.img-block')
        if not products:
            # No products on this page: the category is exhausted. Without this
            # stop the loop would keep requesting empty pages forever.
            break

        for product in products:
            href = product['href']
            # Only protocol-relative links ("//host/path") need a scheme. A blanket
            # replace of "//" would corrupt links that are already absolute.
            if href.startswith('//'):
                href = 'https:' + href
            product_links.append(href)
            if len(product_links) >= limit:
                break

        page += 1

    return product_links
        


In [8]:
def get_item_info(item_url):
    """Download a product page and decode its embedded product-state JSON.

    The page is expected to contain a ``text/javascript`` script whose last
    statement is ``window.__MSS__.product.state = {...};``. Every such script
    is examined instead of relying on a fixed position in the page, so the
    lookup keeps working when scripts are added or removed around it.

    Args:
        item_url: URL of the product detail page.

    Returns:
        The object decoded from the JSON literal assigned to
        ``window.__MSS__.product.state``.

    Raises:
        ValueError: If no script on the page contains the state assignment,
            or if the captured text is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    soup = get_response(item_url, headers)
    pattern = re.compile(r'window\.__MSS__\.product\.state = ({.*?});\s*$', re.DOTALL)

    for script in soup.find_all('script', {'type': 'text/javascript'}):
        script_text = script.string
        if not script_text:
            # Scripts without inline text (e.g. external ones) have nothing to search.
            continue

        match = pattern.search(script_text)
        if match:
            return json.loads(match.group(1))

    raise ValueError(f'product state not found in page: {item_url}')

In [9]:
# item_url = 'https://www.musinsa.com/app/goods/3175105'
# soup = get_response(item_url, headers)

# info = soup.find_all('script', {'type':'text/javascript'})[15]
# info = info.string

# pattern = re.compile(r'window\.__MSS__\.product\.state = ({.*?});\s*$', re.DOTALL)
# match = pattern.search(info)
# info = match.group(1)

# info = json.loads(info)

In [10]:
# extract_needs_info(info)

In [11]:
def extract_needs_info(item_info):
    """Pick the fields of interest out of a decoded product-state dict.

    Args:
        item_info: Mapping with the keys ``goodsNo``, ``brand``, ``styleNo``,
            ``goodsPrice`` (holding ``originPrice`` and ``minPrice``),
            ``thumbnailImageUrl``, ``goodsImages`` (an iterable of mappings
            with ``imageUrl``) and ``category`` (holding
            ``categoryDepth1Title`` and ``categoryDepth2Title``).

    Returns:
        dict: The selected values keyed by ``goodsNo``, ``brand``,
        ``styleNo``, ``originPrice``, ``memberPrice``,
        ``categoryDepth1Title``, ``categoryDepth2Title`` and ``image_urls``.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    # Musinsa product number
    goodsNo = item_info['goodsNo']

    # Brand name
    brand = item_info['brand']

    # Style (part) number
    styleNo = item_info['styleNo']

    # Musinsa sale price
    originPrice = item_info['goodsPrice']['originPrice']

    # Musinsa member price (read from the 'minPrice' field)
    memberPrice = item_info['goodsPrice']['minPrice']

    # Product images as a list of URL strings: the thumbnail first, then each
    # entry of 'goodsImages'.
    image_urls = [item_info['thumbnailImageUrl']]
    image_urls.extend(goods_image['imageUrl'] for goods_image in item_info['goodsImages'])

    # TODO: cumulative sales, like count, buyer age and gender ratio are not
    # extracted yet.

    # Top-level category
    categoryDepth1Title = item_info['category']['categoryDepth1Title']

    # Mid-level category
    categoryDepth2Title = item_info['category']['categoryDepth2Title']

    return {
        'goodsNo': goodsNo,
        'brand': brand,
        'styleNo': styleNo,
        'originPrice': originPrice,
        'memberPrice': memberPrice,
        'categoryDepth1Title': categoryDepth1Title,
        'categoryDepth2Title': categoryDepth2Title,
        # Previously collected but never returned; appended last so the
        # existing keys keep their order.
        'image_urls': image_urls,
    }




In [12]:
def main():
    """Crawl every configured category and tabulate the product details.

    First gathers product links for each code in ``middle_category_nums``,
    then downloads each product page and extracts the selected fields.
    Progress, every extracted record, and the final table are printed.

    Returns:
        pandas.DataFrame: One row per product that was fetched and parsed
        successfully.
    """
    # The category list repeats some codes; dict.fromkeys drops the repeats
    # while keeping first-seen order, so no category is crawled twice.
    category_nums = list(dict.fromkeys(middle_category_nums))

    all_product_links = []
    seen_links = set()
    for middle_category_num in category_nums:
        for link in get_items(middle_category_num):
            # The same product can be listed under more than one category.
            if link not in seen_links:
                seen_links.add(link)
                all_product_links.append(link)

        # Guard the [-1] lookup: nothing may have been collected yet.
        last_link = all_product_links[-1] if all_product_links else ''
        print(len(all_product_links), last_link)

    items_list = []
    failed_urls = []

    for item_url in all_product_links:
        try:
            item_info = get_item_info(item_url)
            needs_info = extract_needs_info(item_info)
        except (requests.RequestException, LookupError, AttributeError,
                TypeError, ValueError) as error:
            # One broken page must not discard the rest of a long crawl;
            # report it and move on.
            failed_urls.append(item_url)
            print(f'skipped {item_url}: {error!r}')
            continue

        items_list.append(needs_info)
        print(needs_info)

    if failed_urls:
        print(f'{len(failed_urls)} of {len(all_product_links)} products were skipped')

    df = pd.DataFrame(items_list)
    print(df)

    return df
        

In [13]:
# Start the crawl when this code runs as the top-level program.
if __name__ == "__main__":
    main()

100 https://www.musinsa.com/app/goods/3235610
200 https://www.musinsa.com/app/goods/1571500
300 https://www.musinsa.com/app/goods/3623472
400 https://www.musinsa.com/app/goods/3603937
500 https://www.musinsa.com/app/goods/3461175
600 https://www.musinsa.com/app/goods/3353188
700 https://www.musinsa.com/app/goods/3155170
800 https://www.musinsa.com/app/goods/3842009
900 https://www.musinsa.com/app/goods/3064786
1000 https://www.musinsa.com/app/goods/3340212
1100 https://www.musinsa.com/app/goods/3813579
1200 https://www.musinsa.com/app/goods/3574347
1300 https://www.musinsa.com/app/goods/3781918
1400 https://www.musinsa.com/app/goods/3738790
1500 https://www.musinsa.com/app/goods/3814900
1600 https://www.musinsa.com/app/goods/2398613
1700 https://www.musinsa.com/app/goods/1567181
1800 https://www.musinsa.com/app/goods/3832836
1900 https://www.musinsa.com/app/goods/3767140
2000 https://www.musinsa.com/app/goods/3923615
2100 https://www.musinsa.com/app/goods/4017495
2200 https://www.musin