In [14]:
import time
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json
import re
import sys

In [15]:
ITEMS_COUNT = 100

In [16]:
middle_category_nums = ['001006', '001004', '001005', '001010', '001002', '001003',
                        '001001', '001011', '001013', '001008', '002022', '002001',
                        '002002', '002025', '002017', '002003', '002020', '002019',
                        '002023', '002018', '002004', '002008', '002007', '002024',
                        '002009', '002013', '002012', '002016', '002021', '002014',
                        '002006', '002015', '003002', '003007', '003008', '003004',
                        '003009', '003005', '003010', '003011', '003006', '020006',
                        '020007', '020008', '022001', '022002', '022003']

In [17]:
len(middle_category_nums)

47

In [18]:
# 사용자 에이전트
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
}

timeout_settings = (60, 60)

In [19]:
def get_response(url, headers):

    # GET 요청
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    time.sleep(0.5)

    return soup


In [20]:
import numpy as np

def get_items(num):
    
    url = f'https://www.musinsa.com/categories/item/{num}?d_cat_cd={num}&brand=&list_kind=small&sort=sale_high&sub_sort=1d&page=1&display_cnt=90&exclusive_yn=&sale_goods=&timesale_yn=&ex_soldout=&plusDeliveryYn=&kids=&color=&price1=&price2=&shoeSizeOption=&tags=&campaign_id=&includeKeywords=&measure='
    
    product_links = []
    flag = 0
    page = 1
    soup = get_response(url, headers)
    total_page = int(soup.select('span.totalPagingNum')[0].text)
    while flag == 0:
        
        soup = get_response(url, headers)

        products = soup.select('a.img-block')
        
        for product in products:
            product_links.append(product['href'].replace('//', 'https://'))  # URL만 추가
            
            if len(product_links) == ITEMS_COUNT:
                flag = 1
                break

        page += 1
        
        if page > total_page:
            break

        url = f'https://www.musinsa.com/categories/item/{num}?d_cat_cd={num}&brand=&list_kind=small&sort=sale_high&sub_sort=1d&page={page}&display_cnt=90&exclusive_yn=&sale_goods=&timesale_yn=&ex_soldout=&plusDeliveryYn=&kids=&color=&price1=&price2=&shoeSizeOption=&tags=&campaign_id=&includeKeywords=&measure='

    product_links = np.array(product_links)

    rankings = np.arange(1, len(product_links) + 1)
    total_items_count = np.full(len(product_links), len(product_links))

    product_links = np.column_stack((product_links, rankings, total_items_count))

    return product_links.tolist() 


In [21]:
def get_item_info(item_url):
    soup = get_response(item_url, headers)
    try:
        info = soup.find_all('script', {'type':'text/javascript'})[15]
    except:
        return
    info = info.string

    pattern = re.compile(r'window\.__MSS__\.product\.state = ({.*?});\s*$', re.DOTALL)
    match = pattern.search(info)
    info = match.group(1)
    
    return json.loads(info)

In [22]:
def extract_favorite_num(goodsNo):
    url = 'https://like.musinsa.com/like/api/v2/liketypes/goods/counts'
    data = {"relationIds":[str(goodsNo)]}
    response = requests.post(url, json=data)
    soup = BeautifulSoup(response.text, 'lxml')
    info = soup.string
    favorites = json.loads(info)['data']['contents']['items'][0]['count']

    return favorites

    

In [23]:
def extract_needs_info(item_info):
    # 무신사 상품번호
    product_id = item_info['goodsNo']

    # 대분류
    top = item_info['category']['categoryDepth1Title']

    # 중분류
    middle = item_info['category']['categoryDepth2Title']

    # 브랜드 명명
    brand = item_info['brand']

    # 품번
    product_num = item_info['styleNo']

    # 좋아요 수
    likes = extract_favorite_num(product_id)

    # 무신사 판매가
    fixed_price = item_info['goodsPrice']['originPrice']

    # 무신사 회원가
    discounted_price = item_info['goodsPrice']['minPrice']

    # 상품 이미지 : 리스트에 str로 저장. 필요하면 수정
    image_urls = []
    image_urls.append(item_info['thumbnailImageUrl'])
    goodsImages = item_info['goodsImages']
    for goodsImage in goodsImages:
        image_urls.append(goodsImage['imageUrl'])

    stat_url = f'https://goods-detail.musinsa.com/goods/{product_id}/stat'
    response = requests.get(stat_url, headers=headers, timeout=timeout_settings)
    try:
        add_data = response.json()
    except:
        pass
    
    # 누적 판매
    try:
        cumulative_sales = add_data['data']['purchase']['total']
    except:
        cumulative_sales = None

    # 나이
    try:
        ages = add_data['data']['purchase']['rates']
        ages = {key: f"{value}%" for key, value in ages.items()}
        under_18 = f"{ages['AGE_UNDER_18']}%"
        age_19_to_23 = f"{ages['AGE_19_TO_23']}%"
        age_24_to_28 = f"{ages['AGE_24_TO_28']}%"
        age_29_to_33 = f"{ages['AGE_29_TO_33']}%"
        age_34_to_39 = f"{ages['AGE_34_TO_39']}%"
        over_40 = f"{ages['AGE_OVER_40']}%"
    except:
        under_18, age_19_to_23, age_24_to_28, age_29_to_33, age_34_to_39, over_40 = None, None, None, None, None, None

    # 성비
    try:
        male = int(add_data['data']['purchase']['male'])
        female = int(add_data['data']['purchase']['female'])
        total_count = male + female
        male_percentage = int(round((male / total_count) * 100, -1))
        female_percentage = int(round((female / total_count) * 100, -1))
        male_percentage = f"{male_percentage}%"
        female_percentage = f"{female_percentage}%"
    except:
        male_percentage = None
        female_percentage = None

    # 대분류
    categoryDepth1Title = item_info['category']['categoryDepth1Title']

    # 중분류
    categoryDepth2Title = item_info['category']['categoryDepth2Title']


    data = {
        'product_id': product_id,
        'top': top,
        'middle': middle,
        'brand': brand,
        'product_num': product_num,
        'likes': likes,
        'fixed_price': fixed_price,
        'discounted_price': discounted_price,
        'categoryDepth1Title': categoryDepth1Title,
        'categoryDepth2Title': categoryDepth2Title,
        'url': image_urls,
        'cumulative_sales': cumulative_sales,
        'under_18': under_18,
        '19_to_23': age_19_to_23,
        '24_to_28': age_24_to_28,
        '29_to_33': age_29_to_33,
        '34_to_39': age_34_to_39,
        'over_40': over_40,
        'male_percentage': male_percentage,
        'female_percentage': female_percentage,   
    }

    return data




In [24]:
def get_rank_score(ranking, item_count):
    rank_score = 1 - ((ranking - 1) / (item_count - 1))

    return rank_score

In [25]:
def main():
    all_product_links = []
    
    for middle_category_num in middle_category_nums:
        product_links = get_items(middle_category_num)
        all_product_links += product_links
        print(len(all_product_links), product_links[-1])
    
    items_list = []

    for item_url in all_product_links:
        item_info = get_item_info(item_url[0])
        
        if item_info is None: # get_item_info로 해당 상품 정보 못 가져왔을 때. 
            continue          # 필요에 맞게 수정 필요.

        needs_info = extract_needs_info(item_info)
        needs_info['rank_score'] = get_rank_score(int(item_url[1]) , int(item_url[2]))
        items_list.append(needs_info)
        print(needs_info)
    
    df = pd.DataFrame(items_list)
    print(df)
    df.to_csv('musinsa_0504.csv', index=False)
        

In [26]:
if __name__ == "__main__":
    main()

100 ['https://www.musinsa.com/app/goods/4034801', '100', '100']
200 ['https://www.musinsa.com/app/goods/3247228', '100', '100']
300 ['https://www.musinsa.com/app/goods/2056472', '100', '100']
400 ['https://www.musinsa.com/app/goods/1809545', '100', '100']
500 ['https://www.musinsa.com/app/goods/3432758', '100', '100']
600 ['https://www.musinsa.com/app/goods/2461449', '100', '100']
700 ['https://www.musinsa.com/app/goods/3153755', '100', '100']
800 ['https://www.musinsa.com/app/goods/3157541', '100', '100']
900 ['https://www.musinsa.com/app/goods/3122206', '100', '100']
1000 ['https://www.musinsa.com/app/goods/4053303', '100', '100']
1100 ['https://www.musinsa.com/app/goods/3816863', '100', '100']
1200 ['https://www.musinsa.com/app/goods/3783500', '100', '100']
1300 ['https://www.musinsa.com/app/goods/4046517', '100', '100']
1400 ['https://www.musinsa.com/app/goods/3742880', '100', '100']
1500 ['https://www.musinsa.com/app/goods/3844095', '100', '100']
1600 ['https://www.musinsa.com/app

In [27]:
pd.read_csv('musinsa_0504.csv')

Unnamed: 0,product_id,top,middle,brand,product_num,likes,fixed_price,discounted_price,categoryDepth1Title,categoryDepth2Title,...,cumulative_sales,under_18,19_to_23,24_to_28,29_to_33,34_to_39,over_40,male_percentage,female_percentage,rank_score
0,4071027,상의,니트/스웨터,lafudgestore,LA 2789478,1415,78000,38329,상의,니트/스웨터,...,510.0,1%%,12%%,30%%,27%%,17%%,13%%,70%,30%,1.000000
1,3143788,상의,니트/스웨터,kiimuir,KBCU3KS001,6806,39000,26603,상의,니트/스웨터,...,5575.0,3%%,18%%,28%%,25%%,14%%,12%%,80%,20%,0.989899
2,2474966,상의,니트/스웨터,xtonz,xtk016BK,16760,49000,42292,상의,니트/스웨터,...,6174.0,9%%,33%%,31%%,15%%,3%%,9%%,90%,10%,0.979798
3,4078476,상의,니트/스웨터,unionblue,UNION-KN07GP,1579,79000,61711,상의,니트/스웨터,...,761.0,0%%,4%%,36%%,38%%,17%%,5%%,100%,0%,0.969697
4,3143446,상의,니트/스웨터,kiimuir,KBDUUKS005,6495,49000,33433,상의,니트/스웨터,...,3211.0,4%%,22%%,31%%,22%%,11%%,10%%,90%,10%,0.959596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4695,4014271,스커트,롱스커트,oceanpacific,WOPC6SCSKZ21,15,69000,50860,스커트,롱스커트,...,,,,,,,,,,0.040404
4696,4011404,스커트,롱스커트,dunstforwomen,UDSK4B228I2,21,119000,103893,스커트,롱스커트,...,,,,,,,,,,0.030303
4697,4004707,스커트,롱스커트,enzoblues,242SPLSKT015,3,99000,76131,스커트,롱스커트,...,,,,,,,,,,0.020202
4698,4002604,스커트,롱스커트,chicks,5009911067,429,69800,31154,스커트,롱스커트,...,35.0,0%%,51%%,29%%,2%%,9%%,9%%,0%,100%,0.010101
