In [1]:
import time
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json
import datetime


ITEMS_COUNT = 10

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    
}

timeout_settings = (60, 60)

In [3]:
def get_response(url, headers):

    # GET 요청
    response = requests.get(url, headers=headers, timeout=timeout_settings)
    soup = BeautifulSoup(response.text, 'lxml')
    time.sleep(0.5)

    return soup


In [4]:
def get_small_categories():
    url = 'https://pcw.thehandsome.com/api/display/1/ko/category/categoryList?dispCtgNo=mainDispCtgNo'

    soup = get_response(url, headers)
    info = soup.string
    categories = json.loads(info)['payload']['lrgCtgList']
    category_index = []
    gender_type = ['여성', '남성']
    for i, gender in enumerate(gender_type):
        for j in range(1, len(categories[i]['midCtgList'])):
            middle_category_num = categories[i]['midCtgList'][j]['dispCtgNo']
            middle_category_name = categories[i]['midCtgList'][j]['dispCtgNm']

            if j == len(categories[i]['midCtgList']) - 1:
                start_num = 0
            else:
                start_num = 1
                
            for k in range(start_num, len(categories[i]['midCtgList'][j]['smlCtgList'])):
                small_category_num = categories[i]['midCtgList'][j]['smlCtgList'][k]['dispCtgNo']
                small_category_name = categories[i]['midCtgList'][j]['smlCtgList'][k]['dispCtgNm']
                category_index.append({'top': gender,'middle': [middle_category_num, middle_category_name], 'small': [small_category_num, small_category_name]})

    return category_index


In [5]:
test = get_small_categories()

In [6]:
test

[{'top': '여성', 'middle': ['367', '아우터'], 'small': ['388', '자켓']},
 {'top': '여성', 'middle': ['367', '아우터'], 'small': ['461', '점퍼']},
 {'top': '여성', 'middle': ['367', '아우터'], 'small': ['5008', '트렌치 코트']},
 {'top': '여성', 'middle': ['367', '아우터'], 'small': ['2078', '가디건/베스트']},
 {'top': '여성', 'middle': ['367', '아우터'], 'small': ['503', '코트']},
 {'top': '여성', 'middle': ['367', '아우터'], 'small': ['513', '다운/패딩']},
 {'top': '여성', 'middle': ['596', '탑'], 'small': ['972', '티셔츠']},
 {'top': '여성', 'middle': ['596', '탑'], 'small': ['5001', '스웻셔츠']},
 {'top': '여성', 'middle': ['596', '탑'], 'small': ['971', '블라우스']},
 {'top': '여성', 'middle': ['596', '탑'], 'small': ['970', '셔츠']},
 {'top': '여성', 'middle': ['596', '탑'], 'small': ['5002', '니트']},
 {'top': '여성', 'middle': ['2074', '니트'], 'small': ['2082', '탑']},
 {'top': '여성', 'middle': ['2074', '니트'], 'small': ['2081', '가디건/베스트']},
 {'top': '여성', 'middle': ['2074', '니트'], 'small': ['14984', '팬츠']},
 {'top': '여성', 'middle': ['2074', '니트'], 'small': ['14994

In [7]:
def get_item_info(goods):
    # 상품 id
    goodsNo = goods['goodsNo']

    # 브랜드 명
    brandNm = goods['brandNm']

    # 정가
    norPrc = goods['norPrc']

    # 할인가
    salePrc = goods['salePrc']

    # 이미지    
    image_urls = []      
    try:    
        for i in range(len(goods['colorInfo'])):
            for j in range(len(goods['colorInfo'][i]['colorContInfo'])):
                check = goods['colorInfo'][i]['colorContInfo'][j]['imgGbCd']
                if 'P' in check:
                    colorContInfo = goods['colorInfo'][i]['colorContInfo'][j]['dispGoodsContUrl']
                    image_urls.append(colorContInfo)
    except:
        pass

    # 색상 정보
    colors = []
    try:
        for color in goods['colorInfo']:
            colors.append(color['optnNm'])
    except:
        pass

    # 사이즈 정보
    sizes = []
    for i in range(len(goods['colorInfo'][0]['colorSizeInfo'])):
        size_info = goods['colorInfo'][0]['colorSizeInfo'][i]['erpSzCd']
        sizes.append(size_info)


    data = {
        'product_id': goodsNo,
        'brand': brandNm,
        'fixed_price': norPrc,
        'discounted_price': salePrc,
        'url': image_urls,
        'colors': colors,
        'sizes': sizes,
    }

    return data

In [8]:
def get_rank_score(ranking, item_count):
    try:
        rank_score = 1 - ((ranking - 1) / (item_count - 1))
    except:
        rank_score = 1

    return rank_score

In [9]:
def get_items(index):
    small_category_num = index['small'][0]

    url = f'https://www.thehandsome.com/api/display/1/ko/category/categoryGoodsList?dispMediaCd=10&sortGbn=20&pageSize={ITEMS_COUNT}&pageNo=1&norOutletGbCd=J&dispCtgNo={small_category_num}&productListLayout=4&theditedYn=N'

    goods_info = []

    soup = get_response(url, headers)
    info = soup.string
    goods_in_page = json.loads(info)['payload']['goodsList']

    for rank, goods in enumerate(goods_in_page, 1):
        
        goods_data = get_item_info(goods)
        goods_data['rank_score'] = get_rank_score(int(rank), len(goods_in_page))
        goods_data['top'] = index['top']
        goods_data['middle'] = index['middle'][1]
        goods_data['small'] = index['small'][1]
        goods_info.append(goods_data)
        print(len(goods_info), goods_data)


    return goods_info

In [10]:
def get_additional_info(product_id):
    url = f'https://pcw.thehandsome.com/ko/PM/productDetail/{product_id}?itmNo=003'

    soup = get_response(url, headers=headers)

    # 브랜드2
    brand2 = soup.select('ul.prd-category > li > a')[0].text.strip()

    # 상품 설명
    try:
        product_info = soup.select('div.prd-desc-box')[0].text
    except:
        product_info = None

    # 피팅 정보
    try:
        fitting_info = soup.select('p.cmp-font')[0].text
    except:
        fitting_info = None

    # 상품 추가 정보
    try:
        additional_infos = soup.select('ul.cmp-list.list-dotType2.bottom6')
        additional_info_processed = []
        
        for info in additional_infos:
            additional_info_processed.append(info.text)
    except:
        additional_info_processed = None

    data = {
        'product_id': product_id,
        'product_info': product_info,
        'fitting_info': fitting_info,
        'additional_info': additional_info_processed,
        'brand2': brand2
        
    }
    
    return data

In [11]:
def main():
    category_index = get_small_categories()
    all_info = []
    for index in category_index:
        small_category_info = get_items(index)
        all_info += small_category_info

    id_list = [item['product_id'] for item in all_info]

    df_all_info = pd.DataFrame(all_info)
    
    all_additional_info = []
    for product_id in id_list:
        additional_info = get_additional_info(product_id)
        all_additional_info.append(additional_info)
        print(additional_info)

    df_all_additional_info = pd.DataFrame(all_additional_info)

    df_merged = pd.merge(df_all_info, df_all_additional_info, on='product_id', how='left')

    today = datetime.date.today()
    formatted_date = today.strftime('%Y%m%d')

    df_merged.to_csv(f'handsome_{formatted_date}.csv', index=False)


In [12]:
if __name__ == '__main__':
    main()  

1 {'product_id': 'MN2E5WJC483W', 'brand': 'MINE', 'fixed_price': 595000, 'discounted_price': 595000, 'url': ['/MN/2E/SS/MN2E5WJC483W_NL_W01.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W02.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W03.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W04.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W05.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W06.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W07.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W08.jpg?rs=684X1032', '/MN/2E/SS/MN2E5WJC483W_NL_W09.jpg?rs=684X1032'], 'colors': ['NEUTRAL'], 'sizes': ['76', '82', '88'], 'rank_score': 1.0, 'top': '여성', 'middle': '아우터', 'small': '자켓'}
2 {'product_id': 'SJ2E4WJC232W', 'brand': 'SJSJ', 'fixed_price': 475000, 'discounted_price': 475000, 'url': ['/SJ/2E/SS/SJ2E4WJC232W_DN_W01.jpg?rs=684X1032', '/SJ/2E/SS/SJ2E4WJC232W_DN_W02.jpg?rs=684X1032', '/SJ/2E/SS/SJ2E4WJC232W_DN_W03.jpg?rs=684X1032', '/SJ/2E/SS/SJ2E4WJC232W_DN_W04.jpg?rs=684X1032', '/SJ/2E/SS/SJ2E4WJC232W_DN_W

In [13]:
df = pd.read_csv("handsome_0504.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'handsome_0504.csv'

In [None]:
len(df)

5628

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5628 entries, 0 to 5627
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_id        5628 non-null   object 
 1   brand             5628 non-null   object 
 2   fixed_price       5628 non-null   int64  
 3   discounted_price  5628 non-null   int64  
 4   url               5628 non-null   object 
 5   colors            5628 non-null   object 
 6   sizes             5628 non-null   object 
 7   rank_score        5628 non-null   float64
 8   top               5628 non-null   object 
 9   middle            5628 non-null   object 
 10  small             5628 non-null   object 
 11  product_info      5624 non-null   object 
 12  fitting_info      5413 non-null   object 
 13  additional_info   5628 non-null   object 
dtypes: float64(1), int64(2), object(11)
memory usage: 615.7+ KB


In [None]:
df[(df['product_info'].isnull())]

Unnamed: 0,product_id,brand,fixed_price,discounted_price,url,colors,sizes,rank_score,top,middle,small,product_info,fitting_info,additional_info
4098,TH2D9WCT410N,TIME HOMME,1050000,1050000,"['/TH/2D/FW/TH2D9WCT410N_DN_C01.jpg', '/TH/2D/...","['DARK NAVY', 'DARK GRAY KHAKI', 'BLACK']","['95', '100', '110']",0.1,남성,아우터,코트,,"185cm, 100사이즈 모델, 3 사이즈 착용186cm, 105 사이즈 모델이 1...","['램스 울과 캐시미어 혼방 소재', '앞면에 버튼 클로징', '양옆에 포켓', '..."
4099,TH2D9WCT410N,TIME HOMME,1050000,1050000,"['/TH/2D/FW/TH2D9WCT410N_DN_C01.jpg', '/TH/2D/...","['DARK NAVY', 'DARK GRAY KHAKI', 'BLACK']","['95', '100', '110']",0.1,남성,아우터,코트,,"185cm, 100사이즈 모델, 3 사이즈 착용186cm, 105 사이즈 모델이 1...","['램스 울과 캐시미어 혼방 소재', '앞면에 버튼 클로징', '양옆에 포켓', '..."
5575,TH2D9WCT410N,TIME HOMME,1050000,1050000,"['/TH/2D/FW/TH2D9WCT410N_DN_C01.jpg', '/TH/2D/...","['DARK NAVY', 'DARK GRAY KHAKI', 'BLACK']","['95', '100', '110']",0.6875,남성,캡슐 컬렉션*,TIME HOMME : 23 WINTER EXCLUSIVE,,"185cm, 100사이즈 모델, 3 사이즈 착용186cm, 105 사이즈 모델이 1...","['램스 울과 캐시미어 혼방 소재', '앞면에 버튼 클로징', '양옆에 포켓', '..."
5576,TH2D9WCT410N,TIME HOMME,1050000,1050000,"['/TH/2D/FW/TH2D9WCT410N_DN_C01.jpg', '/TH/2D/...","['DARK NAVY', 'DARK GRAY KHAKI', 'BLACK']","['95', '100', '110']",0.6875,남성,캡슐 컬렉션*,TIME HOMME : 23 WINTER EXCLUSIVE,,"185cm, 100사이즈 모델, 3 사이즈 착용186cm, 105 사이즈 모델이 1...","['램스 울과 캐시미어 혼방 소재', '앞면에 버튼 클로징', '양옆에 포켓', '..."


In [None]:
df['product_id'].value_counts()

TM2E5KCDC81WP1    9
MN2E4KCD761WM     9
CM2E4KOT531W      9
CM2E3KCD917W      9
TH2D8KVT031N      9
                 ..
MN2E0WSC902W      1
O22E3WSC926W      1
O22E4WSC931W      1
OB2E1WSC405WP2    1
TH2D9WSH717NM1    1
Name: product_id, Length: 3267, dtype: int64

In [None]:
df[df['product_id'] == 'TH2D9WSH717NM1']

Unnamed: 0,product_id,brand,fixed_price,discounted_price,url,colors,sizes,rank_score,top,middle,small,product_info,fitting_info,additional_info
5623,TH2D9WSH717NM1,TIME HOMME,325000,325000,"['/TH/2D/FW/TH2D9WSH717NM1_BG_C01.jpg', '/TH/2...",['BEIGE'],"['95', '100', '105']",0.058824,남성,캡슐 컬렉션*,TIME HOMME : 23 FALL EXCLUSIVE,부드럽고 입체감 있는 100% 면 소재와 앞뒤 기장이 다른 언밸런스 헴 디자인이 어...,"186cm, 105 사이즈 모델이 105 사이즈 착용","['100% 면 소재', '앞면에 버튼 클로징', '소맷단 버튼 여밈', '언밸런스..."
