In [83]:
from requests import request
from requests.compat import *
from bs4 import BeautifulSoup
from user_agent import generate_user_agent
import pandas as pd
import math
import time

def makePageDF(category_id, page_id):
    url = 'https://search.shopping.naver.com/search/category/' + str(category_id)
    params = {
        'pagingIndex' : str(page_id),
        'pagingSize' : '80',
        'productSet' : 'model'
    }
    headers = {'User-Agent': 'Yeti',}
    resp = request('GET',url=url, params=params, headers=headers)
    print(resp.request.url)
    dom = BeautifulSoup(resp.text, 'html.parser')
    script_tag = dom.find("script", {"id": "__NEXT_DATA__"})
    data = json.loads(script_tag.text)
    itemList = data['props']['pageProps']['initialState']['products']
    
    
    # 각 상품의 정보를 저장할 리스트를 생성합니다.
    products_info = []

    for item in itemList['list'][:80]:

        attributes_dict = dict()
        # 'attributeValue'와 'characterValue' 문자열을 리스트로 분리합니다.
        attribute_values = item['item'].get('attributeValue','').split('|')
        character_values = item['item'].get('characterValue','').split('|')

        # 'attributeValue' 리스트의 각 요소에서 '_M' 문자열을 제거합니다.
        attribute_values = [value.replace('_M', '') for value in attribute_values]

        min_len = min(len(attribute_values), len(character_values))

        for j in range(min_len):
            attribute = attribute_values[j]
            character = character_values[j]

            if attribute in attributes_dict:
                attributes_dict[attribute].append(character)
            else:
                attributes_dict[attribute] = [character]

        attribute_list = [item['item']['category3Name']]

        for i in attributes_dict:
            if i not in ['용량', '구성', '']:
                attribute_list = attribute_list + attributes_dict[i]
            else:
                continue

        dict_data = {
            'ID': item['item']['id'],
            '상품명': item['item']['productName'],
            '상품 카테고리 대분류': item['item']['category1Name'],
            '상품 카테고리 중분류': item['item']['category2Name'],
            '상품 카테고리 소분류': item['item']['category3Name'],
            '제조사': item['item'].get('maker',''),
            '브랜드': item['item'].get('brand',''),
            '특징': attribute_list
        }
        for attribute in attribute_list:
            dict_data[attribute] = True

        products_info.append(dict_data)

    total = data['props']['pageProps']['initialState']['products']['total']
    total_page = math.ceil(total/int(params['pagingSize']))
        
        # 딕셔너리의 리스트를 데이터프레임으로 변환합니다.
    df = pd.DataFrame(products_info)
    
    return df,total_page

def makeCategoryDF(category_id):

    
    df,total_page = makePageDF(category_id,1)
        
    if total_page >1:
        for i in range(2,total_page+1):
            new_df, _ = makePageDF(category_id,i)
            time.sleep(0.3)
            df = pd.concat([df, new_df], ignore_index=True)
    
    df = df.drop_duplicates(subset=['ID'])

    df = pd.DataFrame(df).astype({'제조사': 'category'})
    df = pd.DataFrame(df).astype({'브랜드': 'category'})

    attribute_columns = df.columns.drop(['ID', '상품명', '상품 카테고리 대분류', '상품 카테고리 중분류','상품 카테고리 소분류','제조사','브랜드','특징'])
    df[attribute_columns] = df[attribute_columns].fillna(False).astype('bool')
    
    small_df = df.explode('특징')[['ID','상품명','특징']]
    attributes = pd.DataFrame(small_df['특징'].unique()).reset_index()
    attributes.columns = ['attribute_id', '특징']
    attributes = pd.DataFrame(attributes).astype({'attribute_id': 'category'})
    merged_df = pd.merge(small_df, attributes, on=['특징'])
    
    return df, merged_df, attributes

In [84]:
df, merged_df, attributes = makeCategoryDF(100001011)

https://search.shopping.naver.com/search/category/100001011?pagingIndex=1&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=2&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=3&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=4&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=5&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=6&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=7&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=8&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=9&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/10000

In [28]:
df

Unnamed: 0,ID,상품명,상품 카테고리 대분류,상품 카테고리 중분류,상품 카테고리 소분류,제조사,브랜드,특징,샴푸,천연샴푸,...,임산부겸용,보색샴푸,쿨샴푸,쉐도우형,무스형,스틱형,염색 모발용,"시트,티슈형",브러쉬형,파우더형
0,28269721554,검은콩 딥 스케일링 나이트 샴푸 로즈마리향 400ml,화장품/미용,헤어케어,샴푸,동방코스메틱,올솔,"[샴푸, 천연샴푸, 비듬샴푸, 두피샴푸, 한방샴푸, 퍼퓸샴푸, 펌프형, 저자극, 촉...",True,True,...,False,False,False,False,False,False,False,False,False,False
1,28269744554,검은콩 보습 진정 모닝 샴푸 로즈마리향 400ml,화장품/미용,헤어케어,샴푸,동방코스메틱,올솔,"[샴푸, 모든두피용, 모든 모발용, 일반샴푸, 펌프형, 머릿결개선, 세정력, 풍부한...",True,False,...,False,False,False,False,False,False,False,False,False,False
2,29502248618,어글리솝 더 님 민감성용 97g,화장품/미용,헤어케어,샴푸,프롬더랜드,프롬더랜드,"[샴푸, 일반샴푸, 머릿결개선, 세정력, 풍부한 거품, 저자극, 촉촉함(수분공급),...",True,False,...,False,False,False,False,False,False,False,False,False,False
3,20991784508,녹차실감 산뜻한 타입 지성 모발용 샴푸 500g,화장품/미용,헤어케어,샴푸,아모스,아모스,"[샴푸, 지성, 두피샴푸, 펌프형, 세정력, 상쾌함, 저자극, 촉촉함(수분공급), ...",True,False,...,False,False,False,False,False,False,False,False,False,False
4,13023412774,EM 발효 다시마 천연샴푸 500ml,화장품/미용,헤어케어,샴푸,이엔에스코리아,청미정,"[샴푸, 모든두피용, 모든 모발용, 천연샴푸, 두피샴푸, 펌프형, 세정력, 풍부한 ...",True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11068,34388124618,딥 클렌징 샴푸 250ml,화장품/미용,헤어케어,샴푸,뉴샤,뉴샤,"[샴푸, 일반샴푸, 로션형, 머릿결개선, 세정력, 풍부한 거품, 상쾌함, 저자극, ...",True,False,...,False,False,False,False,False,False,False,False,False,False
11073,34697465621,마일드 워시 스칼프 샴푸 300ml(리필),화장품/미용,헤어케어,샴푸,네이처랩,마로17,"[샴푸, 일반샴푸, 세정력, 풍부한 거품]",True,False,...,False,False,False,False,False,False,False,False,False,False
11083,34857233618,샴푸 500ml,화장품/미용,헤어케어,샴푸,UNKNOWN,지파동수,"[샴푸, 일반샴푸]",True,False,...,False,False,False,False,False,False,False,False,False,False
11122,14925376275,헤나 샴푸 400ml,화장품/미용,헤어케어,샴푸,,더수자타,"[샴푸, 촉촉함(수분공급)]",True,False,...,False,False,False,False,False,False,False,False,False,False


In [29]:
merged_df

Unnamed: 0,ID,상품명,특징,attribute_id
0,28269721554,검은콩 딥 스케일링 나이트 샴푸 로즈마리향 400ml,샴푸,0
1,28269744554,검은콩 보습 진정 모닝 샴푸 로즈마리향 400ml,샴푸,0
2,29502248618,어글리솝 더 님 민감성용 97g,샴푸,0
3,20991784508,녹차실감 산뜻한 타입 지성 모발용 샴푸 500g,샴푸,0
4,13023412774,EM 발효 다시마 천연샴푸 500ml,샴푸,0
...,...,...,...,...
46183,40330480626,세린 스칼프 오일 컨트롤 샴푸 파우더 45g,파우더형,85
46184,32437168621,노 드라우트 드라이 샴푸 115g,파우더형,85
46185,35359117623,스템루텐스 퍼퓸 샴푸 스프링 오브 프리지아 500ml,파우더형,85
46186,39512968626,탭 시크릿 7g,파우더형,85


In [30]:
attributes

Unnamed: 0,attribute_id,특징
0,0,샴푸
1,1,천연샴푸
2,2,비듬샴푸
3,3,두피샴푸
4,4,한방샴푸
...,...,...
81,81,스틱형
82,82,염색 모발용
83,83,"시트,티슈형"
84,84,브러쉬형


In [85]:
df.index[-1],len(df),df.index[-1]-len(df)

(5635, 5516, 119)

In [77]:
def makePageDF(category_id, page_id, pagingsize):
    url = 'https://search.shopping.naver.com/search/category/' + str(category_id)
    params = {
        'pagingIndex' : str(page_id),
        'pagingSize' : str(pagingsize),
        'productSet' : 'model'
    }
    headers = {'User-Agent': 'Yeti',}
    resp = request('GET',url=url, params=params, headers=headers)
    print(resp.request.url)
    dom = BeautifulSoup(resp.text, 'html.parser')
    script_tag = dom.find("script", {"id": "__NEXT_DATA__"})
    data = json.loads(script_tag.text)
    itemList = data['props']['pageProps']['initialState']['products']
    
    # 각 상품의 정보를 저장할 리스트를 생성합니다.
    products_info = []

    for item in itemList['list'][:pagingsize]:

        attributes_dict = dict()
        # 'attributeValue'와 'characterValue' 문자열을 리스트로 분리합니다.
        attribute_values = item['item'].get('attributeValue','').split('|')
        character_values = item['item'].get('characterValue','').split('|')

        # 'attributeValue' 리스트의 각 요소에서 '_M' 문자열을 제거합니다.
        attribute_values = [value.replace('_M', '') for value in attribute_values]

        min_len = min(len(attribute_values), len(character_values))

        for j in range(min_len):
            attribute = attribute_values[j]
            character = character_values[j]

            if attribute in attributes_dict:
                attributes_dict[attribute].append(character)
            else:
                attributes_dict[attribute] = [character]

        attribute_list = [item['item']['category3Name']]

        for i in attributes_dict:
            if i not in ['용량', '구성', '']:
                attribute_list = attribute_list + attributes_dict[i]
            else:
                continue

        dict_data = {
            'ID': item['item']['id'],
            '상품명': item['item']['productName'],
            '상품 카테고리 대분류': item['item']['category1Name'],
            '상품 카테고리 중분류': item['item']['category2Name'],
            '상품 카테고리 소분류': item['item']['category3Name'],
            '제조사': item['item'].get('maker',''),
            '브랜드': item['item'].get('brand',''),
            '특징': attribute_list
        }
        for attribute in attribute_list:
            dict_data[attribute] = True

        products_info.append(dict_data)

    total = data['props']['pageProps']['initialState']['products']['total']
    total_page = math.ceil(total/pagingsize)

        # 딕셔너리의 리스트를 데이터프레임으로 변환합니다.
    df = pd.DataFrame(products_info)
    
    return df,total_page

def makeCategoryDF(category_id):    
    df = pd.DataFrame()
    
    for j in [20,40,60,80]:
        df,total_page = makePageDF(category_id,1,j)
        for i in range(2,total_page+1):
            new_df,_ = makePageDF(category_id,i,j)
            time.sleep(0.5)
            df = pd.concat([df, new_df], ignore_index=True)
    
    df = df.drop_duplicates(subset=['ID'])
    df = pd.DataFrame(df).astype({'제조사': 'category'})
    df = pd.DataFrame(df).astype({'브랜드': 'category'})
    attribute_columns = df.columns.drop(['ID', '상품명', '상품 카테고리 대분류', '상품 카테고리 중분류','상품 카테고리 소분류','제조사','브랜드','특징'])
    df[attribute_columns] = df[attribute_columns].fillna(False).astype('bool')
    small_df = df.explode('특징')[['ID','상품명','특징']]
    attributes = pd.DataFrame(small_df['특징'].unique()).reset_index()
    attributes.columns = ['attribute_id', '특징']
    attributes = pd.DataFrame(attributes).astype({'attribute_id': 'category'})
    merged_df = pd.merge(small_df, attributes, on=['특징'])
    
    return df, merged_df, attributes

In [78]:
df, merged_df, attributes = makeCategoryDF(100001011)

https://search.shopping.naver.com/search/category/100001011?pagingIndex=1&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=2&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=3&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=4&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=5&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=6&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=7&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=8&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=9&pagingSize=20&productSet=model
https://search.shopping.naver.com/search/category/10000

In [79]:
df.index[-1],len(df),df.index[-1]-len(df)

(5634, 5562, 72)

In [80]:
def makePageDF(category_id, page_id, pagingsize):
    url = 'https://search.shopping.naver.com/search/category/' + str(category_id)
    params = {
        'pagingIndex' : str(page_id),
        'pagingSize' : str(pagingsize),
        'productSet' : 'model'
    }
    headers = {'User-Agent': 'Yeti',}
    resp = request('GET',url=url, params=params, headers=headers)
    print(resp.request.url)
    dom = BeautifulSoup(resp.text, 'html.parser')
    script_tag = dom.find("script", {"id": "__NEXT_DATA__"})
    data = json.loads(script_tag.text)
    itemList = data['props']['pageProps']['initialState']['products']
    
    # 각 상품의 정보를 저장할 리스트를 생성합니다.
    products_info = []

    for item in itemList['list'][:pagingsize]:

        attributes_dict = dict()
        # 'attributeValue'와 'characterValue' 문자열을 리스트로 분리합니다.
        attribute_values = item['item'].get('attributeValue','').split('|')
        character_values = item['item'].get('characterValue','').split('|')

        # 'attributeValue' 리스트의 각 요소에서 '_M' 문자열을 제거합니다.
        attribute_values = [value.replace('_M', '') for value in attribute_values]

        min_len = min(len(attribute_values), len(character_values))

        for j in range(min_len):
            attribute = attribute_values[j]
            character = character_values[j]

            if attribute in attributes_dict:
                attributes_dict[attribute].append(character)
            else:
                attributes_dict[attribute] = [character]

        attribute_list = [item['item']['category3Name']]

        for i in attributes_dict:
            if i not in ['용량', '구성', '']:
                attribute_list = attribute_list + attributes_dict[i]
            else:
                continue

        dict_data = {
            'ID': item['item']['id'],
            '상품명': item['item']['productName'],
            '상품 카테고리 대분류': item['item']['category1Name'],
            '상품 카테고리 중분류': item['item']['category2Name'],
            '상품 카테고리 소분류': item['item']['category3Name'],
            '제조사': item['item'].get('maker',''),
            '브랜드': item['item'].get('brand',''),
            '특징': attribute_list
        }
        for attribute in attribute_list:
            dict_data[attribute] = True

        products_info.append(dict_data)

    total = data['props']['pageProps']['initialState']['products']['total']
    total_page = math.ceil(total/pagingsize)

        # 딕셔너리의 리스트를 데이터프레임으로 변환합니다.
    df = pd.DataFrame(products_info)
    
    return df,total_page

def makeCategoryDF(category_id):    
    df = pd.DataFrame()
    
    for j in [60,80]:
        df,total_page = makePageDF(category_id,1,j)
        for i in range(2,total_page+1):
            new_df,_ = makePageDF(category_id,i,j)
            time.sleep(0.3)
            df = pd.concat([df, new_df], ignore_index=True)
    
    df = df.drop_duplicates(subset=['ID'])
    df = pd.DataFrame(df).astype({'제조사': 'category'})
    df = pd.DataFrame(df).astype({'브랜드': 'category'})
    attribute_columns = df.columns.drop(['ID', '상품명', '상품 카테고리 대분류', '상품 카테고리 중분류','상품 카테고리 소분류','제조사','브랜드','특징'])
    df[attribute_columns] = df[attribute_columns].fillna(False).astype('bool')
    small_df = df.explode('특징')[['ID','상품명','특징']]
    attributes = pd.DataFrame(small_df['특징'].unique()).reset_index()
    attributes.columns = ['attribute_id', '특징']
    attributes = pd.DataFrame(attributes).astype({'attribute_id': 'category'})
    merged_df = pd.merge(small_df, attributes, on=['특징'])
    
    return df, merged_df, attributes

In [81]:
df, merged_df, attributes = makeCategoryDF(100001011)

https://search.shopping.naver.com/search/category/100001011?pagingIndex=1&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=2&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=3&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=4&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=5&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=6&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=7&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=8&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/100001011?pagingIndex=9&pagingSize=60&productSet=model
https://search.shopping.naver.com/search/category/10000

In [82]:
df.index[-1],len(df),df.index[-1]-len(df)

(5634, 5556, 78)

In [115]:
from urllib.robotparser import RobotFileParser
robot = RobotFileParser('https://shopping.naver.com/robots.txt') # robots.txt 파일을 파싱해서 크롤링 여부를 판단해줌
robot.read()
robot.can_fetch('Yeti','/search/category')

True

# 카테고리 확인

In [117]:
from pprint import pprint
url = 'https://search.shopping.naver.com/search/category/100000003'
params = {
    'pagingIndex' : '',
    'pagingSize' : '80',
    'productSet' : 'model'
}
headers = {'User-Agent': 'Yeti',}
resp = request('GET',url=url, headers=headers)
print(resp.request.url)

dom = BeautifulSoup(resp.text, 'html.parser')
script_tag = dom.find("script", {"id": "__NEXT_DATA__"})
data = json.loads(script_tag.text)
itemList = data['props']['pageProps']['initialState']['products']

#print(dom.prettify())
pprint(data)

https://search.shopping.naver.com/search/category/100000003
{'assetPrefix': 'https://ssl.pstatic.net/shoppingsearch/static/pc/pc-230803-112911',
 'buildId': 'p2RdUPor_kjbn0ilNfl8D',
 'dynamicIds': [91117, 53502],
 'gssp': True,
 'isFallback': False,
 'page': '/search/category/[categoryId]',
 'props': {'__N_SSP': True,
           'pageProps': {'banners': [{'dataSeq': '1',
                                      'imageUrl': 'https://ssl.pstatic.net/shoppingsearch/static/icons/grocery/grocery_normal_banner_pc.png',
                                      'mallSeq': '99538',
                                      'modifyDate': '',
                                      'newWindow': True,
                                      'targetUrl': 'https://nid.naver.com/membership/join',
                                      'templateSeq': '1',
                                      'title': '멤버십 회원이라면, 장보기 최대 10% 적립'}],
                         'bucketId': '',
                         'cartProductCount': 

In [108]:
title_value_pairs = [{'title': x['title'], 'value': x['value']} for x in data['props']['pageProps']['initialState']['mainFilters'][0]['filterValues']]
category_df = pd.DataFrame(title_value_pairs)
category_df

Unnamed: 0,title,value
0,서머뷰티,100008810
1,비건,100000926
2,스킨케어,100000913
3,바디케어,100000920
4,헤어케어,100000921
5,향수,100000923
6,핸드케어,100001000
7,클렌징,100000915
8,베이스메이크업,100000917
9,뷰티소품,100000924


In [129]:
from pprint import pprint
url = 'https://search.shopping.naver.com/search/category/'
params = {
    'pagingIndex' : '1',
    'pagingSize' : '80',
    'productSet' : 'model'
}
headers = {'User-Agent': 'Yeti',}

for index, row in category_df.iterrows():
    category_url = url + str(row['value'])
    resp = request('GET',url=category_url,params=params, headers=headers)
    print(resp.request.url)
    dom = BeautifulSoup(resp.text, 'html.parser')
    script_tag = dom.find("script", {"id": "__NEXT_DATA__"})
    data = json.loads(script_tag.text)
    product_count = data['props']['pageProps']['initialState']['subFilters'][0]['filterValues'][1]['productCount']
    if(product_count/int(params['pagingSize'])>100):
        print(row['title'],row['value'],product_count)

#모시우디 는 가격비교가 0!

https://search.shopping.naver.com/search/category/100008810?pagingIndex=1&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100000926?pagingIndex=1&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100000913?pagingIndex=1&pagingSize=80&productSet=model
스킨케어 100000913 34774
https://search.shopping.naver.com/search/category/100000920?pagingIndex=1&pagingSize=80&productSet=model
바디케어 100000920 23068
https://search.shopping.naver.com/search/category/100000921?pagingIndex=1&pagingSize=80&productSet=model
헤어케어 100000921 15763
https://search.shopping.naver.com/search/category/100000923?pagingIndex=1&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100001000?pagingIndex=1&pagingSize=80&productSet=model
https://search.shopping.naver.com/search/category/100000915?pagingIndex=1&pagingSize=80&productSet=model
클렌징 100000915 10062
https://search.shopping.naver.com/search/category/100000917?pagingIndex=1&pag

IndexError: list index out of range

In [135]:
url='https://search.shopping.naver.com/search/category/100008881?pagingIndex=1&pagingSize=80&productSet=model'
resp = request('GET',url=url,headers=headers)
print(resp.request.url)
dom = BeautifulSoup(resp.text, 'html.parser')
script_tag = dom.find("script", {"id": "__NEXT_DATA__"})
data = json.loads(script_tag.text)
pprint(data)

https://search.shopping.naver.com/search/category/100008881?pagingIndex=1&pagingSize=80&productSet=model
{'assetPrefix': 'https://ssl.pstatic.net/shoppingsearch/static/pc/pc-230803-112911',
 'buildId': 'p2RdUPor_kjbn0ilNfl8D',
 'dynamicIds': [53502],
 'gssp': True,
 'isFallback': False,
 'page': '/search/category/[categoryId]',
 'props': {'__N_SSP': True,
           'pageProps': {'banners': None,
                         'bucketId': '',
                         'cartProductCount': 0,
                         'dateStatus': {'date': '2023-08-08T04:00:31.493Z',
                                        'dayOfWeek': '화',
                                        'dayOfWeekIndex': 2,
                                        'isHoliday': False,
                                        'isWeekend': False},
                         'deliveryAddress': {'addressName': '',
                                             'baseAddress': '서울특별시 강남구 테헤란로 '
                                                     