In [298]:
import os
import sys
import urllib.request
import json
import pandas as pd
from dotenv import load_dotenv

In [299]:
load_dotenv()
client_id = os.getenv('NAVER_API_CLIENT_ID')
client_secret = os.getenv('NAVER_API_CLIENT_SECRET')

In [300]:
def get_search_url(search_txt, start_pg, disp_num):
    base = 'https://openapi.naver.com/v1/search/shop.json'
    query = '?query=' + urllib.parse.quote(search_txt)
    start = f'&start={start_pg}'
    disp = f'&display={disp_num}'
    url = base + query + disp + start    
    return url 

In [301]:
def get_search_result(url):
    request = urllib.request.Request(url)
    request.add_header('X-Naver-Client-Id', client_id)
    request.add_header('X-Naver-Client-Secret', client_secret)
    response = urllib.request.urlopen(request)
    return json.loads(response.read().decode('utf-8'))

In [302]:
def delete_tag(input_str):
    input_str = input_str.replace('<b>', '')
    input_str = input_str.replace('</b>', '')
    input_str = input_str.replace('\xa0', '')
    return input_str

In [303]:
def get_frame(json_data):
    title = [delete_tag(each['title']) for each in json_data['items']]
    lprice = [each['lprice'] for each in json_data['items']]
    link = [each['link'] for each in json_data['items']]
    mall_name = [each['mallName'] for each in json_data['items']]
    product_type = [each['productType'] for each in json_data['items']]
    brand = [each['brand'] for each in json_data['items']]
    category1 = [each['category1'] for each in json_data['items']]
    category2 = [each['category2'] for each in json_data['items']]
    category3 = [each['category3'] for each in json_data['items']]
    category4 = [each['category4'] for each in json_data['items']]
       
    df = pd.DataFrame({
        'title':title,
        'lprice':lprice,
        'link':link,
        'mall':mall_name,
        'category':product_type,
        'brand':brand,
        'category1':category1,
        'category2':category2,
        'category3':category3,
        'category4':category4,        
    }, columns=['title', 'lprice', 'link', 'mall', 'brand', 'category', 'category1', 'category2', 'category3', 'category4'])
    
    return df

In [304]:
url = get_search_url('화장품',1,50) # 반복문 돌려야함
json_data = get_search_result(url)
df = get_frame(json_data)

In [305]:
df.head()

Unnamed: 0,title,lprice,link,mall,brand,category,category1,category2,category3,category4
0,설화수 자음 2종 세트,38210,https://search.shopping.naver.com/catalog/3008...,네이버,설화수,1,화장품/미용,스킨케어,화장품세트,
1,설화수 탄력 크림 75ml,36690,https://search.shopping.naver.com/catalog/4157...,네이버,설화수,1,화장품/미용,스킨케어,크림,
2,키엘 울트라 훼이셜 크림 125ml,19990,https://search.shopping.naver.com/catalog/2109...,네이버,키엘,1,화장품/미용,스킨케어,크림,
3,인셀덤 기초 5종 세트,101590,https://search.shopping.naver.com/catalog/2838...,네이버,인셀덤,1,화장품/미용,스킨케어,화장품세트,
4,닥터지 레드 블레미쉬 클리어 크림 70ml,5580,https://search.shopping.naver.com/catalog/8863...,네이버,닥터지,1,화장품/미용,스킨케어,크림,


In [306]:
df['lprice'] = df['lprice'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      50 non-null     object
 1   lprice     50 non-null     int32 
 2   link       50 non-null     object
 3   mall       50 non-null     object
 4   brand      50 non-null     object
 5   category   50 non-null     object
 6   category1  50 non-null     object
 7   category2  50 non-null     object
 8   category3  50 non-null     object
 9   category4  50 non-null     object
dtypes: int32(1), object(9)
memory usage: 3.8+ KB


In [307]:
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [320]:
vector = TfidfVectorizer()
vector = CountVectorizer()
a = df['title'] + ' ' + df['mall'] + ' ' + df['brand'] + ' ' + df['category'] + ' ' + df['category1'] + ' ' + df['category2'] + ' ' + df['category3'] + ' ' + df['category4']  
matrix = vector.fit_transform(a)

In [321]:
user_query = '탄력 크림 에센스 시카'
user_tfidf = vector.transform([user_query])

In [322]:
similarity_scores = cosine_similarity(user_tfidf, matrix).flatten()

In [323]:
df['similarity'] = similarity_scores
recommendations = df.sort_values(by='similarity', ascending=False)
recommendations

Unnamed: 0,title,lprice,link,mall,brand,category,category1,category2,category3,category4,similarity
1,설화수 탄력 크림 75ml,36690,https://search.shopping.naver.com/catalog/4157...,네이버,설화수,1,화장품/미용,스킨케어,크림,,0.400892
31,코스놀로지 덱스판테놀 시카 진정 수분앰플 에센스 수딩젤 대용량 150ml,18900,https://smartstore.naver.com/main/products/105...,코스놀로지,코스놀로지,2,화장품/미용,스킨케어,에센스,,0.312772
43,리채움 더 굿 크림 포 모이스처라이징 80ml,29000,https://search.shopping.naver.com/catalog/5139...,네이버,,1,화장품/미용,스킨케어,크림,,0.301511
6,센텔리안24 더 마데카 크림 50ml,6540,https://search.shopping.naver.com/catalog/2981...,네이버,센텔리안24,1,화장품/미용,스킨케어,크림,,0.267261
41,코스알엑스 더 레티놀 0.1 크림 20ml,12500,https://search.shopping.naver.com/catalog/3381...,네이버,코스알엑스,1,화장품/미용,스킨케어,크림,,0.267261
9,에스트라 아토베리어365 크림 80ml,16410,https://search.shopping.naver.com/catalog/1772...,네이버,에스트라,1,화장품/미용,스킨케어,크림,,0.267261
13,스킨톡 마녀 크림 100ml,16800,https://search.shopping.naver.com/catalog/3024...,네이버,스킨톡,1,화장품/미용,스킨케어,크림,,0.267261
48,제로이드 수딩 크림 160ml,38400,https://search.shopping.naver.com/catalog/1015...,네이버,제로이드,1,화장품/미용,스킨케어,크림,,0.267261
47,키엘 울트라 훼이셜 크림 50ml,19800,https://search.shopping.naver.com/catalog/4048...,네이버,키엘,1,화장품/미용,스킨케어,크림,,0.258199
2,키엘 울트라 훼이셜 크림 125ml,19990,https://search.shopping.naver.com/catalog/2109...,네이버,키엘,1,화장품/미용,스킨케어,크림,,0.258199


In [280]:
df = pd.DataFrame({}, columns=['title', 'lprice', 'link', 'mall', 'brand', 'category', 'category1', 'category2', 'category3', 'category4']) 
for idx in range(1,100):
    url = get_search_url('사과', idx, 100)
    json_data = get_search_result(url)
    try_df = get_frame(json_data)
    df = pd.concat([df, try_df])

KeyboardInterrupt: 

In [225]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9900 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      9900 non-null   object
 1   lprice     9900 non-null   object
 2   link       9900 non-null   object
 3   mall       9900 non-null   object
 4   brand      9900 non-null   object
 5   category   9900 non-null   object
 6   category1  9900 non-null   object
 7   category2  9900 non-null   object
 8   category3  9900 non-null   object
 9   category4  9900 non-null   object
dtypes: object(10)
memory usage: 850.8+ KB


In [229]:
df = df.astype({"title" : "str", "lprice" : "int", 'category' : 'str'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9900 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      9900 non-null   object
 1   lprice     9900 non-null   int32 
 2   link       9900 non-null   object
 3   mall       9900 non-null   object
 4   brand      9900 non-null   object
 5   category   9900 non-null   object
 6   category1  9900 non-null   object
 7   category2  9900 non-null   object
 8   category3  9900 non-null   object
 9   category4  9900 non-null   object
dtypes: int32(1), object(9)
memory usage: 812.1+ KB
