In [1]:
import os
import sys
import urllib.request
import json
import pandas as pd
from dotenv import load_dotenv

In [2]:
load_dotenv()
client_id = os.getenv('NAVER_API_CLIENT_ID')
client_secret = os.getenv('NAVER_API_CLIENT_SECRET')

In [3]:
def get_search_url(search_txt, start_pg, disp_num):
    base = 'https://openapi.naver.com/v1/search/shop.json'
    query = '?query=' + urllib.parse.quote(search_txt)
    start = f'&start={start_pg}'
    disp = f'&display={disp_num}'
    url = base + query + disp + start    
    return url 

In [4]:
def get_search_result(url):
    request = urllib.request.Request(url)
    request.add_header('X-Naver-Client-Id', client_id)
    request.add_header('X-Naver-Client-Secret', client_secret)
    response = urllib.request.urlopen(request)
    return json.loads(response.read().decode('utf-8'))

In [5]:
def delete_tag(input_str):
    input_str = input_str.replace('<b>', '')
    input_str = input_str.replace('</b>', '')
    input_str = input_str.replace('\xa0', '')
    return input_str

In [6]:
def get_frame(json_data):
    title = [delete_tag(each['title']) for each in json_data['items']]
    lprice = [each['lprice'] for each in json_data['items']]
    link = [each['link'] for each in json_data['items']]
    mall_name = [each['mallName'] for each in json_data['items']]
    product_type = [each['productType'] for each in json_data['items']]
    brand = [each['brand'] for each in json_data['items']]
    category1 = [each['category1'] for each in json_data['items']]
    category2 = [each['category2'] for each in json_data['items']]
    category3 = [each['category3'] for each in json_data['items']]
    category4 = [each['category4'] for each in json_data['items']]
       
    df = pd.DataFrame({
        'title':title,
        'lprice':lprice,
        'link':link,
        'mall':mall_name,
        'category':product_type,
        'brand':brand,
        'category1':category1,
        'category2':category2,
        'category3':category3,
        'category4':category4,        
    }, columns=['title', 'lprice', 'link', 'mall', 'brand', 'category', 'category1', 'category2', 'category3', 'category4'])
    
    return df

In [21]:
url = get_search_url('과일 복숭아',1,100) # 반복문 돌려야함
json_data = get_search_result(url)
df = get_frame(json_data)

In [22]:
df.head()

Unnamed: 0,title,lprice,link,mall,brand,category,category1,category2,category3,category4
0,하우스 겨울 복숭아 설아 설도 납작복숭아 대극천 신비복숭아 딱딱이 말랑이 천도 백도...,38900,https://smartstore.naver.com/main/products/560...,맑은 숨,햇사레,2,식품,농산물,과일,복숭아
1,복숭아 납작복숭아 신비복숭아 하우스 대극천 거반도 빅숭아 천도 신선 백도 말랑이 딱딱이,34800,https://smartstore.naver.com/main/products/497...,데일리팜 dailyfarm,,2,식품,농산물,과일,복숭아
2,설아복숭아 겨울복숭아 임산부 한설빛 설복숭아 쫀득 가을 아삭이 딱딱이 딱복,69800,https://smartstore.naver.com/main/products/110...,프루츠윗,,2,식품,농산물,과일,복숭아
3,설아복숭아 겨울 복숭아 황귀비 양홍장 복숭아 햇사레 백도 물복 2kg,79900,https://smartstore.naver.com/main/products/890...,보트레푸드,,2,식품,농산물,과일,복숭아
4,"아침 수확 새벽 도착, 햇사레 말랑이 복숭아 3kg (일반/고당도)",35900,https://www.onbrix.co.kr/front/product/1029,온브릭스,온브릭스,2,식품,농산물,과일,복숭아


In [23]:
df['lprice'] = df['lprice'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      100 non-null    object
 1   lprice     100 non-null    int32 
 2   link       100 non-null    object
 3   mall       100 non-null    object
 4   brand      100 non-null    object
 5   category   100 non-null    object
 6   category1  100 non-null    object
 7   category2  100 non-null    object
 8   category3  100 non-null    object
 9   category4  100 non-null    object
dtypes: int32(1), object(9)
memory usage: 7.6+ KB


In [24]:
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
vector = TfidfVectorizer()
vector = CountVectorizer()
a = df['title'] + ' ' + df['mall'] + ' ' + df['brand'] + ' ' + df['category'] + ' ' + df['category1'] + ' ' + df['category2'] + ' ' + df['category3'] + ' ' + df['category4']  
matrix = vector.fit_transform(a)

In [26]:
user_query = '당도 복숭아 피자두 달다 향 식감'
user_tfidf = vector.transform([user_query])

In [27]:
similarity_scores = cosine_similarity(user_tfidf, matrix).flatten()

In [28]:
df['similarity'] = similarity_scores
recommendations = df.sort_values(by='similarity', ascending=False)
recommendations

Unnamed: 0,title,lprice,link,mall,brand,category,category1,category2,category3,category4,similarity
30,[제철과일] 인생 복숭아 청도 갤럭시농장 복숭아[복숭아 4kg],20900,http://mallscm.epost.go.kr/postif/epostPartner...,우체국쇼핑,,2,식품,농산물,과일,복숭아,0.800000
75,제철 과일 황도 복숭아 4kg 고당도 복숭아 선물 세트 말랑이 복숭아,59400,https://smartstore.naver.com/main/products/107...,구미당 스토어,,2,식품,농산물,과일,복숭아,0.718421
27,석가과일 복숭아,24730,https://link.coupang.com/re/PCSNAVERPCSDP?page...,쿠팡,,2,식품,농산물,과일,복숭아,0.666667
40,과일모형 복숭아,1800,https://link.gmarket.co.kr/gate/pcs?item-no=18...,G마켓,,2,식품,농산물,과일,복숭아,0.666667
3,설아복숭아 겨울 복숭아 황귀비 양홍장 복숭아 햇사레 백도 물복 2kg,79900,https://smartstore.naver.com/main/products/890...,보트레푸드,,2,식품,농산물,과일,복숭아,0.654654
...,...,...,...,...,...,...,...,...,...,...,...
28,유럽에서 먹었던 그 맛 납작복숭아 급냉 산지직송 고당도 높은 브릭스,19800,https://smartstore.naver.com/main/products/111...,작심스토어,,2,식품,농산물,과일,복숭아,0.277350
37,더리얼 한입 반건조 말랑복숭아 건조과일 말랭이,1900,https://smartstore.naver.com/main/products/636...,담둥마켓,더리얼,2,식품,농산물,과일,복숭아,0.267261
5,12월08일 마감 대설백도(겨울복숭아),990038000,https://smartstore.naver.com/main/products/752...,겨울복숭아 -대설백도,,2,식품,농산물,과일,복숭아,0.267261
48,더리얼 한입 반건조 말랑복숭아 10팩 건조과일,19800,https://smartstore.naver.com/main/products/636...,담둥마켓,더리얼,2,식품,농산물,과일,복숭아,0.267261


In [29]:
recommendations.to_csv('./recommend_test.csv', index=False, encoding='utf-8-sig')

In [280]:
df = pd.DataFrame({}, columns=['title', 'lprice', 'link', 'mall', 'brand', 'category', 'category1', 'category2', 'category3', 'category4']) 
for idx in range(1,100):
    url = get_search_url('사과', idx, 100)
    json_data = get_search_result(url)
    try_df = get_frame(json_data)
    df = pd.concat([df, try_df])

KeyboardInterrupt: 

In [225]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9900 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      9900 non-null   object
 1   lprice     9900 non-null   object
 2   link       9900 non-null   object
 3   mall       9900 non-null   object
 4   brand      9900 non-null   object
 5   category   9900 non-null   object
 6   category1  9900 non-null   object
 7   category2  9900 non-null   object
 8   category3  9900 non-null   object
 9   category4  9900 non-null   object
dtypes: object(10)
memory usage: 850.8+ KB


In [229]:
df = df.astype({"title" : "str", "lprice" : "int", 'category' : 'str'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9900 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      9900 non-null   object
 1   lprice     9900 non-null   int32 
 2   link       9900 non-null   object
 3   mall       9900 non-null   object
 4   brand      9900 non-null   object
 5   category   9900 non-null   object
 6   category1  9900 non-null   object
 7   category2  9900 non-null   object
 8   category3  9900 non-null   object
 9   category4  9900 non-null   object
dtypes: int32(1), object(9)
memory usage: 812.1+ KB
