# 앱 스토어 상품 추천 서비스를 위한 전처리
 - 데이터는 https://www.kaggle.com/usernam3/shopify-app-store 이 데이터를 사용하였습니다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import warnings
import time
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
apps_categories = pd.read_csv('./dataset/apps_categories.csv',delimiter = None,encoding='utf-8')
print("apps_categories : ",apps_categories.shape)

apps = pd.read_csv('./dataset/apps.csv',delimiter = None,encoding='utf-8')
print("apps : ",apps.shape)

categories = pd.read_csv('./dataset/categories.csv',delimiter = None,encoding='utf-8')
print("categories : ",categories.shape)

key_benefits = pd.read_csv('./dataset/key_benefits.csv',delimiter = None,encoding='utf-8')
print("key_benefits : ",key_benefits.shape)

pricing_plan_features = pd.read_csv('./dataset/pricing_plan_features.csv',delimiter = None,encoding='utf-8')
print("pricing_plan_features : ",pricing_plan_features.shape)

pricing_plans = pd.read_csv('./dataset/pricing_plans.csv',delimiter = None,encoding='utf-8')
print("pricing_plans : ",pricing_plans.shape)

reviews = pd.read_csv('./dataset/reviews.csv',delimiter = None,encoding='utf-8')
print("reviews : ",reviews.shape)

apps_categories :  (7376, 2)
apps :  (4750, 12)
categories :  (12, 2)
key_benefits :  (12927, 3)
pricing_plan_features :  (23990, 3)
pricing_plans :  (8514, 4)
reviews :  (447317, 8)


## 추천에 필요한 "상품" 데이터 준비하기

- apps에서 추천에 필요한 컬럼만 추출하기
  - 앱에 대한 묘사를 리스트 형태로 담기 위한 전처리 진행

In [3]:
start = time.time() 

app_info = apps[['id','title','rating','reviews_count','description','tagline']].rename(columns = {'id' : 'app_id','description':'app_description'})

for i in range(len(app_info)):
    # \n를 기준으로 리스트를 만든 다음, 스페이스바를 제거해줌
    temp_list = [n.strip() for n in app_info['app_description'].str.split('\n')[i]]
    # none값인 원소들을 삭제함
    app_info['app_description'][i] = list(filter(None, temp_list))

end = time.time()     
print(f"{end - start:.5f} sec")    
    
app_info

58.63104 sec


Unnamed: 0,app_id,title,rating,reviews_count,app_description,tagline
0,9e4748a9-7eda-4814-83b6-0537d44152b1,Panda Language Translate,4.7,379,[Panda Language Translation improve your busin...,Translate your store into multiple languages
1,d1476138-a608-4bb9-8d39-b30f3ca7617d,Instant Brand Page,4.9,13,[Having a brand page gives your shoppers a qui...,A-Z Brand Index Page and Favourites Slider
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,FAQ Accordion | Help Center,4.5,202,"[Create, organize and display Frequently Asked...","FAQ page, FAQ accordion menu for product Info ..."
3,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,Promote Me | Many apps in one,4.9,18,[Promote Me app is a bundle of apps that inclu...,"Spin Wheel,Currency Converter,Quick ATC Button..."
4,7aac2a1f-ff03-4f38-aeb7-7619403a6f05,Instalify,0.0,0,[We are trusted by hundreds of Shopify and Sho...,Supercharge Your Mobile App Installs
...,...,...,...,...,...,...
4745,a2197ba7-c0d0-4954-9b37-09c7f27ec919,Request a Quote & Hide Prices,4.9,26,[Add “Get a Quote” button and your phone numbe...,Add “Get a Quote” button with custom contact f...
4746,28cd2f9d-61ea-447f-ae7d-e67440352d19,Product Options and Customizer,4.2,81,[FREE Installation Service. Just follow these ...,Create extra and unlimited product options (pr...
4747,f6fc54ce-9999-4180-8e7e-c899bbf30bbd,Custom Reviews,3.0,4,[Custom Reviews lets you add forms to your Sho...,Get valuable customer feedbacks on your products
4748,95d883a8-7dba-443d-a6ee-f018d4446985,Swatches by Webyze,4.4,158,[Swatches - Add color swatches to your Shopify...,Add stylish color/image Swatches to your produ...


- 카테고리 데이터 전처리
  - 카테고리는 라벨 인코더를 진행 
  - 앱 하나당 카테고리가 여러개일 수 있으므로 전처리 후 맵핑
  - 중복되는 카테고리는 최대 5개임

In [4]:
categories_match = pd.merge(left=apps_categories, right=categories, how='left', left_on='category_id', right_on='id')
categories_match = categories_match[['app_id','title']].rename(columns = {'title' : 'categories'})
categories_match

Unnamed: 0,app_id,categories
0,9e4748a9-7eda-4814-83b6-0537d44152b1,Store design
1,d1476138-a608-4bb9-8d39-b30f3ca7617d,Store design
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,Store design
3,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,Customer support
4,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,Store design
...,...,...
7371,a2197ba7-c0d0-4954-9b37-09c7f27ec919,Sales and conversion optimization
7372,28cd2f9d-61ea-447f-ae7d-e67440352d19,Store design
7373,f6fc54ce-9999-4180-8e7e-c899bbf30bbd,Sales and conversion optimization
7374,95d883a8-7dba-443d-a6ee-f018d4446985,Store design


In [5]:
categories_encoder = LabelEncoder()
categories_encoder.fit(categories_match['categories'])
categories_match['categories'] = categories_encoder.transform(categories_match['categories'])
print(categories_match['categories'].unique())
print(categories_encoder.classes_)

# 인코더 저장
import pickle
with open('./encoder/categories_encoder.pickle','wb') as fw:
    pickle.dump(categories_encoder, fw)

[10  0  9  4  5  3  7  2  6  1  8 11]
['Customer support' 'Finances' 'Finding and adding products'
 'Inventory management' 'Marketing' 'Orders and shipping' 'Places to sell'
 'Productivity' 'Reporting' 'Sales and conversion optimization'
 'Store design' 'Trust and security']


In [6]:
categories_base = categories_match.drop_duplicates(['app_id']).reset_index(drop=True).astype(str)
# categories_temp = categories_temp.assign(categories_1=np.nan,
#                                         categories_2=np.nan,
#                                         categories_3=np.nan,
#                                         categories_4=np.nan,
#                                         categories_5=np.nan)
print(categories_base.shape)

(4750, 2)


In [7]:
# type을 object형으로 변환
categories_match = categories_match.astype(str)

# 카테고리 중복이 있는 앱만 추출
duplicate_categorie = categories_match.groupby(['app_id']).count()['categories'].loc[categories_match.groupby(['app_id']).count()['categories'] > 1]

length = len(duplicate_categorie)
app_id_group = categories_match.groupby(['app_id'])

# 카테고리 중복이 있는 앱의 카테고리들을 개별로 추출
for i in range(length):
    each_duplicate_categorie = app_id_group.get_group(duplicate_categorie.index[i])
    
    # 중복이 있는 개별 앱들의 카테고리를 합쳐서 temp변수에 담음 (','을 기준으로 합침)
    temp = ""
    for j in range(len(each_duplicate_categorie)):
        if j == len(each_duplicate_categorie) - 1:
            temp = temp + each_duplicate_categorie['categories'][each_duplicate_categorie.index[j]]
        else:    
            temp = temp + each_duplicate_categorie['categories'][each_duplicate_categorie.index[j]] + ','
    
    # temp변수에 담긴 카테고리들을 기존 카테고리 컬럼에 널기
    categories_base.loc[categories_base['app_id'] == duplicate_categorie.index[i],'categories'] = temp
    
# 원소를 리스트 형태로 저장    
# categories_base['categories'] = categories_base['categories'].str.split(',')    

categories_base

Unnamed: 0,app_id,categories
0,9e4748a9-7eda-4814-83b6-0537d44152b1,10
1,d1476138-a608-4bb9-8d39-b30f3ca7617d,10
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,100
3,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,109
4,c13bfb7f-8b5a-40c6-a338-dbdec5cfd130,4
...,...,...
4745,a2197ba7-c0d0-4954-9b37-09c7f27ec919,9
4746,28cd2f9d-61ea-447f-ae7d-e67440352d19,10
4747,f6fc54ce-9999-4180-8e7e-c899bbf30bbd,9
4748,95d883a8-7dba-443d-a6ee-f018d4446985,10


- 앱에 대한 개발자 답변 데이터 전처리
    - 앱 Q&A의 개발자의 답변을 이용하기 위함
    - 답변 역시 중복이 있으므로 카테고리와 마찬가지로 전처리
    - 개발자 답변은 자연어 이므로 키워드 추출을 통해 상위 3개만 저장함

In [8]:
dev_key = key_benefits[['app_id','description']].rename(columns = {'description' : 'dev_answer'})
dev_key

Unnamed: 0,app_id,dev_answer
0,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,Add a custom FAQ accordion with answers to com...
1,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,Give your customers answers to refund policy q...
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,Reduce support time and answer common question...
3,9e4748a9-7eda-4814-83b6-0537d44152b1,Add 100+ languages in your store. Increase you...
4,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,"Wheel of Fortune, Currency Converter, Animated..."
...,...,...
12922,95d883a8-7dba-443d-a6ee-f018d4446985,Choose between a variety of swatch themes.
12923,95d883a8-7dba-443d-a6ee-f018d4446985,No programming/coding skills are required. (Th...
12924,17918ea4-80b9-4811-947d-c0b2f950976a,Auto order confirmation via phone call & SMS f...
12925,17918ea4-80b9-4811-947d-c0b2f950976a,Reduce product return ratio and save your valu...


- 키워드 분석을 위해 영어로 번역
   - 데이터에는 다양한 언어들이 있음(중국어, 일본어 등)
- 키워드 추출을 위한 import

- TfidfVectorizer를 사용해 단어 분류
    - TF(Term Frequency) : 특정 단어가 하나의 데이터 안에서 등장하는 횟수
    - DF(Document Frequency) : 특정 단어가 여러 데이터에 자주 등장하는지를 알려주는 지표.
    - IDF(Inverse Document Frequency) : DF에 역수를 취해(inverse) 구함
    - TF-IDF : TF와 IDF를 곱한 값. 즉 TF가 높고, DF가 낮을수록 값이 커지는 것을 이용하는 것입니다.
    - 즉 해당 단위(문장) 안에서는 많이 등장하지만, 다른 문서들까지 전체에서는 적게 사용될수록, 분별력 있는 특징이란 것입니다.
- SentenceTransformer를 사용해 키워드 추출 (임베딩)

In [9]:
import googletrans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [11]:
start = time.time() 

for k in range(len(dev_key)):  
        
    tfidfv = TfidfVectorizer().fit([dev_key['dev_answer'][k]])
    # print(tfidfv.transform([dev_key['dev_answer'][0]]).toarray())
    # print(tfidfv.vocabulary_)
    
    # 후보 키워드 추출
    candidates = tfidfv.get_feature_names()
    
    # 원래 문장과 후보 키워드를 인코딩
    doc_embedding = model.encode([dev_key['dev_answer'][k]])
    candidate_embeddings = model.encode(candidates)
    
    # 둘의 코사인 유사도를 계산해서 키워드 top3 추출
    top_n = 3
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    
    # 추출한 키워드 삽입
    dev_key['dev_answer'][k] = ','.join(keywords)
    
    # 키워드 번역 (Google 번역 API는 초당 5회의 호출을 허용하고 하루에 20만 번으로 제한하므로 밑에다 넣음)
    # 시간관계상 번역 x
#     translator = googletrans.Translator() 
#     result = translator.translate(str(dev_key['dev_answer'][k]), dest='en')
#     dev_key['dev_answer'][k] = result.text
    
end = time.time()     
print(f"{end - start:.5f} sec")

dev_key

1264.02785 sec


Unnamed: 0,app_id,dev_answer
0,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,"custom,popular,accordion"
1,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,"frequently,refund,customers"
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,"page,reduce,cancellation"
3,9e4748a9-7eda-4814-83b6-0537d44152b1,"increase,sales,global"
4,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,"button,fortune,checkout"
...,...,...
12922,95d883a8-7dba-443d-a6ee-f018d4446985,"swatch,themes,choose"
12923,95d883a8-7dba-443d-a6ee-f018d4446985,"installed,automatically,seamlessly"
12924,17918ea4-80b9-4811-947d-c0b2f950976a,"phone,cash,sms"
12925,17918ea4-80b9-4811-947d-c0b2f950976a,"save,valuable,reduce"


In [12]:
dev_key_base = dev_key.drop_duplicates(['app_id']).reset_index(drop=True)
print(dev_key_base.shape)

(4309, 2)


In [13]:
start = time.time() 

# 개발자 답변이 중복이 있는 앱만 추출
duplicate_description = dev_key.groupby(['app_id']).count()['dev_answer'].loc[dev_key.groupby(['app_id']).count()['dev_answer'] > 1]

length = len(duplicate_description)
app_id_group = dev_key.groupby(['app_id'])
    
# 개발자 답변 중복이 있는 앱의 개발자 설명들을 개별로 추출
for i in range(length):
    each_duplicate_key = app_id_group.get_group(duplicate_description.index[i])
    
    # 중복이 있는 개별 앱들의 개발자 답변을 합쳐서 temp변수에 담음 (','을 기준으로 합침)
    temp = ""
    for j in range(len(each_duplicate_key)):
        if j == len(each_duplicate_key) - 1:
            temp = temp + each_duplicate_key['dev_answer'][each_duplicate_key.index[j]]
        else:    
            temp = temp + each_duplicate_key['dev_answer'][each_duplicate_key.index[j]] + ','
    
    # temp변수에 담긴 개발자 답변들을 기존 개발자 설명 컬럼에 널기
    dev_key_base.loc[dev_key_base['app_id'] == duplicate_description.index[i],'dev_answer'] = temp
    
# 원소를 리스트 형태로 저장    
# dev_key_base['dev_answer'] = dev_key_base['dev_answer'].str.split(',')

end = time.time()     
print(f"{end - start:.5f} sec")

dev_key_base

2.58521 sec


Unnamed: 0,app_id,dev_answer
0,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,"custom,popular,accordion,frequently,refund,cus..."
1,9e4748a9-7eda-4814-83b6-0537d44152b1,"increase,sales,global,help,visitors,increase,s..."
2,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,"button,fortune,checkout,redirect,button,autoco..."
3,d1476138-a608-4bb9-8d39-b30f3ca7617d,"layout,customise,ease,pages,individual,brand,s..."
4,7aac2a1f-ff03-4f38-aeb7-7619403a6f05,"websites,app,download,simple,quick,minutes,app..."
...,...,...
4304,a2197ba7-c0d0-4954-9b37-09c7f27ec919,"inquiries,cart,button,product,custom,customers..."
4305,28cd2f9d-61ea-447f-ae7d-e67440352d19,"textbox,upload,unlimited,customers,decorating,..."
4306,f6fc54ce-9999-4180-8e7e-c899bbf30bbd,"customer,increase,sales,built,forms,templates,..."
4307,95d883a8-7dba-443d-a6ee-f018d4446985,"color,product,shopify,swatch,themes,choose,ins..."


- 앱ID를 기준으로 리뷰들을 리스트 형태로 저장
  - 결측값이 있는 행은 모두 제거
  - 개발자 답변과 마찬가지로 주요 키워드를 추출해서 저장
  - 데이터가 44만건이 넘으므로 그냥 사용하려 했더니 24시간이 부족했다... 그래서 중복 건 중 첫번째와 마지막만 사용

In [14]:
review_body = reviews[['app_id','body']].rename(columns = {'body' : 'review'})
review_body.dropna(axis=0, inplace=True)
review_body

Unnamed: 0,app_id,review
0,b1da53a4-0474-4700-9620-bf386bc033fb,Great and super fast customer service! Highly ...
1,b1da53a4-0474-4700-9620-bf386bc033fb,"Still setting up my store, and after initially..."
2,b1da53a4-0474-4700-9620-bf386bc033fb,"This is an excellent search app, which they ha..."
3,b1da53a4-0474-4700-9620-bf386bc033fb,"A+, great great great customer service! thanks..."
4,b1da53a4-0474-4700-9620-bf386bc033fb,"I'm begginig to use this app, the search engin..."
...,...,...
447312,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,The app is cool and simple to use which I like...
447313,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,"Such a great thing, and great customer service..."
447314,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,Just found Privy! I can't wait to start recei...
447315,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,"Wow, overnight got tons of new email signups a..."


In [15]:
review_body_first = review_body.drop_duplicates(subset='app_id', keep='first')

review_body_last = review_body.drop_duplicates(subset='app_id', keep='last')

review_l_f = pd.concat([review_body_first,review_body_last], axis=0)
review_l_f

Unnamed: 0,app_id,review
0,b1da53a4-0474-4700-9620-bf386bc033fb,Great and super fast customer service! Highly ...
10,a31e5ac0-c672-4208-bc7c-ad8b0a50c86f,These guys have excellent support. I unfortuna...
20,70da5100-89ee-4ca5-82e1-d2f5b9c5f36f,The Customer Support from this Team is simply ...
21,f8f7c560-1fc5-480f-a36e-da80685d4dbf,I used this App to help me get right size char...
22,46fc5d8d-5a76-4db9-bfbb-493217fea298,"Soooooo happy to have found your app, after ho..."
...,...,...
426050,4285daa9-6875-42af-962f-ed66fa207704,I needed an affordable option to add product o...
428615,c769a28f-379a-4346-8871-915b558a099d,The Printful team is the best we have worked w...
431523,0c29ae5e-01e0-4efb-959f-40d9041db6fa,"Excellent application, I like my site to be cl..."
433183,da066a0b-53fc-42a2-b329-e0cce559f7ea,Awesome. Simple and easy to use and very easy ...


In [16]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [34]:
start = time.time() 

for k in range(len(review_l_f)):
    K = review_l_f.index[k]
    try:
        tfidfv = TfidfVectorizer().fit([review_l_f['review'][K]])
        # print(tfidfv.transform([dev_key['dev_answer'][0]]).toarray())
        # print(tfidfv.vocabulary_)

        # 후보 키워드 추출
        candidates = tfidfv.get_feature_names()

        # 원래 문장과 후보 키워드를 인코딩
        doc_embedding = model.encode([review_l_f['review'][K]])
        candidate_embeddings = model.encode(candidates)

        # 둘의 코사인 유사도를 계산해서 키워드 top3 추출
        top_n = 3
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

        # 추출한 키워드 삽입
        review_l_f['review'][K] = ','.join(keywords)
    
    except:
        review_l_f['review'][K] = ""
        print("error : ", k, end=' ')
    
end = time.time()     
print(f"{end - start:.5f} sec")

review_l_f

error :  10 error :  11 error :  18 error :  24 error :  51 error :  53 error :  62 error :  76 error :  77 error :  78 error :  91 error :  92 error :  95 error :  98 error :  99 error :  107 error :  108 error :  110 error :  114 error :  115 error :  116 error :  117 error :  119 error :  120 error :  121 error :  124 error :  128 error :  132 error :  140 error :  150 error :  154 error :  155 error :  157 error :  166 error :  167 error :  168 error :  172 error :  174 error :  176 error :  178 error :  182 error :  185 error :  191 error :  193 error :  201 error :  202 error :  204 error :  205 error :  212 error :  222 error :  225 error :  231 error :  236 error :  240 error :  245 error :  250 error :  253 error :  255 error :  263 error :  264 error :  269 error :  278 error :  283 error :  298 error :  300 error :  301 error :  303 error :  305 error :  307 error :  313 error :  314 error :  315 error :  317 error :  318 error :  319 error :  325 error :  327 error :  328 e

Unnamed: 0,app_id,review
0,b1da53a4-0474-4700-9620-bf386bc033fb,"analytics,fast,customizable"
10,a31e5ac0-c672-4208-bc7c-ad8b0a50c86f,"guys,desired,needed"
20,70da5100-89ee-4ca5-82e1-d2f5b9c5f36f,"quick,amazing,responsive"
21,f8f7c560-1fc5-480f-a36e-da80685d4dbf,"last,mails,weeks"
22,46fc5d8d-5a76-4db9-bfbb-493217fea298,"hours,app,happy"
...,...,...
426050,4285daa9-6875-42af-962f-ed66fa207704,"emails,affordable,bakery"
428615,c769a28f-379a-4346-8871-915b558a099d,"always,best,updating"
431523,0c29ae5e-01e0-4efb-959f-40d9041db6fa,"shopping,clean,free"
433183,da066a0b-53fc-42a2-b329-e0cce559f7ea,"simple,awesome,easy"


In [56]:
review_l_f.reset_index(drop=True, inplace=True)
review_body_base = review_l_f.drop_duplicates(['app_id']).reset_index(drop=True)
print(review_body_base.shape)

(3733, 2)


In [57]:
start = time.time()

# 리뷰가 중복이 있는 앱만 추출
duplicate_body = review_l_f.groupby(['app_id']).count()['review'].loc[review_l_f.groupby(['app_id']).count()['review'] > 1]

length = len(duplicate_body)
app_id_group = review_l_f.groupby(['app_id'])

# 리뷰가 중복이 있는 앱의 리뷰들을 개별로 추출
for i in range(length):
    each_duplicate_body = app_id_group.get_group(duplicate_body.index[i])
    
    # 중복이 있는 개별 앱들의 리뷰를 합쳐서 temp변수에 담음 (','을 기준으로 합침)
    temp = ""
    for j in range(len(each_duplicate_body)):
        if j == len(each_duplicate_body) - 1:
            temp = temp + each_duplicate_body['review'][each_duplicate_body.index[j]]
        else:    
            temp = temp + each_duplicate_body['review'][each_duplicate_body.index[j]] + ','
    
    # temp변수에 담긴 리뷰들을 기존 개발자 설명 컬럼에 널기
    review_body_base.loc[review_body_base['app_id'] == duplicate_body.index[i],'review'] = temp
    
# 원소를 리스트 형태로 저장    
# review_body_base['review'] = review_body_base['review'].str.split(',')    

end = time.time()     
print(f"{end - start:.5f} sec")

review_body_base

1.49425 sec


Unnamed: 0,app_id,review
0,b1da53a4-0474-4700-9620-bf386bc033fb,"analytics,fast,customizable,customisation,cust..."
1,a31e5ac0-c672-4208-bc7c-ad8b0a50c86f,"guys,desired,needed,brilliant,app,happy"
2,70da5100-89ee-4ca5-82e1-d2f5b9c5f36f,"quick,amazing,responsive,increase,install,solved"
3,f8f7c560-1fc5-480f-a36e-da80685d4dbf,"last,mails,weeks,bot,efficient,shopify"
4,46fc5d8d-5a76-4db9-bfbb-493217fea298,"hours,app,happy,table,support,youtube"
...,...,...
3728,a2197ba7-c0d0-4954-9b37-09c7f27ec919,"thediamonddaughters,customize,jewelry,website,..."
3729,1d5259bf-b0e2-4850-a11f-6890a2c9c189,"app,awesome,helpful,easy,website,apps"
3730,f6fc54ce-9999-4180-8e7e-c899bbf30bbd,"wow,nice,love,developer,bash,bashing"
3731,95d883a8-7dba-443d-a6ee-f018d4446985,"webyze,customizing,happy,outstanding,website,app"


### 데이터 join하기

In [58]:
join_df = pd.merge(left=app_info, right=categories_base, how='left', on='app_id')

join_df = pd.merge(left=join_df, right=dev_key_base, how='left', on='app_id')

join_df = pd.merge(left=join_df, right=review_body_base, how='left', on='app_id')

join_df

Unnamed: 0,app_id,title,rating,reviews_count,app_description,tagline,categories,dev_answer,review
0,9e4748a9-7eda-4814-83b6-0537d44152b1,Panda Language Translate,4.7,379,[Panda Language Translation improve your busin...,Translate your store into multiple languages,10,"increase,sales,global,help,visitors,increase,s...","thank,pretty,helpful,simplicity,love,helpful"
1,d1476138-a608-4bb9-8d39-b30f3ca7617d,Instant Brand Page,4.9,13,[Having a brand page gives your shoppers a qui...,A-Z Brand Index Page and Favourites Slider,10,"layout,customise,ease,pages,individual,brand,s...","responsive,friendly,helpful,sale,easy,automati..."
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,FAQ Accordion | Help Center,4.5,202,"[Create, organize and display Frequently Asked...","FAQ page, FAQ accordion menu for product Info ...",100,"custom,popular,accordion,frequently,refund,cus...","recommend,free,satisfying,lost,poor,bugs"
3,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,Promote Me | Many apps in one,4.9,18,[Promote Me app is a bundle of apps that inclu...,"Spin Wheel,Currency Converter,Quick ATC Button...",109,"button,fortune,checkout,redirect,button,autoco...","app,apps,customizable,easy,makeprosimp,apps"
4,7aac2a1f-ff03-4f38-aeb7-7619403a6f05,Instalify,0.0,0,[We are trusted by hundreds of Shopify and Sho...,Supercharge Your Mobile App Installs,10,"websites,app,download,simple,quick,minutes,app...",
...,...,...,...,...,...,...,...,...,...
4745,a2197ba7-c0d0-4954-9b37-09c7f27ec919,Request a Quote & Hide Prices,4.9,26,[Add “Get a Quote” button and your phone numbe...,Add “Get a Quote” button with custom contact f...,9,"inquiries,cart,button,product,custom,customers...","thediamonddaughters,customize,jewelry,website,..."
4746,28cd2f9d-61ea-447f-ae7d-e67440352d19,Product Options and Customizer,4.2,81,[FREE Installation Service. Just follow these ...,Create extra and unlimited product options (pr...,10,"textbox,upload,unlimited,customers,decorating,...","app,apps,easy,value,good,easy"
4747,f6fc54ce-9999-4180-8e7e-c899bbf30bbd,Custom Reviews,3.0,4,[Custom Reviews lets you add forms to your Sho...,Get valuable customer feedbacks on your products,9,"customer,increase,sales,built,forms,templates,...","wow,nice,love,developer,bash,bashing"
4748,95d883a8-7dba-443d-a6ee-f018d4446985,Swatches by Webyze,4.4,158,[Swatches - Add color swatches to your Shopify...,Add stylish color/image Swatches to your produ...,10,"color,product,shopify,swatch,themes,choose,ins...","webyze,customizing,happy,outstanding,website,app"


In [59]:
join_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4750 entries, 0 to 4749
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   app_id           4750 non-null   object 
 1   title            4750 non-null   object 
 2   rating           4750 non-null   float64
 3   reviews_count    4750 non-null   int64  
 4   app_description  4750 non-null   object 
 5   tagline          4750 non-null   object 
 6   categories       4750 non-null   object 
 7   dev_answer       4309 non-null   object 
 8   review           3733 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 371.1+ KB


In [60]:
# join_df.fillna('[]', inplace=True)

### 불공정한 평점 조정하기 (베이지안 추정치 사용)
  - weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C
    - R = average for the app (mean) = (Rating)
    - v = number of review for the app = (review)
    - m = minimum review required to be listed in the Top 500
    - C = the mean review across the whole report

In [61]:
R = app_info['rating']

v = app_info['reviews_count']

m = app_info['reviews_count'].quantile(0.9)

C = app_info['reviews_count'].mean()

print(m)
print(C)

156.10000000000036
94.17115789473684


In [62]:
WR = (v / (v+m)) * R + (m / (v+m)) * C
join_df['wr_rating'] = WR
join_df

Unnamed: 0,app_id,title,rating,reviews_count,app_description,tagline,categories,dev_answer,review,wr_rating
0,9e4748a9-7eda-4814-83b6-0537d44152b1,Panda Language Translate,4.7,379,[Panda Language Translation improve your busin...,Translate your store into multiple languages,10,"increase,sales,global,help,visitors,increase,s...","thank,pretty,helpful,simplicity,love,helpful",30.800631
1,d1476138-a608-4bb9-8d39-b30f3ca7617d,Instant Brand Page,4.9,13,[Having a brand page gives your shoppers a qui...,A-Z Brand Index Page and Favourites Slider,10,"layout,customise,ease,pages,individual,brand,s...","responsive,friendly,helpful,sale,easy,automati...",87.308207
2,d6e49a3c-2f9f-4bfa-8c26-5d024faf2241,FAQ Accordion | Help Center,4.5,202,"[Create, organize and display Frequently Asked...","FAQ page, FAQ accordion menu for product Info ...",100,"custom,popular,accordion,frequently,refund,cus...","recommend,free,satisfying,lost,poor,bugs",43.588712
3,0ef0087f-3ae5-4dbc-84e0-193b576d82ed,Promote Me | Many apps in one,4.9,18,[Promote Me app is a bundle of apps that inclu...,"Spin Wheel,Currency Converter,Quick ATC Button...",109,"button,fortune,checkout,redirect,button,autoco...","app,apps,customizable,easy,makeprosimp,apps",84.941515
4,7aac2a1f-ff03-4f38-aeb7-7619403a6f05,Instalify,0.0,0,[We are trusted by hundreds of Shopify and Sho...,Supercharge Your Mobile App Installs,10,"websites,app,download,simple,quick,minutes,app...",,94.171158
...,...,...,...,...,...,...,...,...,...,...
4745,a2197ba7-c0d0-4954-9b37-09c7f27ec919,Request a Quote & Hide Prices,4.9,26,[Add “Get a Quote” button and your phone numbe...,Add “Get a Quote” button with custom contact f...,9,"inquiries,cart,button,product,custom,customers...","thediamonddaughters,customize,jewelry,website,...",81.425139
4746,28cd2f9d-61ea-447f-ae7d-e67440352d19,Product Options and Customizer,4.2,81,[FREE Installation Service. Just follow these ...,Create extra and unlimited product options (pr...,10,"textbox,upload,unlimited,customers,decorating,...","app,apps,easy,value,good,easy",63.434491
4747,f6fc54ce-9999-4180-8e7e-c899bbf30bbd,Custom Reviews,3.0,4,[Custom Reviews lets you add forms to your Sho...,Get valuable customer feedbacks on your products,9,"customer,increase,sales,built,forms,templates,...","wow,nice,love,developer,bash,bashing",91.893303
4748,95d883a8-7dba-443d-a6ee-f018d4446985,Swatches by Webyze,4.4,158,[Swatches - Add color swatches to your Shopify...,Add stylish color/image Swatches to your produ...,10,"color,product,shopify,swatch,themes,choose,ins...","webyze,customizing,happy,outstanding,website,app",49.014065


- 데이터 저장

In [63]:
join_df.to_csv('./temp_data/app_df.csv',index = False)

## 추천에 필요한 "사용자" 데이터 준비하기

- review데이터의 작성자ID를 기준으로 중복값을 제거한 후 리뷰와 날짜의 원소를 묶어 저장 
  - 평점 별 추천은 나중에 해보기로.....

In [3]:
user_review = reviews[['author','app_id','rating','posted_at','body']].rename(columns = {'body' : 'review_body'})
user_review.dropna(axis=0, inplace=True)
user_review['posted_at'] = pd.to_datetime(user_review['posted_at'])
user_review = user_review.astype(str)

user_review

Unnamed: 0,author,app_id,rating,posted_at,review_body
0,Consuela,b1da53a4-0474-4700-9620-bf386bc033fb,5,2020-08-06,Great and super fast customer service! Highly ...
1,L'Atelier Global,b1da53a4-0474-4700-9620-bf386bc033fb,5,2020-08-04,"Still setting up my store, and after initially..."
2,city'super E-Shop,b1da53a4-0474-4700-9620-bf386bc033fb,5,2020-08-04,"This is an excellent search app, which they ha..."
3,PortableHandwashing.com,b1da53a4-0474-4700-9620-bf386bc033fb,5,2020-07-30,"A+, great great great customer service! thanks..."
4,ICCTUNING,b1da53a4-0474-4700-9620-bf386bc033fb,5,2020-07-28,"I'm begginig to use this app, the search engin..."
...,...,...,...,...,...
447312,Life Force,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,2,2015-08-10,The app is cool and simple to use which I like...
447313,Luka Inc,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,5,2015-07-16,"Such a great thing, and great customer service..."
447314,Say It Forward,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,5,2015-06-29,Just found Privy! I can't wait to start recei...
447315,Cupboard Distributing,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc,5,2015-06-26,"Wow, overnight got tons of new email signups a..."


- 사용자 별로 리뷰한 앱 맵핑
  - 데이터를 줄이기 위해 drop_duplicates를 이용해 사용자 별 하나의 앱만 리뷰함

In [4]:
user_review_temp = user_review[['author','app_id']]
user_review_app = user_review_temp.drop_duplicates(subset='author', keep='first')
print(user_review_app.shape)
user_review_app

(299204, 2)


Unnamed: 0,author,app_id
0,Consuela,b1da53a4-0474-4700-9620-bf386bc033fb
1,L'Atelier Global,b1da53a4-0474-4700-9620-bf386bc033fb
2,city'super E-Shop,b1da53a4-0474-4700-9620-bf386bc033fb
3,PortableHandwashing.com,b1da53a4-0474-4700-9620-bf386bc033fb
4,ICCTUNING,b1da53a4-0474-4700-9620-bf386bc033fb
...,...,...
447311,Mudl Shop 2,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc
447312,Life Force,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc
447313,Luka Inc,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc
447314,Say It Forward,1a4accf8-b57e-4c08-a07c-fe5db0f0a0bc


In [5]:
# user_review_temp = user_review[['author','app_id']]

# user_review_app = user_review_temp.drop_duplicates(['author']).reset_index(drop=True)
# print(user_review_app.shape)

In [6]:
# start = time.time() 

# # 중복이 있는 것만 추출
# duplicate_app = user_review_temp.groupby(['author']).count()['app_id'].loc[user_review_temp.groupby(['author']).count()['app_id'] > 1]

# group = user_review_temp.groupby(['author'])
# length = len(duplicate_app)

# # 중복이 있는 것을 개별로 추출
# for i in range(length):
#     each_duplicate_app = group.get_group(duplicate_app.index[i])
    
#     # 개별 것들을 합쳐서 temp변수에 담음 (','을 기준으로 합침)
#     temp = ""
#     each_length = len(each_duplicate_app)
    
#     for j in range(each_length):
#         if j == each_length - 1:
#             temp = temp + each_duplicate_app['app_id'][each_duplicate_app.index[j]]
#         else:    
#             temp = temp + each_duplicate_app['app_id'][each_duplicate_app.index[j]] + ','
    
#     # temp변수에 담긴 것을 컬럼에 널기
#     user_review_app.loc[user_review_app['author'] == duplicate_app.index[i],'app_id'] = temp
    
# # 원소를 리스트 형태로 저장    
# user_review_app['app_id'] = user_review_app['app_id'].str.split(',')    

# end = time.time()     
# print(f"{end - start:.5f} sec")

# user_review_app

- 작성자 별 평점을 추출
  - 데이터를 줄이기 위해 drop_duplicates를 이용해 사용자 별 하나의 앱만 평점입력함

In [7]:
user_review_temp = user_review[['author','rating']]
user_review_rating = user_review_temp.drop_duplicates(subset='author', keep='first')
print(user_review_rating.shape)
user_review_rating

(299204, 2)


Unnamed: 0,author,rating
0,Consuela,5
1,L'Atelier Global,5
2,city'super E-Shop,5
3,PortableHandwashing.com,5
4,ICCTUNING,5
...,...,...
447311,Mudl Shop 2,5
447312,Life Force,2
447313,Luka Inc,5
447314,Say It Forward,5


In [8]:
# user_review_rating = user_review_temp.drop_duplicates(['author']).reset_index(drop=True)
# print(user_review_rating.shape)

In [9]:
# start = time.time() 

# # 중복이 있는 것만 추출
# duplicate_app = user_review_temp.groupby(['author']).count()['rating'].loc[user_review_temp.groupby(['author']).count()['rating'] > 1]

# group = user_review_temp.groupby(['author'])
# length = len(duplicate_app)

# # 중복이 있는 것을 개별로 추출
# for i in range(length):
#     each_duplicate_app = group.get_group(duplicate_app.index[i])
    
#     # 개별 것들을 합쳐서 temp변수에 담음 (','을 기준으로 합침)
#     temp = ""
#     each_length = len(each_duplicate_app)
    
#     for j in range(each_length):
#         if j == each_length - 1:
#             temp = temp + each_duplicate_app['rating'][each_duplicate_app.index[j]]
#         else:    
#             temp = temp + each_duplicate_app['rating'][each_duplicate_app.index[j]] + ','
    
#     # temp변수에 담긴 것을 컬럼에 널기
#     user_review_rating.loc[user_review_rating['author'] == duplicate_app.index[i],'rating'] = temp
    
# # 원소를 리스트 형태로 저장    
# user_review_rating['rating'] = user_review_rating['rating'].str.split(',')    

# end = time.time()     
# print(f"{end - start:.5f} sec")

# user_review_rating

- 날짜를 리스트로 변환

In [10]:
# user_review_temp = user_review[['author','posted_at']]

# user_review_posted_at = user_review_temp.drop_duplicates(['posted_at']).reset_index(drop=True)
# print(user_review_posted_at.shape)

In [11]:
# start = time.time() 

# # 중복이 있는 것만 추출
# duplicate_app = user_review_temp.groupby(['author']).count()['posted_at'].loc[user_review_temp.groupby(['author']).count()['posted_at'] > 1]

# group = user_review_temp.groupby(['author'])
# length = len(duplicate_app)

# # 중복이 있는 것을 개별로 추출
# for i in range(length):
#     each_duplicate_app = group.get_group(duplicate_app.index[i])
    
#     # 개별 것들을 합쳐서 temp변수에 담음 (','을 기준으로 합침)
#     temp = ""
#     each_length = len(each_duplicate_app)
    
#     for j in range(each_length):
#         if j == each_length - 1:
#             temp = temp + each_duplicate_app['posted_at'][each_duplicate_app.index[j]]
#         else:    
#             temp = temp + each_duplicate_app['posted_at'][each_duplicate_app.index[j]] + ','
    
#     # temp변수에 담긴 것을 컬럼에 널기
#     user_review_posted_at.loc[user_review_posted_at['author'] == duplicate_app.index[i],'posted_at'] = temp
    
# # 원소를 리스트 형태로 저장    
# user_review_posted_at['posted_at'] = user_review_posted_at['posted_at'].str.split(',')    

# end = time.time()     
# print(f"{end - start:.5f} sec")

# user_review_posted_at

- 작성자 별 리뷰를 추출 
  - 데이터를 줄이기 위해 drop_duplicates를 이용해 사용자 별 하나의 앱만 평점입력함
  - 사용자 리뷰는 전처리 시간이 오래걸려 데이터셋을 임의로 줄임

In [13]:
user_review_temp = user_review[['author','review_body']]
user_review_temp = user_review_temp.drop_duplicates(subset='author', keep='first')
user_review_temp = user_review_temp.iloc[int(299204/3):int(299204/3)+5000,:]
user_review_temp = user_review_temp.reset_index(drop=True)
user_review_temp

Unnamed: 0,author,review_body
0,Squishyvibe,"Great app, does what it says."
1,bluemoonbamboo,I was so happy to come across this app! Having...
2,CC Outlet HK,"this app is easy to use, it would be more idea..."
3,Manamasks,"Great app. Simple to use and understand, the d..."
4,Miracolo Fashion,So far so good. In one day menu done. :D\n-b-
...,...,...
4995,FARASIAA JEWELRY AND ACCESSORIES,The support team is very helpful and friendly....
4996,ScootersAndHelmets,The app is extremely easy to use and looks gre...
4997,SlickMan,Great way to encourage your buyers to buy more...
4998,Chahin Egypt,I used it and it is good and i like it I wish ...


In [14]:
import googletrans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [16]:
start = time.time() 

for k in range(len(user_review_temp)):
    K = user_review_temp.index[k]
    try:
        tfidfv = TfidfVectorizer().fit([user_review_temp['review_body'][K]])
        # print(tfidfv.transform([dev_key['dev_answer'][0]]).toarray())
        # print(tfidfv.vocabulary_)

        # 후보 키워드 추출
        candidates = tfidfv.get_feature_names()

        # 원래 문장과 후보 키워드를 인코딩
        doc_embedding = model.encode([user_review_temp['review_body'][K]])
        candidate_embeddings = model.encode(candidates)

        # 둘의 코사인 유사도를 계산해서 키워드 top3 추출
        top_n = 3
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

        # 추출한 키워드 삽입
        user_review_temp['review_body'][K] = ','.join(keywords)
    
    except:
        user_review_temp['review_body'][K] = ""
        print("error : ", k, end=' ')
    
end = time.time()     
print(f"{end - start:.5f} sec")

user_review_temp

662.26127 sec


Unnamed: 0,author,review_body
0,Squishyvibe,"says,great,app"
1,bluemoonbamboo,"app,happy,customizer"
2,CC Outlet HK,"app,ideal,easy"
3,Manamasks,"app,friendly,intuitive"
4,Miracolo Fashion,"done,day,menu"
...,...,...
4995,FARASIAA JEWELRY AND ACCESSORIES,"fast,helpful,knowledgeable"
4996,ScootersAndHelmets,"app,easy,effective"
4997,SlickMan,"buy,app,buyers"
4998,Chahin Egypt,"help,thanks,wish"


In [17]:
# user_review_review_body = user_review_temp.drop_duplicates(['review_body']).reset_index(drop=True)
# print(user_review_review_body.shape)

In [18]:
# start = time.time() 

# # 중복이 있는 것만 추출
# duplicate_app = user_review_temp.groupby(['author']).count()['review_body'].loc[user_review_temp.groupby(['author']).count()['review_body'] > 1]

# group = user_review_temp.groupby(['author'])
# length = len(duplicate_app)

# # 중복이 있는 것을 개별로 추출
# for i in range(length):
#     each_duplicate_app = group.get_group(duplicate_app.index[i])
    
#     # 개별 것들을 합쳐서 temp변수에 담음 (','을 기준으로 합침)
#     temp = ""
#     each_length = len(each_duplicate_app)
    
#     for j in range(each_length):
#         if j == each_length - 1:
#             temp = temp + each_duplicate_app['review_body'][each_duplicate_app.index[j]]
#         else:    
#             temp = temp + each_duplicate_app['review_body'][each_duplicate_app.index[j]] + ','
    
#     # temp변수에 담긴 것을 컬럼에 널기
#     user_review_review_body.loc[user_review_review_body['author'] == duplicate_app.index[i],'review_body'] = temp
    
# # 원소를 리스트 형태로 저장    
# # user_review_review_body['review_body'] = user_review_review_body['review_body'].str.split(',')    

# end = time.time()     
# print(f"{end - start:.5f} sec")

# user_review_review_body

In [19]:
user_review_review_body = user_review_temp.copy()

- 데이터 저장하기

In [20]:
user_review_app.to_csv('./temp_data/user_review_app.csv',index = False)
user_review_rating.to_csv('./temp_data/user_review_rating.csv',index = False)
#user_review_posted_at.to_csv('./temp_data/user_review_posted_at.csv',index = False)
user_review_review_body.to_csv('./temp_data/user_review_review_body.csv',index = False)