# 아마존 뷰티 제품 평점 정보:

 - 2M 개 이상의 고객 리뷰와 평점 정보를 포함한 데이터셋을 가지고 인기 제품 추천과 을 만들어 보자. 앞서 영화 추천과 비슷하게 진행가능하다.

 - 데이터셋에는 총 4가지 정보가 포함되어 있다:

   - 사용자 ID
   - 상품 ID (ASIN이라 부른다)
   - 평점 정보 (1-5)
   - 평점이 주어진 시간

* 앞서 2일차와 4일차 강의 내용을 기반으로 인기도 기반의 추천과 SVD 기반의 추천을 만들어 보자 

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
amazon_ratings = pd.read_csv("https://grepp-reco-test.s3.ap-northeast-2.amazonaws.com/ratings_Beauty.csv")

4가지 정보 중에 하나라도 비어있는 레코드들을 모두 날리고 처음 5개의 레코드를 살펴본다

In [None]:
amaRatings = amazon_ratings.pivot_table(index=['UserId'], columns=['ProductId'], values='Rating')

In [11]:
amazon_ratings.shape

(2023070, 4)

In [12]:
amazon_ratings = amazon_ratings.dropna()
amazon_ratings.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [30]:
amazon_ratings.describe(include='all')

Unnamed: 0,UserId,ProductId,Rating,Timestamp
count,2023070,2023070,2023070.0,2023070.0
unique,1210271,249274,,
top,A3KEZLJ59C1JVH,B001MA0QY2,,
freq,389,7533,,
mean,,,4.149036,1360389000.0
std,,,1.311505,46118600.0
min,,,1.0,908755200.0
25%,,,4.0,1350259000.0
50%,,,5.0,1372810000.0
75%,,,5.0,1391472000.0


# 인기도 기반 추천 만들기

정보가 없는 사용자들에게 가장 쉽게 사용할 수 있는 형태의 추천 방식

리뷰의 수가 가장 큰 상품 10개를 계산해본다. 

In [7]:
popular_products = pd.DataFrame(amazon_ratings.groupby('ProductId')['Rating'].count())
most_popular = popular_products.sort_values('Rating', ascending=False)
most_popular.head(10)

Unnamed: 0_level_0,Rating
ProductId,Unnamed: 1_level_1
B001MA0QY2,7533
B0009V1YR8,2869
B0043OYFKU,2477
B0000YUXI0,2143
B003V265QW,2088
B000ZMBSPE,2041
B003BQ6QXK,1918
B004OHQR1Q,1885
B00121UVU0,1838
B000FS05VG,1589


이 10개를 바 그래프로 그려본다

# 리뷰 수가 어느 정도 이상되는 상품을 대상으로 평균 평점이 4이상인 뷰티 상품을 리턴하게 해보자. 

# 모델 기반 CF 추천 시스템 만들기

평점 행렬을 기반으로 SVD++를 사용해보는 방식. 앞서 사용해본 surprise 모듈을 사용하던지 scikit-learn의 TruncatedSVD를 사용해본다

In [18]:
!pip install surprise



In [9]:
amazon_ratings1 = amazon_ratings.head(10000)

In [10]:
ratings_utility_matrix = amazon_ratings1.pivot_table(values='Rating', index='UserId', columns='ProductId', fill_value=0)
ratings_utility_matrix.head()

ProductId,0205616461,0558925278,0733001998,0737104473,0762451459,1304139212,1304139220,130414089X,130414643X,1304146537,...,B000052YPE,B000052YPF,B000052YPG,B000052YPH,B000052YPM,B000052YPU,B000052YPV,B000052YPY,B000052YQ0,B000052YQ2
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00205921JHJK5X9LNP42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A024581134CV80ZBLIZTZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A03056581JJIOL5FSKJY7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A03099101ZRK4K607JVHH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0505229A7NSH3FRXRR4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
ratings_utility_matrix.shape

(9697, 886)

In [12]:
X = ratings_utility_matrix.T
X.head()

UserId,A00205921JHJK5X9LNP42,A024581134CV80ZBLIZTZ,A03056581JJIOL5FSKJY7,A03099101ZRK4K607JVHH,A0505229A7NSH3FRXRR4,A05492663T95KW63BR75K,A059547920Q3LZVFHLPI3,A07410232KYRFR25CIUGJ,A082796624UNM47DSAI6K,A0864963DOAY7LXGS5I6,...,AZW1HXXYAC15B,AZWRTJPN7NXT,AZWTXHXZXFAYP,AZYQEFB9Y5N22,AZZHB6U54UDYW,AZZHJZP4GQPPZ,AZZNK89PXD006,AZZOFVMQC0BJG,AZZQXL8VDCFTV,AZZTJQ7CQZUD8
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
205616461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
558925278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
733001998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
737104473,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
762451459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
X.shape

(886, 9697)

In [14]:
X1 = X

In [15]:
# from sklearn.decomposition import TruncatedSVD

# SVD = TruncatedSVD(n_components=10)
# decomposed_matrix = SVD.fit_transform(X)
# decomposed_matrix.shape

(886, 10)

In [16]:
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(886, 886)

In [8]:
import surprise
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic
from surprise import SVD
from surprise import NormalPredictor
from surprise.model_selection import GridSearchCV

import heapq

from collections import defaultdict
from operator import itemgetter

In [31]:
# 학습 옵션 설정
param_grid = {
    'n_epochs': [10,20],
    'lr_all': [0.005, 0.010],
    'n_factors' : [50,100]
}

In [7]:
# 3-fold
gs= GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

NameError: name 'GridSearchCV' is not defined

In [24]:
reader = Reader(line_format='UserId ProductId Rating Timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file("amazon_ratings.csv", reader=reader)

ValueError: line_format parameter is incorrect.