In [1]:
import pandas as pd
import numpy as np
import os
import glob
import gc
from itertools import islice
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import SVD, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection.split import train_test_split


# 데이터 준비
## 제품 데이터 준비

In [38]:
bedding_df = pd.read_csv('./data_storage/Bedding.csv')
bedding_df['Category'] = 'Bedding'

kids_df = pd.read_csv('./data_storage/Kids_Home_Store.csv')
kids_df['Category'] = 'Kids_Home_Store'

kitchen_df = pd.read_csv('./data_storage/Kitchen_Dining.csv')
kitchen_df['Category'] = 'Kitchen_Dining'

seasonal_df = pd.read_csv('./data_storage/Seasonal_Decor.csv')
seasonal_df['Category'] = 'Seasonal_Decor'

item_df = pd.concat([bedding_df, kids_df, kitchen_df, seasonal_df])

# print('Item Dataframe')
display(item_df)

Unnamed: 0,Name,Asin,Price,Link,Img,Category
0,Sealy Baby - Stain Protection Waterproof Fitte...,B001FCK32C,$15.60,https://www.amazon.com/Sealy-Protection-Waterp...,,Bedding
1,"SafeRest Mattress Protector – Queen, Premium, ...",B003PWNH4Q,$37.99,https://www.amazon.com/SafeRest-Hypoallergenic...,,Bedding
2,SafeRest Premium Hypoallergenic Bed Bug Proof ...,B003Q0U740,$11.11,https://www.amazon.com/SafeRest-Hypoallergenic...,,Bedding
3,SafeRest Zippered Mattress Protector - Premium...,B004BAEF7E,$37.71,https://www.amazon.com/SafeRest-Premium-Zipper...,,Bedding
4,Bed Band Not Made in China. 100% USA Worker As...,B004I3VDWY,$12.99,https://www.amazon.com/Bed-Band-Assembled-Susp...,,Bedding
...,...,...,...,...,...,...
595,"Neon Light Strip, Flexible Neon Rope Light 12v...",B09YYD728H,$29.99,https://www.amazon.com/Flexible-Freedom-Contro...,,Seasonal_Decor
596,"Ostritec Solar Outdoor Watering Can Lights, Wa...",B09ZHLMTXH,$29.99,https://www.amazon.com/Ostritec-Watering-Water...,,Seasonal_Decor
597,"ZUUKOO LIGHT Outdoor String Lights, RGB 48ft P...",B0B17FTDVV,$44.98,https://www.amazon.com/ZUUKOO-LIGHT-Waterproof...,,Seasonal_Decor
598,Happy Juneteenth Banner，Africa American Indepe...,B0B1PM4WXP,$11.98,https://www.amazon.com/Juneteenth-Banner%EF%BC...,,Seasonal_Decor


In [39]:
item_df.isnull().sum()

Name          0
Asin          0
Price         0
Link          0
Img         905
Category      0
dtype: int64

## 리뷰 데이터 준비
### 리뷰 데이터 합치기

In [40]:
review_category_list = ['bedding', 'kids_home_store', 'kitchen_dining', 'seasonal_decor']

for category in review_category_list:
    files_joined = os.path.join('./reviews/{}'.format(category), '*.csv')
    list_files = glob.glob(files_joined)




In [41]:
files_joined

'./reviews/bedding\\*.csv'

In [None]:
print("** multiple csv files -> a merged dataFrame **")

# join된 모든 파일 merge
rating_df = pd.concat(map(pd.read_csv, list_files), ignore_index=True).drop('Unnamed: 0', axis=1)

display(rating_df)

# 리뷰 데이터 정제

In [85]:
# rating_df = rating_df[['ID', 'Asin', 'Rating']]

# print("ID: 사용자 id","\nAsin: Amazon Standard Identification Number", "\nRating: 사용자 별점")

# display(rating_df)

ID: 사용자 id 
Asin: Amazon Standard Identification Number 
Rating: 사용자 별점


Unnamed: 0,ID,Asin,Rating
0,Chelsea Edwards,B001FCK32C,5
1,Atalanta,B001FCK32C,3
2,Marcos Chavez,B001FCK32C,5
3,Bonnie Barnette,B001FCK32C,5
4,Jess Ramos,B001FCK32C,5
...,...,...,...
40063,Joyce Byrne,B0B12WVHWG,3
40064,Carolyn Ellis,B0B12WVHWG,2
40065,Kellie Dragoo,B0B12WVHWG,1
40066,SARA,B0B12WVHWG,5


In [11]:
rating_df.shape

(40068, 6)

In [12]:
rating_df.isnull().sum()

Asin        0
ID          2
Date        0
Rating      0
Title       8
Body      155
dtype: int64

In [13]:
rating_df = rating_df.dropna(subset=['ID'])
rating_df.shape

(40066, 6)

## 유저 ID에 고유한 number 부여

In [14]:
id_list = list(set(rating_df['ID']))

In [15]:
id_number_dict = {}

for idx, id in enumerate(id_list):
    id_number_dict[id] = idx

In [16]:
print("id_number_dict 10개만 확인")
display(dict(islice(id_number_dict.items(), 10)))

id_number_dict 10개만 확인


{'clb': 0,
 'Anton Krilloff': 1,
 'A Jackson': 2,
 'Hope90': 3,
 'Mary P. Boutte': 4,
 'David Roach': 5,
 'A. Corona': 6,
 'MelanieB': 7,
 'Nashwin': 8,
 'Amy Bettasso': 9}

In [17]:
id_number_list = []

for id in rating_df['ID']:
    id_number_list.append(id_number_dict.get(id))

In [18]:
print("id_number_list 10개만 확인")
display(id_number_list[:10])

id_number_list 10개만 확인


[8804, 24100, 7837, 5142, 24675, 7340, 25544, 20762, 12197, 23132]

In [19]:
rating_df['User_Number'] = id_number_list

display(rating_df.head())

print('** rating_df info **')
print(rating_df.info())

Unnamed: 0,Asin,ID,Date,Rating,Title,Body,User_Number
0,B001FCK32C,Chelsea Edwards,5162022,5,Works perfect!,Exactly what I needed! Works perfect! Mattress...,8804
1,B001FCK32C,Atalanta,5102022,3,Fits well but not ideal,We bought an Ikea crib and Ikea has been out o...,24100
2,B001FCK32C,Marcos Chavez,482022,5,Nice & Comfy Protection,Fits perfect for toddler mattress to add comfo...,7837
3,B001FCK32C,Bonnie Barnette,3132022,5,Good protection for crib matress,Great protection for crib matress,5142
4,B001FCK32C,Jess Ramos,3102022,5,Must Have,My son spits up quiet often and this helps to ...,24675


** rating_df info **
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40066 entries, 0 to 40067
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Asin         40066 non-null  object
 1   ID           40066 non-null  object
 2   Date         40066 non-null  object
 3   Rating       40066 non-null  object
 4   Title        40058 non-null  object
 5   Body         39911 non-null  object
 6   User_Number  40066 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 2.4+ MB
None


## Asin에 고유한 number 부여

In [20]:
asin_list = list(set(item_df['Asin']))

In [21]:
asin_number_dict = {}

for idx, asin in enumerate(asin_list):
    asin_number_dict[asin] = idx

In [22]:
print("asin_number_dict 10개만 확인")
display(dict(islice(asin_number_dict.items(), 10)))

asin_number_dict 10개만 확인


{'B01L2XGRSC': 0,
 'B078R3NDPH': 1,
 'B07TSN5YRM': 2,
 'B01LBMPWVC': 3,
 'B081V5MV2K': 4,
 'B098Q2J57Q': 5,
 'B073DK9B83': 6,
 'B011U1LIK8': 7,
 'B07S242M8D': 8,
 'B01ARJ7SBM': 9}

In [23]:
asin_number_list = []

for asin in rating_df['Asin']:
    asin_number_list.append(asin_number_dict.get(asin))

In [111]:
rating_df['Asin_Number'] = asin_number_list

In [116]:
print('** rating_df Dataframe **')
display(rating_df.head())

print('\n** rating_df Information **')
display(rating_df.info())

** rating_df Dataframe **


Unnamed: 0,ID,Asin,Rating,User_Number,Asin_Number
0,Chelsea Edwards,B001FCK32C,5,7883,372
1,Atalanta,B001FCK32C,3,979,372
2,Marcos Chavez,B001FCK32C,5,24698,372
3,Bonnie Barnette,B001FCK32C,5,555,372
4,Jess Ramos,B001FCK32C,5,3676,372



** rating_df Information **
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40066 entries, 0 to 40067
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           40066 non-null  object
 1   Asin         40066 non-null  object
 2   Rating       40066 non-null  object
 3   User_Number  40066 non-null  int64 
 4   Asin_Number  40066 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.8+ MB


None

In [118]:
asin_number_list2 = []

for i in item_df['Asin']:
    asin_number_list2.append(asin_number_dict.get(i))
    
item_df['Asin_Number'] = asin_number_list2

print('** item_df Dataframe **')
display(item_df.head())

print('\n** item_df Information **')
display(item_df.info())

** item_df Dataframe **


Unnamed: 0,Asin,Category,Asin_Number
0,B001FCK32C,bedding,372
1,B003PWNH4Q,bedding,234
2,B003Q0U740,bedding,100
3,B004BAEF7E,bedding,388
4,B004I3VDWY,bedding,290



** item_df Information **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Asin         435 non-null    object
 1   Category     435 non-null    object
 2   Asin_Number  435 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 10.3+ KB


None

# 추천 알고리즘

## 추천 제품 개수

In [122]:
topK = 5

### 인기 기반 구현 Popular Based
- 가중 등급 시스템(weighted rating)
- WR = ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

- R: 아이템의 평균 rating

- v: 아이템에 투표한 수

- m: 인기 항목에 나열되는 데 필요한 최소 투표수

- C: 전체 데이터셋을 통해서 얻은 평균 rating

In [126]:
def weighted_rating(v,m,R,C):
    return ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

def assignPopularBasedScore(ratingDf, itemDf, userCol, itemCol, ratingCol):
    # pre processing
    voteCount = (
        ratingDf
        .groupby(itemCol,as_index=False)
        .agg( {userCol:'count', ratingCol:'mean'} )
        )
    voteCount.columns = [itemCol, 'voteCount', 'avg_rating']
    
    # calcuate input parameters
    C = np.mean(voteCount['avg_rating'])
    m = np.percentile(voteCount['voteCount'], 70)
    voteCount = voteCount[voteCount['voteCount'] >= m]
    R = voteCount['avg_rating']
    v = voteCount['voteCount']
    voteCount['weightedRating'] = weightedRating(v,m,R,C)
    
    # post processing
    voteCount = voteCount.merge(itemDf, on = [itemCol], how = 'left')
    popular_items = voteCount.loc[:,[itemCol, 'Category', 'voteCount', 'avg_rating', 'weightedRating']]
    
    return popular_items

# calcualte popularity based
pop_items = assignPopularBasedScore(rating_df, item_df, 'ID', 'Asin', 'Rating')
pop_items = pop_items.sort_values('weightedRating', ascending = False)

pop_items.reset_index(drop=True).head(10)

Unnamed: 0,Asin,Category,voteCount,avg_rating,weightedRating
0,B01LWQMM4V,bedding,100,4.92,4.441679
1,B09MDHDYYR,bedding,100,4.86,4.411679
2,B095PR8MM4,bedding,100,4.83,4.396679
3,B08H4KJPV7,bedding,100,4.82,4.391679
4,B07MJPNZWW,bedding,100,4.79,4.376679
5,B09Z2DJCML,bedding,100,4.75,4.356679
6,B08DDJ7BKP,bedding,100,4.73,4.346679
7,B07JG459R7,bedding,100,4.72,4.341679
8,B07WDFTL9F,bedding,100,4.7,4.331679
9,B01KG84CLI,bedding,100,4.69,4.326679


# Content-based
- 제품과 카테고리 기반 코사인 유사도를 계산해서 추천
- 사용자가 원하는 content에 대해서 벡터화 -> 벡터화된 데이터로 유사도만 계산해주면 사용 가능

- 근데 이거는 어차페 카테고리가 다 같아서 쓸모 없을 듯

In [128]:
item_df.head()

Unnamed: 0,Asin,Category,Asin_Number
0,B001FCK32C,bedding,372
1,B003PWNH4Q,bedding,234
2,B003Q0U740,bedding,100
3,B004BAEF7E,bedding,388
4,B004I3VDWY,bedding,290


In [129]:
rating_df.head()

Unnamed: 0,ID,Asin,Rating,User_Number,Asin_Number
0,Chelsea Edwards,B001FCK32C,5,7883,372
1,Atalanta,B001FCK32C,3,979,372
2,Marcos Chavez,B001FCK32C,5,24698,372
3,Bonnie Barnette,B001FCK32C,5,555,372
4,Jess Ramos,B001FCK32C,5,3676,372


In [131]:
def topKItems(itemId, topK, corrMat, mapName):
    
    # 상관계수 정렬 &nd topK 제품 선택
    topItems = corrMat[itemId,:].argsort()[-topK:][::-1] 
    topItems = [mapName[e] for e in topItems] 

    return topItems

# preprocessing
ratedItems = item_df.loc[item_df['Asin'].isin(rating_df['Asin'])].copy()

# extract the Category
Category = ratedItems['Category'].str.split(",", expand=True)

# get all possible Category
allCategory = set()
for c in Category.columns:
    distinctCategory = Category[c].str.lower().str.strip().unique()
    allCategory.update(distinctCategory)

# create item-Category matrix
itemCategoryMat = ratedItems[['Asin', 'Category']].copy()
itemCategoryMat['Category'] = itemCategoryMat['Category'].str.lower().str.strip()

# OHE the genres column
for Category in allCategory:
    itemCategoryMat[Category] = np.where(itemCategoryMat['Category'].str.contains(Category), 1, 0)
itemCategoryMat = itemCategoryMat.drop(['Category'], axis=1)
itemCategoryMat = itemCategoryMat.set_index('Asin')

# compute similarity matix
corrMat = cosine_similarity(itemCategoryMat)


# get topK similar items
ind2name = {ind:name for ind,name in enumerate(itemCategoryMat.index)}
name2ind = {v:k for k,v in ind2name.items()}
similarItems = topKItems(name2ind['B001FCK32C'],
                            topK = topK,
                            corrMat = corrMat,
                            mapName = ind2name)

# display result
print(f"The {topK} similar product to Asin B001FCK32C")
display(item_df.loc[item_df['Asin'].isin(similarItems)])

del corrMat
gc.collect()

The 5 similar product to Asin B001FCK32C


Unnamed: 0,Asin,Category,Asin_Number
136,B07BHPXVM2,bedding,225
137,B07BYMG5TV,bedding,346
138,B07C8J7766,bedding,4
147,B07F1H29FG,bedding,144
434,B0B12WVHWG,bedding,29


101

# Collaborative filtering
- 아이템 x 유저 행렬 만들어서 진행

##  Memory-based

In [132]:
def topKItems(itemId, topK, corrMat, mapName):
    
    # 상관계수 정렬 &nd topK 제품 선택
    topItems = corrMat[itemId,:].argsort()[-topK:][::-1] 
    topItems = [mapName[e] for e in topItems] 

    return topItems

In [139]:
item_df.head()

Unnamed: 0,Asin,Category,Asin_Number
0,B001FCK32C,bedding,372
1,B003PWNH4Q,bedding,234
2,B003Q0U740,bedding,100
3,B004BAEF7E,bedding,388
4,B004I3VDWY,bedding,290


In [140]:
rating_df.head()

Unnamed: 0,ID,Asin,Rating,User_Number,Asin_Number
0,Chelsea Edwards,B001FCK32C,5,7883,372
1,Atalanta,B001FCK32C,3,979,372
2,Marcos Chavez,B001FCK32C,5,24698,372
3,Bonnie Barnette,B001FCK32C,5,555,372
4,Jess Ramos,B001FCK32C,5,3676,372


In [142]:
# preprocess data
row = rating_df['User_Number']
col = rating_df['Asin_Number']
data = rating_df['Rating']

NUM_USERS = len(rating_df)
NUM_ITEMS = len(item_df)

# init user-item matrix
mat = csr_matrix((data, (row, col)), shape=(NUM_USERS, NUM_ITEMS), dtype=int)
mat.eliminate_zeros()

# calculate sparsity
sparsity = float(len(mat.nonzero()[0]))
sparsity /= (mat.shape[0] * mat.shape[1])
sparsity *= 100
print(f'Sparsity: {sparsity:4.2f}%. This means that {sparsity:4.2f}% of the user-item ratings have a value.')

# compute similarity
item_corr_mat = cosine_similarity(mat.T)

# get top k item
print(f"The {topK} similar product to Asin B001FCK32C")


similar_items = topKItems(name2ind['B001FCK32C'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

Sparsity: 0.22%. This means that 0.22% of the user-item ratings have a value.
The 5 similar product to Asin B001FCK32C


In [147]:
display(item_df.loc[item_df['Asin'].isin(similar_items)])

Unnamed: 0,Asin,Category,Asin_Number
0,B001FCK32C,bedding,372
103,B072FC3DWR,bedding,109
252,B07VFNJL3S,bedding,112
280,B082XKFYLJ,bedding,265
285,B083LMRXJC,bedding,172


## Model based
### Matrix Factorization (MF)
#### TruncatedSVD (Sklearn)

<font color=red><U>**이거 돌릴때마다 값이 다름 이유 모름 그리고 가끔 에러남**</U></font>

In [148]:
epsilon = 1e-9
n_latent_factors = 10

# calculate item latent matrix
item_svd = TruncatedSVD(n_components = n_latent_factors)
item_features = item_svd.fit_transform(mat.transpose()) + epsilon

# calculate user latent matrix
user_svd = TruncatedSVD(n_components = n_latent_factors)
user_features = user_svd.fit_transform(mat) + epsilon

# compute similarity
item_corr_mat = cosine_similarity(item_features)

# get top k item
print(f"The {topK} similar product to Asin B001FCK32C")
similar_items = topKItems(name2ind['B001FCK32C'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

display(item_df.loc[item_df['Asin'].isin(similar_items)])

del user_features
gc.collect();

The 5 similar product to Asin B001FCK32C


Unnamed: 0,Asin,Category,Asin_Number
0,B001FCK32C,bedding,372
360,B092VP8QSB,bedding,164
380,B097QSKS4Z,bedding,12
414,B09QCM8453,bedding,163
424,B09TSG7F4R,bedding,216


#### Funk MF (Surprise)

In [151]:
funk_rating_df = rating_df[['ID','Asin', 'Rating']]

In [152]:
def pred2dict(predictions, top_k=None):
    
    rec_dict = defaultdict(list)
    for user_id, item_id, actual_rating, pred_rating, _ in predictions:
        rec_dict[user_id].append((item_id, pred_rating))        
        
    return rec_dict

def get_top_k_recommendation(rec_dict, user_id, top_k, ind2name):
    
    pred_ratings = rec_dict[user_id]
    # sort descendingly by pred_rating
    pred_ratings = sorted(pred_ratings, key=lambda x: x[1], reverse=True)
    pred_ratings = pred_ratings[:top_k]
    recs = [ind2name[e[0]] for e in pred_ratings]
    
    return recs

# prepare train and test sets
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(funk_rating_df, reader)
train, test = train_test_split(data, test_size=.2, random_state=42)

# init and fit the funk mf model
algo = SVD(random_state = 42)
algo.fit(train)
pred = algo.test(test)

# evaluation the test set
accuracy.rmse(pred)

# extract the item features from algo
item_corr_mat = cosine_similarity(algo.qi)

print(f"The {topK} similar product to Asin B001FCK32C")
similar_items = topKItems(name2ind['B001FCK32C'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

display(item_df.loc[item_df['Asin'].isin(similar_items)])

del item_corr_mat
gc.collect()

RMSE: 1.4791
The 5 similar product to Asin B001FCK32C


Unnamed: 0,Asin,Category,Asin_Number
0,B001FCK32C,bedding,372
105,B072XJTV7V,bedding,25
125,B078R3NDPH,bedding,319
179,B07KQBX3HY,bedding,128
212,B07PH1HLFN,bedding,73


53