In [270]:
import pandas as pd
import numpy as np
import os
import glob
import gc
from itertools import islice
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import SVD, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection.split import train_test_split

# 데이터 준비
## 제품 데이터 준비

In [271]:
category_list = ['bath', 'bedding', 'cleaning_supplies', 
'furniture', 'heating_cooling_airquality', 'home_storage_organization', 
'irons_steamers', 'kids_home_store', 'kitchen_dining', 
'lighting_ceiling_fans', 'party_supplies', 'seasonal_decor', 
'vacuums_floor_care', 'wall_art']

In [272]:
item_df = pd.read_csv('item_meta.csv')

In [273]:
item_df.shape

(4662, 6)

In [274]:
item_df.isnull().sum()

name           0
asin           0
price          0
buylink        0
Imglink     1204
category       0
dtype: int64

In [275]:
item_df['asin'].unique().__len__()

4469

In [276]:
# item_df = item_df.drop_duplicates(subset='asin')

### Asin에 고유한 number 부여

In [277]:
asin_list = list(set(item_df['asin']))

In [278]:
asin_number_dict = {}

for idx, asin in enumerate(asin_list):
    asin_number_dict[asin] = idx

In [279]:
asin_number_list = []

for asin in item_df['asin']:
    asin_number_list.append(asin_number_dict.get(asin))

In [280]:
item_df['asin_number'] = asin_number_list
display(item_df)

print('** rating_df info **')
print(item_df.info())

Unnamed: 0,name,asin,price,buylink,Imglink,category,asin_number
0,Umbra Mini Waste Can 1-1/2 Gallon with Swing L...,B0000V09E6,14.95,https://www.amazon.com/Umbra-2-Gallon-Swing-To...,https://m.media-amazon.com/images/I/61W7yT2gU5...,bath,2357
1,"iDesign Plastic Soap Saver, Holder Tray for Ba...",B000DZFA66,4.48,https://www.amazon.com/Plastic-Soap-Holder-Bat...,https://m.media-amazon.com/images/I/51EM3UIQ4o...,bath,3699
2,"Danco 88821 2-3/4-Inch Tub Mesh Strainer, Stai...",B000DZGJX4,6.93,https://www.amazon.com/DANCO-Strainer-Stainles...,https://m.media-amazon.com/images/I/71Cv4LE0BW...,bath,3011
3,Better Living Products 76335-1 AVIVA 3 Chamber...,B000FGCW0A,40.64,https://www.amazon.com/Better-Living-Products-...,https://m.media-amazon.com/images/I/71yBzZ7gAw...,bath,2448
4,"Better Living Products, White 76354 Euro Serie...",B000FGCW0K,27.99,https://www.amazon.com/Better-Living-76354-3-C...,https://m.media-amazon.com/images/I/61rP3ZhNw1...,bath,214
...,...,...,...,...,...,...,...
4657,BTS Proof Anthology Album Compact Edition Cont...,B09ZKF44MH,28.69,https://www.amazon.com/Anthology-Compact-Conte...,https://m.media-amazon.com/images/I/31iWwSzQb3...,wall_art,476
4658,"Boho Wall Art Prints Set of 6, Mid-Century Mod...",B09ZNTCQXX,14.99,https://www.amazon.com/Mid-Century-Geometric-M...,https://m.media-amazon.com/images/I/714hr8IVuV...,wall_art,923
4659,Marcus Aurelius Poster Framed Canvas Inspirati...,B09ZV9Q3P1,28.85,https://www.amazon.com/Marcus-Aurelius-Inspira...,https://m.media-amazon.com/images/I/611Yzi97Bk...,wall_art,495
4660,Theodore Roosevelt Poster Canvas The Man In Th...,B09ZVCQ9GS,48.88,https://www.amazon.com/Theodore-Roosevelt-Post...,https://m.media-amazon.com/images/I/713bzBlkaZ...,wall_art,888


** rating_df info **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4662 entries, 0 to 4661
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         4662 non-null   object 
 1   asin         4662 non-null   object 
 2   price        4662 non-null   float64
 3   buylink      4662 non-null   object 
 4   Imglink      3458 non-null   object 
 5   category     4662 non-null   object 
 6   asin_number  4662 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 255.1+ KB
None


In [281]:
item_df = item_df[['asin', 'category', 'asin_number']]

## 리뷰 데이터 준비
### 리뷰 데이터 합치기

In [282]:
rating_df = pd.read_csv('review_meta.csv')

In [283]:
rating_df.shape

(356733, 6)

In [284]:
rating_df.isnull().sum()

asin              0
name              1
review_date       0
rating            0
review_title      2
review_content    0
dtype: int64

In [285]:
rating_df = rating_df.dropna(subset=['name'])
rating_df = rating_df.dropna(subset=['review_title'])
rating_df.shape

(356730, 6)

### 유저 name에 고유한 number 부여

In [286]:
name_list = rating_df['name'].unique()

In [287]:
name_number_dict = {}

for idx, name in enumerate(name_list):
    name_number_dict[name] = idx

In [288]:
name_number_list = []

for name in rating_df['name']:
    name_number_list.append(name_number_dict.get(name))

In [289]:
rating_df['user_number'] = name_number_list

print('** rating_df **')
display(rating_df.head())

print('** rating_df info **')
print(rating_df.info())

** rating_df **


Unnamed: 0,asin,name,review_date,rating,review_title,review_content,user_number
0,B00004C8S8,Justin B,2019-07-15,3,Wonderfully designed product- if it worked...,This is a wonderful product in all ways except...,0
1,B00004C8S8,Jodi L. Williams,2021-06-21,5,Fourteen years! That's how long they last.,Someone bought me a Safety First infant safety...,1
2,B00004C8S8,jessica jane,2017-04-18,5,Best baby nail clippers,Save yourself time researching all the options...,2
3,B00004C8S8,Cowboy,2017-06-01,3,Pointless plastic attachment.,These are sharp and provide a good clean cut f...,3
4,B00004C8S8,GrannyEm,2017-01-02,5,Granny approved baby nail clipper,Nice and durable but gentle for baby… I don’t ...,4


** rating_df info **
<class 'pandas.core.frame.DataFrame'>
Int64Index: 356730 entries, 0 to 356732
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   asin            356730 non-null  object
 1   name            356730 non-null  object
 2   review_date     356730 non-null  object
 3   rating          356730 non-null  int64 
 4   review_title    356730 non-null  object
 5   review_content  356730 non-null  object
 6   user_number     356730 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 21.8+ MB
None


### rating_df 에서 Asin에 고유한 number 부여

In [290]:
asin_number_list2 = []

for asin in rating_df['asin']:
    asin_number_list2.append(asin_number_dict.get(asin))
    
rating_df['asin_number'] = asin_number_list2

In [291]:
rating_df.isnull().sum()

asin                  0
name                  0
review_date           0
rating                0
review_title          0
review_content        0
user_number           0
asin_number       41718
dtype: int64

In [292]:
rating_df = rating_df.dropna(subset=['asin_number'])

In [294]:
rating_df = rating_df[['name', 'asin', 'rating', 'user_number', 'asin_number']]

In [295]:
print('** rating_df Dataframe **')
display(rating_df)

print('\n** rating_df Information **')
display(rating_df.info())

** rating_df Dataframe **


Unnamed: 0,name,asin,rating,user_number,asin_number
0,Justin B,B00004C8S8,3,0,1850.0
1,Jodi L. Williams,B00004C8S8,5,1,1850.0
2,jessica jane,B00004C8S8,5,2,1850.0
3,Cowboy,B00004C8S8,3,3,1850.0
4,GrannyEm,B00004C8S8,5,4,1850.0
...,...,...,...,...,...
356728,Amazon Customer,B0B3W1HRYF,5,31,2758.0
356729,DR,B0B3W1HRYF,4,6030,2758.0
356730,jesus liendo,B0B3W1HRYF,5,32014,2758.0
356731,Amazon Customer,B0B3W1HRYF,5,31,2758.0



** rating_df Information **
<class 'pandas.core.frame.DataFrame'>
Int64Index: 315012 entries, 0 to 356732
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   name         315012 non-null  object 
 1   asin         315012 non-null  object 
 2   rating       315012 non-null  int64  
 3   user_number  315012 non-null  int64  
 4   asin_number  315012 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 14.4+ MB


None

In [296]:
rating_df = rating_df.astype({'asin_number': int})

In [297]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 315012 entries, 0 to 356732
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   name         315012 non-null  object
 1   asin         315012 non-null  object
 2   rating       315012 non-null  int64 
 3   user_number  315012 non-null  int64 
 4   asin_number  315012 non-null  int32 
dtypes: int32(1), int64(2), object(2)
memory usage: 13.2+ MB


# 추천 알고리즘

## 추천 제품 개수

In [298]:
topK = 5

### 인기 기반 구현 Popular Based
- 가중 등급 시스템(weighted rating)
- WR = ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

- R: 아이템의 평균 rating

- v: 아이템에 투표한 수

- m: 인기 항목에 나열되는 데 필요한 최소 투표수

- C: 전체 데이터셋을 통해서 얻은 평균 rating

In [299]:
def weightedRating(v,m,R,C):
    return ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

def assignPopularBasedScore(rating_df, item_df, user_col, item_col, rating_col):
    # pre processing
    vote_count = (
        rating_df
        .groupby(item_col,as_index=False)
        .agg( {user_col:'count', rating_col:'mean'} )
        )
    vote_count.columns = [item_col, 'vote_count', 'avg_rating']
    
    # calcuate input parameters
    C = np.mean(vote_count['avg_rating'])
    m = np.percentile(vote_count['vote_count'], 70)
    vote_count = vote_count[vote_count['vote_count'] >= m]
    R = vote_count['avg_rating']
    v = vote_count['vote_count']
    vote_count['weighted_rating'] = weightedRating(v,m,R,C)
    
    # post processing
    vote_count = vote_count.merge(item_df, on = [item_col], how = 'left')
    popular_items = vote_count.loc[:,[item_col, 'category', 'vote_count', 'avg_rating', 'weighted_rating']]
    
    return popular_items


In [300]:
# calcualte popularity based
pop_items = assignPopularBasedScore(rating_df, item_df, 'name', 'asin', 'rating')
pop_items = pop_items.sort_values('weighted_rating', ascending = False)

pop_items.reset_index(drop=True).head(10)

Unnamed: 0,asin,category,vote_count,avg_rating,weighted_rating
0,B09VMR3VTW,heating_cooling_airquality,100,4.95,4.496195
1,B00PR82UC6,home_storage_organization,100,4.94,4.491195
2,B087V8JHSD,cleaning_supplies,100,4.94,4.491195
3,B07V4GHXZX,home_storage_organization,100,4.93,4.486195
4,B01LWQMM4V,kids_home_store,100,4.92,4.481195
5,B098SGV2FL,vacuums_floor_care,100,4.92,4.481195
6,B07DJ53GZQ,kitchen_dining,100,4.92,4.481195
7,B07MFRXD6Q,party_supplies,100,4.92,4.481195
8,B01LWQMM4V,bedding,100,4.92,4.481195
9,B07QQZS9M5,irons_steamers,100,4.91,4.476195


# Content-based
- 제품과 카테고리 기반 코사인 유사도를 계산해서 추천
- 사용자가 원하는 content에 대해서 벡터화 -> 벡터화된 데이터로 유사도만 계산해주면 사용 가능

- 근데 이거는 어차페 카테고리가 다 같아서 쓸모 없을 듯

In [304]:
item_df['asin'].isin(rating_df['asin'])

0        True
1        True
2        True
3        True
4        True
        ...  
4657     True
4658     True
4659    False
4660    False
4661    False
Name: asin, Length: 4662, dtype: bool

In [312]:
def topKItems(itemId, topK, corrMat, mapName):
    
    # 상관계수 정렬 &nd topK 제품 선택
    topItemsNums = corrMat[itemId,:].argsort()[-topK:][::-1]
    topItems = []
    for e in topItemsNums:
        try:
            item = mapName[e]
            topItems.append(item)
        except:
            pass
        
    return topItems

In [313]:
# preprocessing
ratedItems = item_df.loc[item_df['asin'].isin(rating_df['asin'])].copy()

# extract the Category
Category = ratedItems['category'].str.split(",", expand=True)

# get all possible Category
allCategory = set()
for c in Category.columns:
    distinctCategory = Category[c].str.lower().str.strip().unique()
    allCategory.update(distinctCategory)

# create item-Category matrix
itemCategoryMat = ratedItems[['asin', 'category']].copy()
itemCategoryMat['category'] = itemCategoryMat['category'].str.lower().str.strip()

# OHE the genres column
for Category in allCategory:
    itemCategoryMat[Category] = np.where(itemCategoryMat['category'].str.contains(Category), 1, 0)
itemCategoryMat = itemCategoryMat.drop(['category'], axis=1)
itemCategoryMat = itemCategoryMat.set_index('asin')

# compute similarity matix
corrMat = cosine_similarity(itemCategoryMat)


# get topK similar items
ind2name = {ind:name for ind,name in enumerate(itemCategoryMat.index)}
name2ind = {v:k for k,v in ind2name.items()}
similarItems = topKItems(name2ind['B00004C8S8'],
                            topK = topK,
                            corrMat = corrMat,
                            mapName = ind2name)

# display result
print(f"The {topK} similar product to Asin B00004C8S8")
display(item_df.loc[item_df['asin'].isin(similarItems)])

del corrMat
gc.collect()

The 5 similar product to Asin B00004C8S8


Unnamed: 0,asin,category,asin_number
399,B07K8QBMLY,bedding,1192
1225,B07KWRTRFF,furniture,4399
1323,B07KBSPCVX,furniture,890
2903,B07K8DD2MR,kids_home_store,3582
2904,B07K8QBMLY,kids_home_store,1192
2905,B07KBSPCVX,kids_home_store,890
2910,B07KWRTRFF,kids_home_store,4399
2965,B0811SP2P3,kids_home_store,681


101

# Collaborative filtering
- 아이템 x 유저 행렬 만들어서 진행

##  Memory-based

In [314]:
# preprocess data
row = rating_df['user_number']
col = rating_df['asin_number']
data = rating_df['rating']

NUM_USERS = len(rating_df)
NUM_ITEMS = len(item_df)

# init user-item matrix
mat = csr_matrix((data, (row, col)), shape=(NUM_USERS, NUM_ITEMS), dtype=int)
mat.eliminate_zeros()

# calculate sparsity
sparsity = float(len(mat.nonzero()[0]))
sparsity /= (mat.shape[0] * mat.shape[1])
sparsity *= 100
print(f'Sparsity: {sparsity:4.2f}%. This means that {sparsity:4.2f}% of the user-item ratings have a value.')

# compute similarity
item_corr_mat = cosine_similarity(mat.T)

# get top k item
print(f"The {topK} similar product to Asin B00004C8S8")

similar_items = topKItems(name2ind['B000DZGJX4'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

display(item_df.loc[item_df['asin'].isin(similar_items)])

Sparsity: 0.02%. This means that 0.02% of the user-item ratings have a value.
The 5 similar product to Asin B00004C8S8


Unnamed: 0,asin,category,asin_number
1602,B07VHP682L,heating_cooling_airquality,661
1635,B082NX8GTF,heating_cooling_airquality,657
1637,B083J36134,heating_cooling_airquality,4208
1639,B083Q2CPXB,heating_cooling_airquality,2257


In [316]:
# 에러 확인

# key_list = list(name2ind.keys())
# for idx, key in enumerate(key_list):
#     try:
#         topKItems(name2ind[key], topK = topK, corrMat = item_corr_mat, mapName = ind2name)
#     except:
#         print(idx, 'error')

## Model based
### Matrix Factorization (MF)
#### TruncatedSVD (Sklearn)

<font color=red><U>**이거 돌릴때마다 값이 다름 이유 모름 그리고 가끔 에러남**</U></font>

In [317]:
epsilon = 1e-9
n_latent_factors = 10

# calculate item latent matrix
item_svd = TruncatedSVD(n_components = n_latent_factors)
item_features = item_svd.fit_transform(mat.transpose()) + epsilon

# calculate user latent matrix
user_svd = TruncatedSVD(n_components = n_latent_factors)
user_features = user_svd.fit_transform(mat) + epsilon

# compute similarity
item_corr_mat = cosine_similarity(item_features)

# get top k item
print(f"The {topK} similar product to Asin B00004C8S8")
similar_items = topKItems(name2ind['B000DZGJX4'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

display(item_df.loc[item_df['asin'].isin(similar_items)])

del user_features
gc.collect()

The 5 similar product to Asin B00004C8S8


Unnamed: 0,asin,category,asin_number
1272,B08L1M6VLD,furniture,1783
1359,B01B2A969C,furniture,2103
2837,B01B2A969C,kids_home_store,2103
3004,B08CTJ4TJY,kids_home_store,1924
3400,B09PTY7YP9,kitchen_dining,4361
4252,B07RDBG21C,wall_art,465


7

#### Funk MF (Surprise)

In [318]:
funk_rating_df = rating_df[['name', 'asin', 'rating']]

In [319]:
def pred2dict(predictions, top_k=None):
    
    rec_dict = defaultdict(list)
    for user_id, item_id, actual_rating, pred_rating, _ in predictions:
        rec_dict[user_id].append((item_id, pred_rating))        
        
    return rec_dict

def get_top_k_recommendation(rec_dict, user_id, top_k, ind2name):
    
    pred_ratings = rec_dict[user_id]
    # sort descendingly by pred_rating
    pred_ratings = sorted(pred_ratings, key=lambda x: x[1], reverse=True)
    pred_ratings = pred_ratings[:top_k]
    recs = [ind2name[e[0]] for e in pred_ratings]
    
    return recs

In [320]:
# prepare train and test sets
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(funk_rating_df, reader)
train, test = train_test_split(data, test_size=.2, random_state=42)

# init and fit the funk mf model
algo = SVD(random_state = 42)
algo.fit(train)
pred = algo.test(test)

# evaluation the test set
accuracy.rmse(pred)

# extract the item features from algo
item_corr_mat = cosine_similarity(algo.qi)

print(f"The {topK} similar product to Asin B00004C8S8")
similar_items = topKItems(name2ind['B000DZGJX4'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

display(item_df.loc[item_df['asin'].isin(similar_items)])

del item_corr_mat
gc.collect()

RMSE: 1.4274
The 5 similar product to Asin B00004C8S8


Unnamed: 0,asin,category,asin_number
2,B000DZGJX4,bath,3011
493,B07ZCS8LJS,bedding,179
3498,B09VGKJY2D,party_supplies,2470
3744,B06XS47ZS5,party_supplies,3831
4151,B00AAHXUO6,vacuums_floor_care,1543


0