In [1]:
import os
import pandas as pd
import glob
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gc
from itertools import islice
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from surprise import SVD, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection.split import train_test_split


# 데이터 준비
## 제품 데이터 준비

In [2]:
items = pd.read_csv('E://Amazon review data (2018)/100. amazon product data/cleaningSupplies.csv')
items['Category'] = 'cleaningSupplies'
items = items[['Asin','Category']]

print('Item Dataframe')
display(items)

Item Dataframe


Unnamed: 0,Asin,Category
0,B00004OCIZ,cleaningSupplies
1,B00004OCLJ,cleaningSupplies
2,B00094EWE2,cleaningSupplies
3,B000CC91GK,cleaningSupplies
4,B000V6YLNA,cleaningSupplies
...,...,...
358,B09X9RY3DG,cleaningSupplies
359,B09Y8RYLC4,cleaningSupplies
360,B09YHNYFBP,cleaningSupplies
361,B0B18KJF62,cleaningSupplies


## 리뷰 데이터 준비
### 리뷰 데이터 합치기

In [3]:
files_joined = os.path.join('E://Amazon review data (2018)/100. amazon review data/cleaning_supplies/','*.csv')

# join된 모든 파일 리스트 반환
list_files = glob.glob(files_joined)

print("** multiple csv files -> a merged dataFrame **")
# join된 모든 파일 merge
ratings = pd.concat(map(pd.read_csv, list_files), ignore_index=True)
display(ratings)

** multiple csv files -> a merged dataFrame **


Unnamed: 0.1,Unnamed: 0,Asin,ID,Date,Rating,Title,Body
0,0,B00004OCIZ,J. L.,6202022,1,Bristles Flatten Quickly,Bristles flattened after a month or two of typ...
1,1,B00004OCIZ,Stephen Bolster,6182022,3,I hoped this worked better for the price,The bristles are already deforming after a mo...
2,2,B00004OCIZ,Amazon Customer,6182022,2,Weak bristles.,Bristles are too flimsy to scrub with. Barely...
3,3,B00004OCIZ,Tabletopper,6172022,1,Bad for the environment,Not a good choice. Big hunk of plastic that wi...
4,4,B00004OCIZ,Eric,6172022,1,Bristles went flat in a week,"Bristles are not heat resistant, pan must be c..."
...,...,...,...,...,...,...,...
32757,23,B0B3HTNL38,nyla martens,6222020,5,"Right price, right product.",I keep a box of these in my car and wear them ...
32758,24,B0B3HTNL38,Amazon Customer,6142020,5,Good,Works just as expected
32759,25,B0B3HTNL38,peter souza,642020,5,Perfect for the use and needs we have.,One of the few vendors that had products avail...
32760,26,B0B3HTNL38,Micheal,622020,1,Don't waste your money/Company will harass you,"These are the worst gloves I have ever used, t..."


# 리뷰 데이터 정제

In [4]:
ratings = ratings[['ID', 'Asin', 'Rating']]

print("ID: 사용자 id","\nAsin: Amazon Standard Identification Number", "\nRating: 사용자 별점")

display(ratings)

ID: 사용자 id 
Asin: Amazon Standard Identification Number 
Rating: 사용자 별점


Unnamed: 0,ID,Asin,Rating
0,J. L.,B00004OCIZ,1
1,Stephen Bolster,B00004OCIZ,3
2,Amazon Customer,B00004OCIZ,2
3,Tabletopper,B00004OCIZ,1
4,Eric,B00004OCIZ,1
...,...,...,...
32757,nyla martens,B0B3HTNL38,5
32758,Amazon Customer,B0B3HTNL38,5
32759,peter souza,B0B3HTNL38,5
32760,Micheal,B0B3HTNL38,1


## 유저 Id에 고유한 number 부여

In [5]:
idList = list(set(ratings['ID']))
idNumber = {}
idx = 0

for i in idList:
    idNumber[i] = idx
    idx += 1

print("idNumber 10개만 확인")
display(dict(islice(idNumber.items(), 10)))

idNumber 10개만 확인


{nan: 0,
 'On-the-go Mom': 1,
 'Tabatha Boatwright': 2,
 'Aj': 3,
 'MargieMix': 4,
 'Zachary Fedak': 5,
 'kimberly': 6,
 'Robert T.': 7,
 'CeeJay': 8,
 'Katar': 9}

In [6]:
idNumberList = []
for i in ratings['ID']:
    idNumberList.append(idNumber.get(i))
    
print("idNumberList 10개만 확인")
display(idNumberList[:10])

idNumberList 10개만 확인


[20936, 20612, 953, 9190, 23952, 9678, 16652, 13559, 14732, 953]

In [7]:
ratings['user'] = idNumberList
display(ratings.head())

Unnamed: 0,ID,Asin,Rating,user
0,J. L.,B00004OCIZ,1,20936
1,Stephen Bolster,B00004OCIZ,3,20612
2,Amazon Customer,B00004OCIZ,2,953
3,Tabletopper,B00004OCIZ,1,9190
4,Eric,B00004OCIZ,1,23952


## Asin에 고유한 numbr 부여

In [8]:
asinList = list(set(items['Asin']))
asinNumber = {}
idx = 0

for i in asinList:
    asinNumber[i] = idx
    idx += 1
        
print("asinNumber 10개만 확인")
display(dict(islice(asinNumber.items(), 10)))

asinNumber 10개만 확인


{'B01ES6C8GA': 0,
 'B0895XZJHZ': 1,
 'B088NKJT4L': 2,
 'B07176372T': 3,
 'B006VABXBS': 4,
 'B01LWL7D3U': 5,
 'B07JCB5WRV': 6,
 'B00JEQG4X8': 7,
 'B07TC7B9RS': 8,
 'B09PC1W676': 9}

In [9]:
asinNumberList = []
for i in ratings['Asin']:
    asinNumberList.append(asinNumber.get(i))
    
ratings['AsinNumber'] = asinNumberList
print('ratings Dataframe')
display(ratings)

ratings Dataframe


Unnamed: 0,ID,Asin,Rating,user,AsinNumber
0,J. L.,B00004OCIZ,1,20936,308
1,Stephen Bolster,B00004OCIZ,3,20612,308
2,Amazon Customer,B00004OCIZ,2,953,308
3,Tabletopper,B00004OCIZ,1,9190,308
4,Eric,B00004OCIZ,1,23952,308
...,...,...,...,...,...
32757,nyla martens,B0B3HTNL38,5,1863,280
32758,Amazon Customer,B0B3HTNL38,5,953,280
32759,peter souza,B0B3HTNL38,5,20409,280
32760,Micheal,B0B3HTNL38,1,17278,280


In [10]:
ratings

Unnamed: 0,ID,Asin,Rating,user,AsinNumber
0,J. L.,B00004OCIZ,1,20936,308
1,Stephen Bolster,B00004OCIZ,3,20612,308
2,Amazon Customer,B00004OCIZ,2,953,308
3,Tabletopper,B00004OCIZ,1,9190,308
4,Eric,B00004OCIZ,1,23952,308
...,...,...,...,...,...
32757,nyla martens,B0B3HTNL38,5,1863,280
32758,Amazon Customer,B0B3HTNL38,5,953,280
32759,peter souza,B0B3HTNL38,5,20409,280
32760,Micheal,B0B3HTNL38,1,17278,280


In [11]:
asinNumberList = []
for i in items['Asin']:
    asinNumberList.append(asinNumber.get(i))
items['AsinNumber'] = asinNumberList
print('Item Dataframe')
display(items)

Item Dataframe


Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
1,B00004OCLJ,cleaningSupplies,128
2,B00094EWE2,cleaningSupplies,187
3,B000CC91GK,cleaningSupplies,250
4,B000V6YLNA,cleaningSupplies,86
...,...,...,...
358,B09X9RY3DG,cleaningSupplies,29
359,B09Y8RYLC4,cleaningSupplies,37
360,B09YHNYFBP,cleaningSupplies,255
361,B0B18KJF62,cleaningSupplies,358


# 추천 아이템 갯수

In [12]:
topK = 5

# 인기 기반 구현 Popular Based
- 가중 등급 시스템(weighted rating)
- WR = ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

- R: 아이템의 평균 rating

- v: 아이템에 투표한 수

- m: 인기 항목에 나열되는 데 필요한 최소 투표수

- C: 전체 데이터셋을 통해서 얻은 평균 rating

In [13]:
def weightedRating(v,m,R,C):

    return ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

def assignPopularBasedScore(ratingDf, itemDf, userCol, itemCol, ratingCol):
    # pre processing
    voteCount = (
        ratingDf
        .groupby(itemCol,as_index=False)
        .agg( {userCol:'count', ratingCol:'mean'} )
        )
    voteCount.columns = [itemCol, 'voteCount', 'avg_rating']
    
    # calcuate input parameters
    C = np.mean(voteCount['avg_rating'])
    m = np.percentile(voteCount['voteCount'], 70)
    voteCount = voteCount[voteCount['voteCount'] >= m]
    R = voteCount['avg_rating']
    v = voteCount['voteCount']
    voteCount['weightedRating'] = weightedRating(v,m,R,C)
    
    # post processing
    voteCount = voteCount.merge(itemDf, on = [itemCol], how = 'left')
    popular_items = voteCount.loc[:,[itemCol, 'Category', 'voteCount', 'avg_rating', 'weightedRating']]
    
    return popular_items

# calcualte popularity based
pop_items = assignPopularBasedScore(ratings, items, 'ID', 'Asin', 'Rating')
pop_items = pop_items.sort_values('weightedRating', ascending = False)

pop_items.head(10)

Unnamed: 0,Asin,Category,voteCount,avg_rating,weightedRating
219,B087V8JHSD,cleaningSupplies,100,4.94,4.461953
257,B08SMLNWX1,cleaningSupplies,100,4.89,4.436953
30,B00B7ALYMK,cleaningSupplies,100,4.86,4.421953
173,B07TNT37W4,cleaningSupplies,100,4.83,4.406953
154,B07PWRP7SZ,cleaningSupplies,100,4.81,4.396953
46,B00KAJ42GO,cleaningSupplies,100,4.81,4.396953
69,B01B5ECU3O,cleaningSupplies,100,4.81,4.396953
94,B0711HBWJF,cleaningSupplies,100,4.8,4.391953
99,B073WYBGP3,cleaningSupplies,100,4.77,4.376953
54,B010B2LK7S,cleaningSupplies,100,4.76,4.371953


# Content-based
- 제품과 카테고리 기반 코사인 유사도를 계산해서 추천
- 사용자가 원하는 content에 대해서 벡터화 -> 벡터화된 데이터로 유사도만 계산해주면 사용 가능

- 근데 이거는 어차페 카테고리가 다 같아서 쓸모 없을 듯

In [14]:
items

Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
1,B00004OCLJ,cleaningSupplies,128
2,B00094EWE2,cleaningSupplies,187
3,B000CC91GK,cleaningSupplies,250
4,B000V6YLNA,cleaningSupplies,86
...,...,...,...
358,B09X9RY3DG,cleaningSupplies,29
359,B09Y8RYLC4,cleaningSupplies,37
360,B09YHNYFBP,cleaningSupplies,255
361,B0B18KJF62,cleaningSupplies,358


In [15]:
items

Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
1,B00004OCLJ,cleaningSupplies,128
2,B00094EWE2,cleaningSupplies,187
3,B000CC91GK,cleaningSupplies,250
4,B000V6YLNA,cleaningSupplies,86
...,...,...,...
358,B09X9RY3DG,cleaningSupplies,29
359,B09Y8RYLC4,cleaningSupplies,37
360,B09YHNYFBP,cleaningSupplies,255
361,B0B18KJF62,cleaningSupplies,358


In [16]:
ratings

Unnamed: 0,ID,Asin,Rating,user,AsinNumber
0,J. L.,B00004OCIZ,1,20936,308
1,Stephen Bolster,B00004OCIZ,3,20612,308
2,Amazon Customer,B00004OCIZ,2,953,308
3,Tabletopper,B00004OCIZ,1,9190,308
4,Eric,B00004OCIZ,1,23952,308
...,...,...,...,...,...
32757,nyla martens,B0B3HTNL38,5,1863,280
32758,Amazon Customer,B0B3HTNL38,5,953,280
32759,peter souza,B0B3HTNL38,5,20409,280
32760,Micheal,B0B3HTNL38,1,17278,280


In [17]:
def topKItems(itemId, topK, corrMat, mapName):
    
    # 상관계수 정렬 &nd topK 제품 선택
    topItems = corrMat[itemId,:].argsort()[-topK:][::-1] 
    topItems = [mapName[e] for e in topItems] 

    return topItems

# preprocessing
ratedItems = items.loc[items['Asin'].isin(ratings['Asin'])].copy()

# extract the Category
Category = ratedItems['Category'].str.split(",", expand=True)

# get all possible Category
allCategory = set()
for c in Category.columns:
    distinctCategory = Category[c].str.lower().str.strip().unique()
    allCategory.update(distinctCategory)

# create item-Category matrix
itemCategoryMat = ratedItems[['Asin', 'Category']].copy()
itemCategoryMat['Category'] = itemCategoryMat['Category'].str.lower().str.strip()

# OHE the genres column
for Category in allCategory:
    itemCategoryMat[Category] = np.where(itemCategoryMat['Category'].str.contains(Category), 1, 0)
itemCategoryMat = itemCategoryMat.drop(['Category'], axis=1)
itemCategoryMat = itemCategoryMat.set_index('Asin')

# compute similarity matix
corrMat = cosine_similarity(itemCategoryMat)


# get topK similar items
ind2name = {ind:name for ind,name in enumerate(itemCategoryMat.index)}
name2ind = {v:k for k,v in ind2name.items()}
similarItems = topKItems(name2ind['B00004OCIZ'],
                            topK = topK,
                            corrMat = corrMat,
                            mapName = ind2name)

# display result
print(f"The {topK} similar product to Asin B00004OCIZ")
display(items.loc[items['Asin'].isin(similarItems)])

del corrMat
gc.collect();

The 5 similar product to Asin B00004OCIZ


Unnamed: 0,Asin,Category,AsinNumber
112,B07515TW8M,cleaningSupplies,49
113,B075FHF7TY,cleaningSupplies,318
114,B07622VSV3,cleaningSupplies,11
132,B07DBC5DYP,cleaningSupplies,91
362,B0B3HTNL38,cleaningSupplies,280


# Collaborative filtering
- 아이템 x 유저 행렬 만들어서 진행

##  Memory-based

In [18]:
ratings

Unnamed: 0,ID,Asin,Rating,user,AsinNumber
0,J. L.,B00004OCIZ,1,20936,308
1,Stephen Bolster,B00004OCIZ,3,20612,308
2,Amazon Customer,B00004OCIZ,2,953,308
3,Tabletopper,B00004OCIZ,1,9190,308
4,Eric,B00004OCIZ,1,23952,308
...,...,...,...,...,...
32757,nyla martens,B0B3HTNL38,5,1863,280
32758,Amazon Customer,B0B3HTNL38,5,953,280
32759,peter souza,B0B3HTNL38,5,20409,280
32760,Micheal,B0B3HTNL38,1,17278,280


In [19]:
items

Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
1,B00004OCLJ,cleaningSupplies,128
2,B00094EWE2,cleaningSupplies,187
3,B000CC91GK,cleaningSupplies,250
4,B000V6YLNA,cleaningSupplies,86
...,...,...,...
358,B09X9RY3DG,cleaningSupplies,29
359,B09Y8RYLC4,cleaningSupplies,37
360,B09YHNYFBP,cleaningSupplies,255
361,B0B18KJF62,cleaningSupplies,358


In [20]:
# preprocess data
row = ratings['user']
col = ratings['AsinNumber']
data = ratings['Rating']

NUM_USERS = len(ratings)
NUM_ITEMS = len(items)

# init user-item matrix
mat = csr_matrix((data, (row, col)), shape=(NUM_USERS, NUM_ITEMS), dtype=int)
mat.eliminate_zeros()

# calculate sparsity
sparsity = float(len(mat.nonzero()[0]))
sparsity /= (mat.shape[0] * mat.shape[1])
sparsity *= 100
print(f'Sparsity: {sparsity:4.2f}%. This means that {sparsity:4.2f}% of the user-item ratings have a value.')

# compute similarity
item_corr_mat = cosine_similarity(mat.T)

# get top k item
print(f"The {topK} similar product to Asin B00004OCIZ")


similar_items = topKItems(name2ind['B00004OCIZ'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)


display(items.loc[items['Asin'].isin(similar_items)])


Sparsity: 0.26%. This means that 0.26% of the user-item ratings have a value.
The 5 similar product to Asin B00004OCIZ


Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
64,B010OW4KMW,cleaningSupplies,201
81,B01ES6C8GA,cleaningSupplies,0
131,B07CNQBPCG,cleaningSupplies,24
327,B09G9B3VHS,cleaningSupplies,262


## Model based
### Matrix Factorization (MF)
#### TruncatedSVD (Sklearn)

<font color=red><U>**이거 돌릴때마다 값이 다름 이유 모름 그리고 가끔 에러남**</U></font>

In [21]:
items

Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
1,B00004OCLJ,cleaningSupplies,128
2,B00094EWE2,cleaningSupplies,187
3,B000CC91GK,cleaningSupplies,250
4,B000V6YLNA,cleaningSupplies,86
...,...,...,...
358,B09X9RY3DG,cleaningSupplies,29
359,B09Y8RYLC4,cleaningSupplies,37
360,B09YHNYFBP,cleaningSupplies,255
361,B0B18KJF62,cleaningSupplies,358


In [22]:
epsilon = 1e-9
n_latent_factors = 10

# calculate item latent matrix
item_svd = TruncatedSVD(n_components = n_latent_factors)
item_features = item_svd.fit_transform(mat.transpose()) + epsilon

# calculate user latent matrix
user_svd = TruncatedSVD(n_components = n_latent_factors)
user_features = user_svd.fit_transform(mat) + epsilon

# compute similarity
item_corr_mat = cosine_similarity(item_features)

# get top k item
print(f"The {topK} similar product to Asin B00004OCIZ")
similar_items = topKItems(name2ind['B00004OCIZ'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

display(items.loc[items['Asin'].isin(similar_items)])

del user_features
gc.collect();

The 5 similar product to Asin B00004OCIZ


Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
75,B01BZ0N69U,cleaningSupplies,233
108,B0746MWFWJ,cleaningSupplies,112
129,B07CLDNYWH,cleaningSupplies,125
166,B07PVBD58K,cleaningSupplies,147


#### Funk MF (Surprise)

In [23]:
ratings

Unnamed: 0,ID,Asin,Rating,user,AsinNumber
0,J. L.,B00004OCIZ,1,20936,308
1,Stephen Bolster,B00004OCIZ,3,20612,308
2,Amazon Customer,B00004OCIZ,2,953,308
3,Tabletopper,B00004OCIZ,1,9190,308
4,Eric,B00004OCIZ,1,23952,308
...,...,...,...,...,...
32757,nyla martens,B0B3HTNL38,5,1863,280
32758,Amazon Customer,B0B3HTNL38,5,953,280
32759,peter souza,B0B3HTNL38,5,20409,280
32760,Micheal,B0B3HTNL38,1,17278,280


In [24]:
funkRatings = ratings[['ID','Asin', 'Rating']]

In [25]:
def pred2dict(predictions, top_k=None):
    
    rec_dict = defaultdict(list)
    for user_id, item_id, actual_rating, pred_rating, _ in predictions:
        rec_dict[user_id].append((item_id, pred_rating))        
        
    return rec_dict

def get_top_k_recommendation(rec_dict, user_id, top_k, ind2name):
    
    pred_ratings = rec_dict[user_id]
    # sort descendingly by pred_rating
    pred_ratings = sorted(pred_ratings, key=lambda x: x[1], reverse=True)
    pred_ratings = pred_ratings[:top_k]
    recs = [ind2name[e[0]] for e in pred_ratings]
    
    return recs

# prepare train and test sets
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(funkRatings, reader)
train, test = train_test_split(data, test_size=.2, random_state=42)

# init and fit the funk mf model
algo = SVD(random_state = 42)
algo.fit(train)
pred = algo.test(test);

# evaluation the test set
accuracy.rmse(pred)

# extract the item features from algo
item_corr_mat = cosine_similarity(algo.qi)

print(f"The {topK} similar product to Asin B00004OCIZ")
similar_items = topKItems(name2ind['B00004OCIZ'],
                            topK = topK,
                            corrMat = item_corr_mat,
                            mapName = ind2name)

display(items.loc[items['Asin'].isin(similar_items)])

del item_corr_mat
gc.collect();

RMSE: 1.4368
The 5 similar product to Asin B00004OCIZ


Unnamed: 0,Asin,Category,AsinNumber
0,B00004OCIZ,cleaningSupplies,308
35,B00B7ALYMK,cleaningSupplies,47
39,B00DH4IQSE,cleaningSupplies,114
170,B07Q3NYBG1,cleaningSupplies,355
196,B07WGKQVN7,cleaningSupplies,72
