<a href="https://colab.research.google.com/github/DobiIsFree/gongboohater/blob/main/Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommender Systems

- content-based filtering
  - 지금까지 사용자의 이전 행동과 명시적 피드백을 통해 사용자가 좋아하는 것과 유사한 항목 추천

- collaborative filtering
  - 사용자와 항목간의 유사성을 동시에 사용해 추천


## Surprise
- 추천 시스템 개발을 위한 라이브러리
- 다양한 모델과 데이터 제공
- scikit-learn과 유사한 사용 방법

## Import packages

In [2]:
! pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095447 sha256=bb7ff13d03d6ad3230805ccb6c67c2bbc97d92f31c56cd66955c3d4c738ff051
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.

In [3]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [4]:
data = Dataset.load_builtin('ml-100k', prompt=False)
data.raw_ratings[:10] # user / item / rate / userID

Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [5]:
model = SVD()

In [7]:
cross_validate(model, data, measures=['rmse', 'mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9327  0.9258  0.9450  0.9306  0.9439  0.9356  0.0076  
MAE (testset)     0.7372  0.7324  0.7438  0.7333  0.7417  0.7377  0.0045  
Fit time          1.25    0.85    0.96    1.10    0.83    1.00    0.16    
Test time         0.09    0.09    0.14    0.09    0.16    0.12    0.03    


{'test_rmse': array([0.93265167, 0.92580268, 0.94500554, 0.93060937, 0.94386648]),
 'test_mae': array([0.73718432, 0.73238873, 0.74381199, 0.73333843, 0.74170072]),
 'fit_time': (1.2537081241607666,
  0.8549182415008545,
  0.9569909572601318,
  1.102433681488037,
  0.8251199722290039),
 'test_time': (0.09450745582580566,
  0.09249091148376465,
  0.14126014709472656,
  0.08973979949951172,
  0.159470796585083)}

## Content-based Filtering

* 컨텐츠 기반 필터링은 이전의 행동과 명시적 피드백을 통해 좋아하는 것과 유사한 항목을 추천
  * ex) 내가 지금 까지 시청한 영화 목록과 다른 사용자의 시청 목록을 비교해 나와 비슷한 취향의 사용자가 시청한 영화를 추천
* 유사도를 기반으로 추천

* 장점
  * 많은 수의 사용자를 대상으로 쉽게 확장 가능
  * 사용자가 관심을 갖지 않던 상품 추천 가능

* 단점
  * 입력 특성을 직접 설계해야 하기 때문에 많은 도메인 지식이 필요
  * 사용자의 기존 관심사항을 기반으로만 추천 가능

* 이진 벡터의 내적을 통해 다른 사용자들과의 유사도 구하기
* 나와 가장 높은 유사도를 가진 사용자의 시청 목록을 추천

In [8]:
import numpy as np
from surprise import Dataset

In [9]:
data = Dataset.load_builtin('ml-100k', prompt=False)
raw_data = np.array(data.raw_ratings, dtype=int)

In [10]:
raw_data[:, 0] -= 1 # 0부터 시작하도록 조정
raw_data[:, 1] -= 1

In [12]:
# 인접행렬 크기 구하기: user 수
n_users = np.max(raw_data[:, 0])
n_movies = np.max(raw_data[:, 1])
shape = (n_users + 1, n_movies + 1)
shape

(943, 1682)

In [13]:
# 인접행렬 만들기
adj_matrix = np.ndarray(shape, dtype=int)
for user_id, movie_id, rating, time in raw_data:
  adj_matrix[user_id][movie_id] = 1.

adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [16]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    similarity = np.dot(my_vector, user_vector)
    if similarity > best_match:
      best_match = similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 183 Best Match ID: 275


In [17]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1. and log2 > 0.:
    recommend_list.append(i)

print(recommend_list)

[272, 273, 275, 280, 281, 283, 287, 288, 289, 290, 292, 293, 297, 299, 300, 301, 302, 306, 312, 314, 315, 316, 317, 321, 322, 323, 324, 327, 330, 331, 332, 333, 339, 342, 345, 346, 353, 354, 355, 356, 357, 363, 364, 365, 366, 372, 374, 378, 379, 381, 382, 383, 384, 385, 386, 387, 390, 391, 392, 394, 395, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 414, 416, 417, 418, 419, 420, 422, 424, 425, 426, 427, 428, 430, 431, 432, 435, 442, 446, 447, 448, 449, 450, 451, 452, 454, 455, 457, 460, 461, 462, 468, 469, 470, 471, 472, 473, 474, 478, 495, 500, 507, 517, 522, 525, 530, 539, 540, 543, 545, 546, 548, 549, 550, 551, 553, 557, 558, 560, 561, 562, 563, 565, 566, 567, 568, 570, 571, 574, 575, 576, 577, 580, 581, 582, 585, 587, 589, 590, 594, 596, 602, 623, 626, 627, 630, 633, 635, 639, 646, 648, 651, 652, 654, 657, 664, 668, 671, 677, 678, 681, 683, 684, 685, 690, 691, 692, 695, 696, 708, 709, 714, 718, 719, 720, 724, 726, 727, 731, 733, 734, 736, 738, 741, 742, 745,

* 유클리드 거리를 사용해 추천
$$euclidean = \sqrt{\sum_{d=1}^{D}(A_i - B_i)^2}$$
* 거리가 가까울 수록(값이 작을 수록) 나와 유사한 사용자

In [19]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
    if euclidean_dist < best_match:
      best_match = euclidean_dist
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 14.832396974191326 Best Match ID: 737


In [20]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1. and log2 > 0.:
    recommend_list.append(i)

print(recommend_list)

[297, 312, 317, 342, 356, 366, 379, 384, 392, 402, 404, 407, 417, 422, 428, 433, 448, 454, 469, 473, 495, 510, 516, 526, 527, 549, 567, 602, 635, 649, 650, 654, 658, 661, 664, 696, 731, 746, 750, 754, 915, 918, 925, 929, 950, 968, 1015, 1046]


* 코사인 유사도를 사용해 추천

\begin{equation}
cos \theta = \frac{A \cdot B}{||A|| \times ||B||}
\end{equation}
* 두 벡터가 이루고 있는 각을 계산

In [21]:
def compute_cos_similarity(v1, v2):
  norm1 = np.sqrt(np.sum(np.square(v1)))
  norm2 = np.sqrt(np.sum(np.square(v2)))
  dot = np.dot(v1, v2)

  return dot / (norm1 * norm2)

In [22]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 0.5278586163659506 Best Match ID: 915


In [23]:
recommend_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1. and log2 > 0.:
    recommend_list.append(i)

print(recommend_list)

[272, 275, 279, 280, 283, 285, 289, 294, 297, 316, 317, 355, 365, 366, 368, 379, 380, 381, 384, 386, 392, 398, 401, 404, 416, 420, 422, 424, 426, 427, 430, 432, 450, 460, 461, 466, 469, 471, 473, 474, 475, 479, 482, 483, 497, 505, 508, 510, 511, 522, 526, 527, 529, 530, 534, 536, 540, 545, 548, 549, 556, 557, 558, 560, 565, 567, 568, 569, 577, 580, 581, 582, 592, 596, 630, 635, 639, 641, 649, 651, 654, 673, 677, 678, 683, 684, 692, 696, 701, 703, 707, 708, 709, 712, 714, 719, 720, 726, 731, 734, 736, 738, 740, 745, 747, 754, 755, 761, 762, 763, 766, 780, 789, 791, 805, 819, 823, 824, 830, 843, 862, 865, 918, 929, 930, 938, 942, 943, 947, 958, 959, 960, 970, 977, 1004, 1008, 1009, 1010, 1013, 1041, 1045, 1069, 1072, 1073, 1078, 1097, 1100, 1108, 1112, 1118, 1134, 1193, 1205, 1207, 1216, 1219, 1267, 1334, 1400, 1427, 1596, 1681]


- 기존 방법에 명시적 피드백(사용자가 평가한 영화점수) 추가해 실험

In [25]:
adj_matrix = np.ndarray(shape, dtype=int)

for user_id, movie_id, rating, time in raw_data:
  adj_matrix[user_id][movie_id] = rating

adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [26]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    euclidean_dist = np.sqrt(np.sum(np.square(my_vector - user_vector)))
    if euclidean_dist < best_match:
      best_match = euclidean_dist
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 55.06359959174482 Best Match ID: 737


In [28]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 0.569065731527988 Best Match ID: 915


## Collaborative Filtering

* 사용자와 항목의 유사성을 동시에 고려해 추천
* 기존에 내 관심사가 아닌 항목이라도 추천 가능
* 자동으로 임베딩 학습 가능


* 장점
  * 자동으로 임베딩을 학습하기 때문에 도메인 지식이 필요 없다.
  * 기존의 관심사가 아니더라도 추천 가능
* 단점
  * 학습 과정에 나오지 않은 항목은 임베딩을 만들 수 없음
  * 추가 특성을 사용하기 어려움

In [29]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate

In [30]:
data = Dataset.load_builtin('ml-100k', prompt=False)

### KNN

In [32]:
model = KNNBasic()
cross_validate(model, data, measures=['rmse', 'mse'], cv=5, n_jobs=4, verbose=True)

Evaluating RMSE, MSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9853  0.9780  0.9859  0.9743  0.9677  0.9782  0.0068  
MSE (testset)     0.9707  0.9565  0.9720  0.9493  0.9365  0.9570  0.0134  
Fit time          1.02    1.41    1.26    1.13    0.49    1.06    0.31    
Test time         6.13    5.96    5.71    5.71    2.09    5.12    1.52    


{'test_rmse': array([0.98525791, 0.97800653, 0.98590566, 0.97430555, 0.96774988]),
 'test_mse': array([0.97073315, 0.95649676, 0.97200997, 0.94927131, 0.93653983]),
 'fit_time': (1.015627384185791,
  1.4098691940307617,
  1.2640724182128906,
  1.133850336074829,
  0.49390554428100586),
 'test_time': (6.128860950469971,
  5.959172010421753,
  5.708286762237549,
  5.7149412631988525,
  2.0886449813842773)}

### SVD

In [33]:
model = SVD()
cross_validate(model, data, measures=['rmse', 'mse'], cv=5, n_jobs=4, verbose=True)

Evaluating RMSE, MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9320  0.9355  0.9443  0.9353  0.9421  0.9378  0.0046  
MSE (testset)     0.8686  0.8753  0.8917  0.8747  0.8876  0.8796  0.0087  
Fit time          1.65    1.65    1.73    1.65    0.79    1.49    0.35    
Test time         0.34    0.36    0.36    0.22    0.11    0.28    0.10    


{'test_rmse': array([0.93197887, 0.93554919, 0.94428066, 0.93525555, 0.9421485 ]),
 'test_mse': array([0.86858462, 0.87525229, 0.89166596, 0.87470294, 0.88764379]),
 'fit_time': (1.6458592414855957,
  1.646146297454834,
  1.732095718383789,
  1.6477546691894531,
  0.7913486957550049),
 'test_time': (0.34188127517700195,
  0.36320996284484863,
  0.36484479904174805,
  0.22270607948303223,
  0.10601449012756348)}

### NMF

In [34]:
model = NMF()
cross_validate(model, data, measures=['rmse', 'mse'], cv=5, n_jobs=4, verbose=True)

Evaluating RMSE, MSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9602  0.9632  0.9594  0.9673  0.9572  0.9615  0.0035  
MSE (testset)     0.9220  0.9278  0.9204  0.9357  0.9163  0.9244  0.0067  
Fit time          3.36    3.27    3.26    2.97    0.93    2.76    0.92    
Test time         0.32    0.36    0.32    0.15    0.09    0.25    0.11    


{'test_rmse': array([0.96018752, 0.96321988, 0.9593961 , 0.967328  , 0.95723458]),
 'test_mse': array([0.92196007, 0.92779254, 0.92044088, 0.93572346, 0.91629804]),
 'fit_time': (3.3559927940368652,
  3.265364646911621,
  3.2565414905548096,
  2.968844175338745,
  0.9329798221588135),
 'test_time': (0.3189373016357422,
  0.35941362380981445,
  0.31935977935791016,
  0.15336823463439941,
  0.09163022041320801)}

### SVD++

In [35]:
model = SVDpp()
cross_validate(model, data, measures=['rmse', 'mse'], cv=5, n_jobs=4, verbose=True)

Evaluating RMSE, MSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9242  0.9163  0.9196  0.9253  0.9194  0.9210  0.0033  
MSE (testset)     0.8541  0.8395  0.8457  0.8563  0.8453  0.8482  0.0062  
Fit time          69.34   68.25   68.38   69.36   24.04   59.87   17.92   
Test time         9.43    9.72    9.57    9.03    2.73    8.10    2.69    


{'test_rmse': array([0.92416633, 0.91625107, 0.91964061, 0.92533873, 0.91939   ]),
 'test_mse': array([0.8540834 , 0.83951601, 0.84573886, 0.85625177, 0.84527797]),
 'fit_time': (69.33566927909851,
  68.24601697921753,
  68.3774802684784,
  69.36012315750122,
  24.038317441940308),
 'test_time': (9.431932926177979,
  9.718948602676392,
  9.565923929214478,
  9.031244993209839,
  2.7349205017089844)}

## Hybrid

* 컨텐츠 기반 필터링과 협업 필터링을 조합한 방식

In [37]:
import numpy as np
from sklearn.decomposition import randomized_svd, non_negative_factorization
from surprise import Dataset

In [50]:
data = Dataset.load_builtin('ml-100k', prompt=False)
raw_data = np.array(data.raw_ratings, dtype=int)

raw_data[:, 0] -= 1
raw_data[:, 1] -= 1

In [51]:
n_users = np.max(raw_data[:, 0])
n_movies = np.max(raw_data[:, 1])
shape = (n_users + 1, n_movies + 1)
shape

(943, 1682)

In [52]:
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, rating, time in raw_data:
  adj_matrix[user_id][movie_id] = rating

In [53]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [54]:
# U: user, V: item, S: 특이값 벡터
U, S, V = randomized_svd(adj_matrix, n_components=2)
S = np.diag(S)

In [55]:
print(U.shape)
print(S.shape) # 잠재 요인을 찾을 수 있는 특이값 벡터
print(V.shape)

(943, 2)
(2, 2)
(2, 1682)


In [56]:
np.matmul(np.matmul(U, S), V)

array([[ 3.91732670e+00,  1.47276646e+00,  7.98262064e-01, ...,
         6.24907690e-04,  1.41100864e-02,  1.36545893e-02],
       [ 1.85777237e+00,  3.96191047e-01,  5.05705380e-01, ...,
         5.38862953e-03,  1.77236844e-03,  5.26954585e-04],
       [ 8.94989245e-01,  1.71578400e-01,  2.51738376e-01, ...,
         2.92094740e-03,  5.39931691e-04, -1.25740408e-04],
       ...,
       [ 9.92051753e-01,  2.10814867e-01,  2.70363087e-01, ...,
         2.89019151e-03,  9.34216535e-04,  2.66605547e-04],
       [ 1.30425376e+00,  5.27669945e-01,  2.50080154e-01, ...,
        -4.20678881e-04,  5.30525896e-03,  5.28070235e-03],
       [ 2.82999402e+00,  9.70812219e-01,  6.15871617e-01, ...,
         2.02091498e-03,  8.67740559e-03,  8.03107570e-03]])

- 사용자 기반 추천
- 나와 비슷한 취향을 가진 다른 사용자의 행동 추천
- 사용자 특징 벡터의 유사도 사용

In [57]:
my_id, my_vector = 0, U[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(U):
  if my_id != user_id:
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 0.9999942289905208 Best Match ID: 235


In [59]:
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
  log1, log2 = log
  if log1 < 1. and log2 > 0.:
    recommend_list.append(i)

print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


* 항목 기반 추천
* 내가 본 항목과 비슷한 항목을 추천
* 항목 특징 벡터의 유사도 사용

In [71]:
my_id, my_vector = 0, V.T[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(V.T):
  if my_id != user_id:
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 0.9999999949737673 Best Match ID: 1287


In [72]:
recommend_list = []
for i, user_vectgor in enumerate(adj_matrix):
  if adj_matrix[i][my_id] > 0.9:
    recommend_list.append(i)

print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 

- 비음수 행렬 분해를 사용한 하이브리드 추천

In [73]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [74]:
A, B, iter = non_negative_factorization(adj_matrix, n_components=2)

In [75]:
np.matmul(A, B)

array([[3.71107433e+00, 1.48461856e+00, 7.39541570e-01, ...,
        3.64501983e-03, 1.45513751e-02, 1.44116215e-02],
       [2.11729713e+00, 2.37145679e-01, 5.51637757e-01, ...,
        4.76290749e-03, 2.84605930e-05, 0.00000000e+00],
       [9.85325089e-01, 1.10360320e-01, 2.56715279e-01, ...,
        2.21651094e-03, 1.32446863e-05, 0.00000000e+00],
       ...,
       [1.04478344e+00, 1.17019891e-01, 2.72206478e-01, ...,
        2.35026384e-03, 1.40439223e-05, 0.00000000e+00],
       [1.45769331e+00, 5.42108391e-01, 2.99217251e-01, ...,
        1.61232500e-03, 5.15892655e-03, 5.10748255e-03],
       [2.44709957e+00, 9.41278705e-01, 4.95671746e-01, ...,
        2.56934867e-03, 9.08400301e-03, 8.99501717e-03]])

- 사용자 기반 추천

In [76]:
my_id, my_vector = 0, U[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(U):
  if my_id != user_id:
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 0.9999942289905208 Best Match ID: 235


In [77]:
recommend_list = []
for i, log in enumerate(zip(adj_matrix[my_id], adj_matrix[best_match_id])):
  log1, log2 = log
  if log1 < 1. and log2 > 0.:
    recommend_list.append(i)

print(recommend_list)

[272, 273, 274, 281, 285, 288, 293, 297, 303, 306, 312, 317, 327, 332, 369, 410, 418, 419, 422, 426, 428, 431, 434, 442, 461, 475, 477, 482, 495, 503, 504, 505, 506, 509, 519, 520, 522, 525, 531, 545, 548, 590, 594, 595, 613, 631, 654, 658, 660, 672, 684, 685, 691, 695, 698, 704, 716, 728, 734, 749, 755, 863, 865, 933, 1012, 1038, 1101, 1327, 1400]


- 항목 기반 추천

In [78]:
my_id, my_vector = 0, V.T[0]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(V.T):
  if my_id != user_id:
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_match = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector

print('Best Match {} Best Match ID: {}'.format(best_match, best_match_id) )

Best Match 0.9999999949737673 Best Match ID: 1287


In [79]:
recommend_list = []
for i, user_vectgor in enumerate(adj_matrix):
  if adj_matrix[i][my_id] > 0.9:
    recommend_list.append(i)

print(recommend_list)

[0, 1, 4, 5, 9, 12, 14, 15, 16, 17, 19, 20, 22, 24, 25, 37, 40, 41, 42, 43, 44, 48, 53, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 69, 71, 72, 74, 76, 78, 80, 81, 82, 83, 88, 91, 92, 93, 94, 95, 96, 98, 100, 101, 105, 107, 108, 116, 119, 120, 123, 124, 127, 129, 130, 133, 136, 137, 140, 143, 144, 147, 149, 150, 156, 157, 159, 161, 167, 173, 176, 177, 180, 181, 183, 188, 192, 193, 197, 198, 199, 200, 201, 202, 203, 208, 209, 212, 215, 221, 222, 229, 230, 231, 233, 234, 241, 242, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 261, 262, 264, 267, 270, 273, 274, 275, 276, 278, 279, 285, 286, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 302, 304, 306, 307, 310, 311, 312, 313, 319, 321, 323, 324, 325, 326, 329, 330, 331, 335, 337, 338, 339, 342, 343, 344, 346, 347, 349, 356, 358, 359, 362, 364, 370, 373, 377, 378, 379, 380, 386, 387, 388, 389, 392, 393, 394, 395, 397, 398, 400, 401, 402, 405, 406, 410, 411, 415, 416, 418, 421, 423, 424, 428, 431, 433, 434, 437, 440, 444, 