# 컨텐츠 기반 필터링(Content-based filtering)

In [1]:
import numpy as np
import pandas as pd

In [2]:
from surprise import Dataset

# Load the built-in MovieLens 100k dataset; prompt=False skips the
# confirmation prompt before downloading.
data = Dataset.load_builtin('ml-100k', prompt=False)

# One row per raw rating record: user id, movie id, rating, timestamp.
df = pd.DataFrame(
    data.raw_ratings,
    columns=['user-id', 'movie-id', 'rating', 'timestamp'],
)
df.head()

Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\user/.surprise_data/ml-100k


Unnamed: 0,user-id,movie-id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [3]:
# Size of the ratings table as (number of rows, number of columns).
df.shape

(100000, 4)

### Adjacency Matrix(인접 행렬) 생성
- 행: user-id 
- 컬럼: movie-id
- 내용: rating 

In [5]:
# Convert the raw rating records into a single integer array.
raw_data = np.array(data.raw_ratings, dtype=int)
# Shift the first two columns (user-id, movie-id) down by one so that
# they start at 0.
raw_data[:, 0:2] = raw_data[:, 0:2] - 1
raw_data[:5]

array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       [      243,        50,         2, 880606923],
       [      165,       345,         1, 886397596]])

In [17]:
# Size both axes by the largest id rather than by the number of distinct ids.
# The ids (minus one) are used as row/column indices of the adjacency matrix,
# so a gap in the id sequence would make a distinct-count too small and lead
# to an out-of-bounds index. astype(int) makes max() compare numerically, and
# int() yields plain Python ints for use as array dimensions.
n_users = int(df['user-id'].astype(int).max())
n_movies = int(df['movie-id'].astype(int).max())
n_users, n_movies

(943, 1682)

### adj_matrix: 
- 0/1로만 만드는 경우
- 평점 점수를 주는 경우

In [18]:
# Binary user x movie matrix: 1 where the user has a rating record for the
# movie (treated as "watched"), 0 otherwise.
adj_matrix = np.zeros(shape=(n_users, n_movies), dtype=int)
# Columns 0 and 1 of raw_data hold the zero-based user and movie indices;
# mark every (user, movie) pair in a single indexed assignment.
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = 1
adj_matrix[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [19]:
# Similarity = dot product of two binary vectors, i.e. the number of
# movies both users have in common.
my_id = 0                   # use the user at index 0 as "me"
my_vector = adj_matrix[0]

my_vector @ adj_matrix[1], my_vector @ adj_matrix[2]

(18, 8)

In [20]:
# Who is most similar to me?
# Scan every other user and keep the one whose binary vector has the largest
# dot product with my_vector. Ties keep the earliest user because the
# comparison is strict. Both values stay at -1 if there is no other user.
best_score, best_match_id = -1, -1

for other_id, other_vector in enumerate(adj_matrix):
    # Skip myself explicitly instead of assuming that my_id is 0.
    if other_id == my_id:
        continue
    score = np.dot(my_vector, other_vector)
    if score > best_score:
        best_score = score
        best_match_id = other_id

best_score, best_match_id   # best_match_id: the user most similar to "me" (my_id)

(183, 275)