In [35]:
import pandas as pd

df = pd.read_csv("./data/u.data", sep='\t', header=None)
df.columns = ["user_id", "item_id", "rating", "timestamp"]
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [36]:
df.shape

(100000, 4)

In [37]:
# 데이터 탐색

df.groupby(["rating"])[["user_id"]].count()

Unnamed: 0_level_0,user_id
rating,Unnamed: 1_level_1
1,6110
2,11370
3,27145
4,34174
5,21201


In [38]:
df.groupby(["item_id"])[["user_id"]].count().head()

Unnamed: 0_level_0,user_id
item_id,Unnamed: 1_level_1
1,452
2,131
3,90
4,209
5,86


In [39]:
n_users = df.user_id.unique().shape[0] # unique한 user의 수
n_items = df.item_id.unique().shape[0] # unique한 item의 수

n_users, n_items

(943, 1682)

In [40]:
import numpy as np

ratings = np.zeros((n_users, n_items)) # 0으로 초기화된 n_users X n_items matrics
ratings.shape

(943, 1682)

In [41]:
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
    
type(ratings)

numpy.ndarray

In [42]:
ratings.shape

(943, 1682)

In [43]:
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [44]:
# train data와 test data 분리

from sklearn.model_selection import train_test_split

ratings_train, ratings_test = train_test_split(ratings, test_size=0.33, random_state=42)
ratings_train.shape, ratings_test.shape

((631, 1682), (312, 1682))

In [45]:
# 사용자 기반 협업 필터링

from sklearn.metrics.pairwise import cosine_distances

cosine_distances(ratings_train) # 사용자 간 코사인 유사도 행렬

array([[0.        , 0.63524236, 0.55753769, ..., 0.97989359, 0.66892071,
        0.74361482],
       [0.63524236, 0.        , 0.57364745, ..., 0.93305581, 0.72660686,
        0.77662732],
       [0.55753769, 0.57364745, 0.        , ..., 0.93324244, 0.74575627,
        0.77679874],
       ...,
       [0.97989359, 0.93305581, 0.93324244, ..., 0.        , 0.95146572,
        0.94857492],
       [0.66892071, 0.72660686, 0.74575627, ..., 0.95146572, 0.        ,
        0.8801978 ],
       [0.74361482, 0.77662732, 0.77679874, ..., 0.94857492, 0.8801978 ,
        0.        ]])

In [46]:
distances = 1 - cosine_distances(ratings_train)
distances

array([[1.        , 0.36475764, 0.44246231, ..., 0.02010641, 0.33107929,
        0.25638518],
       [0.36475764, 1.        , 0.42635255, ..., 0.06694419, 0.27339314,
        0.22337268],
       [0.44246231, 0.42635255, 1.        , ..., 0.06675756, 0.25424373,
        0.22320126],
       ...,
       [0.02010641, 0.06694419, 0.06675756, ..., 1.        , 0.04853428,
        0.05142508],
       [0.33107929, 0.27339314, 0.25424373, ..., 0.04853428, 1.        ,
        0.1198022 ],
       [0.25638518, 0.22337268, 0.22320126, ..., 0.05142508, 0.1198022 ,
        1.        ]])

In [47]:
distances.shape # 정방행렬

(631, 631)

In [48]:
# 평가 예측
user_pred = distances.dot(ratings_train) / np.array([np.abs(distances).sum(axis=1)]).T

In [49]:
# 모델 성능 측정
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [50]:
np.sqrt(get_mse(user_pred, ratings_train)) # train data

2.8075245308903365

In [51]:
np.sqrt(get_mse(user_pred, ratings_test)) # test data

2.9870546415652575

In [52]:
# 가장 비슷한 k명을 찾는 비지도 방식의 이웃 검색

from sklearn.neighbors import NearestNeighbors

k = 5
neigh = NearestNeighbors(n_neighbors=k, metric="cosine")

In [53]:
neigh.fit(ratings_train)

NearestNeighbors(metric='cosine')

In [54]:
top_k_distances, top_k_users = neigh.kneighbors(ratings_train, return_distance=True)

In [55]:
top_k_distances.shape, top_k_users.shape

((631, 5), (631, 5))

In [56]:
top_k_users

array([[  0, 589, 155,  33, 364],
       [  1, 483, 339, 172, 188],
       [  2, 382, 560, 350, 155],
       ...,
       [628, 258, 242, 229, 494],
       [629, 378, 155, 589, 591],
       [630, 495, 201, 417, 603]], dtype=int64)

In [57]:
top_k_distances

array([[0.        , 0.38230161, 0.39990633, 0.40834169, 0.4100445 ],
       [0.        , 0.4625691 , 0.50677921, 0.50811827, 0.50882566],
       [0.        , 0.46538829, 0.48267976, 0.49176259, 0.49265099],
       ...,
       [0.        , 0.5764934 , 0.59340849, 0.64699606, 0.66472075],
       [0.        , 0.60496802, 0.6115226 , 0.62054374, 0.6229481 ],
       [0.        , 0.56320216, 0.60221688, 0.60314589, 0.6400121 ]])

In [64]:
# 선택된 k명의 사용자들의 평가 가중치 합을 사용한 예측 및 모델의 성능 측정

user_pred_k = np.zeros(ratings_train.shape)

for i in range(ratings_train.shape[0]):
    user_pred_k[i, :] = top_k_distances[i].T.dot(ratings_train[top_k_users][i]) / np.array([np.abs(top_k_distances[i].T).sum(axis=0)]).T

In [65]:
user_pred_k.shape

(631, 1682)

In [66]:
user_pred_k

array([[4.25618269, 2.49082621, 0.71654943, ..., 0.        , 0.        ,
        0.        ],
       [3.74418756, 0.        , 2.48873124, ..., 0.        , 0.        ,
        0.        ],
       [3.22293592, 2.98635211, 2.47648118, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.07143091, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [3.73945823, 2.48622549, 1.76969702, ..., 0.        , 0.        ,
        0.        ],
       [1.95357502, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [67]:
# 모델 평가
np.sqrt(get_mse(user_pred_k, ratings_train))

2.0922014531938316

In [68]:
np.sqrt(get_mse(user_pred_k, ratings_test))

3.054698791142718