In [2]:

import pandas as pd
import numpy as np
import warnings
# 경고 제거
warnings.filterwarnings("ignore")


In [2]:
from google.colab import files
files.upload();

Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv
Saving users.csv to users.csv


In [3]:
ratings = pd.read_csv('ratings.csv')

In [4]:
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [4]:
ratings = ratings.drop(columns = ['timestamp'])
ratings.columns = ['user_id', 'movie_id', 'rating']
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y, random_state=12)

# "유저별 영화 평점"을 알아보기 위해 pivot table 활용, 결측치 처리
# train 셋을 full matrix로 변환
rating_matrix = x_train.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')
rating_matrix = rating_matrix.fillna(0)

# 유저들의 영화 평점 데이터프레임의 코사인유사도를 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns = rating_matrix.index)


In [22]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

def score_2(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id']) # 테스트로 해야되나?
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)


In [30]:
id_pairs = set(zip(x_test['user_id'], x_test['movie_id']))
print(id_pairs)

In [19]:
# CF + KNN
def cf_knn(user_id, movie_id, neighbor_size=10):
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id]
        this_movie_ratings = rating_matrix[movie_id]
        none_rating_idx = rating_matrix[movie_id][rating_matrix[movie_id].isnull()].index
        this_movie_ratings = this_movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0:  
            mean_rating = np.dot(sim_scores, this_movie_ratings) / sim_scores.sum()

        else:                   
            # 지정된 neighbor_size와 해당 영화를 평가한 총 사용자수 중 작은 것으로 neighbor_size 결정
            neighbor_size = min( neighbor_size, len(sim_scores) )

            sim_scores = np.array(sim_scores)
            movie_ratings = np.array(this_movie_ratings)
            user_idx = np.argsort(sim_scores)
            sim_scores = sim_scores[user_idx][-neighbor_size:]
            movie_ratings = movie_ratings[user_idx][-neighbor_size:]
            mean_rating = np.dot( sim_scores, movie_ratings ) / sim_scores.sum()
    else:

        mean_rating = 3.0
        
    return mean_rating

In [20]:
score_2(cf_knn, 10)

2.7691991662892623

In [23]:
  for k in (10,20,30,40,50):
    print('k = ', k,'RMSE = ', score_2(cf_knn, k))

k =  10 RMSE =  2.7691991662892623
k =  20 RMSE =  2.6749324543025548
k =  30 RMSE =  2.647567828361757
k =  40 RMSE =  2.6441782965743035
k =  50 RMSE =  2.6459468484794133
