In [22]:
import os
import pandas as pd

import numpy as np
from sklearn.model_selection import train_test_split
# https://drive.google.com/drive/folders/19gkcIYjA3EjoNrMp9mn8KnZutKoPYLmg

In [23]:
base_src = './movie_data/'
u_user_src = os.path.join(base_src,'u.user')
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv(u_user_src,
                    sep = '|', names = u_cols, encoding= 'latin-1')
users = users.set_index('user_id')

In [24]:
u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id','title','release date','video release date','IMDB URL','unknown','Action','Adventure','Animation','Childrens', 'Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies=pd.read_csv(u_item_src, sep='|',names =i_cols,
                   encoding= 'latin-1')
movies = movies.set_index('movie_id')

In [25]:
u_data_src = os.path.join(base_src,'u.data')
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(u_data_src, sep='\t', names = r_cols, encoding = 'latin-1')

In [26]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))
# 유사집단의 크기를 미리 정하기 위해서 기존 score 함수에 neighbor_size 인자값 추가
def score(model,neighbor_size=0):
    # 테스트 데이터의 user_id와 movie_id 간 pair를 맞춰 튜플형원소 리스트데이터를 만듬
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    # 모든 사용자-영화 짝에 대해서 주어진 예측모델에 의해 예측값 계산 및 리스트형 데이터 생성
    y_pred = np.array([model(user,movie,neighbor_size) for (user,movie) in id_pairs])
    # 실제 평점값
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [27]:
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, stratify = y)
ratings_matrix = x_train.pivot(index = 'user_id', columns = 'movie_id',values = 'rating')

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                               index=ratings_matrix.index, columns=ratings_matrix.index)
def CF_simple(user_id, movie_id,neighbor_size=None):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating
def CF_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:
            if len(sim_scores) > 1:
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating  = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
    return mean_rating
def recommend_movie(user_id, n_items, neighbor_size=30):
    user_movie = ratings_matrix.loc[user_id].copy()
    for movie in ratings_matrix.columns:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        else:
            user_movie.loc[movie] = CF_knn(user_id,movie, neighbor_size)
    movie_sort = user_movie.sort_values(ascending =False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

In [34]:
score(CF_simple)

1.0284846111834214

In [35]:
score(CF_knn, neighbor_size=10)

1.0336276044711281

In [39]:
ratings_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating')

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                               index=ratings_matrix.index, columns=ratings_matrix.index)

In [42]:
recommend_movie(user_id = 729, n_items=5, neighbor_size=30)

movie_id
1189                      Prefontaine (1997)
1293                         Star Kid (1997)
1467    Saint of Fort Washington, The (1993)
1500               Santa with Muscles (1996)
22                         Braveheart (1995)
Name: title, dtype: object

In [43]:
# 사용자 평가 경향을 고려한 함수 #
# full matrix 에서 사용자의 평점 평균을 구한다.
rating_mean = ratings_matrix.mean(axis=1)
# 영화 평점과 각 사용자의평균과의 차이 (평점편차)를 구한다.
rating_bias = (ratings_matrix.T - rating_mean).T
# 사용자 평가 경향을 고려한 함수
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx) # CF_knn에서는 dropna() 사용했었음
        sim_scores = sim_scores.drop(none_rating_idx)
        if neighbor_size==0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction

score(CF_knn_bias, 30)

NameError: name 'rating_matrix' is not defined

In [44]:
rating_matrix_t = np.transpose(ratings_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity, index = rating_matrix_t.index, columns =rating_matrix_t.index )

In [46]:
def CF_IBCF(user_id, movie_id):
    if movie_id in item_similarity.columns:
        sim_scores = item_similarity[movie_id]
        user_rating = rating_matrix_t[user_id]
        none_rating_idx = user_rating[user_rating.isnull()].index
        user_rating = user_rating.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating
score(CF_IBCF)

TypeError: CF_IBCF() takes 2 positional arguments but 3 were given