In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt # https://matplotlib.org/stable/api/pyplot_summary.html#module-matplotlib.pyplot

%matplotlib inline

# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats("retina") 
%config InlineBackend.figure_format='retina'

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from numpy/pandas/matplotlib.
warnings.filterwarnings('ignore')

# Print the installed numpy, pandas and matplotlib versions, in that order.
print(f"{np.__version__}")
print(f"{pd.__version__}")
print(f"{mpl.__version__}")

1.23.5
1.5.3
3.7.1


In [2]:
import scipy as sp
import sympy
import sklearn

sympy.init_printing(use_latex='mathjax') # Needed so math expressions are rendered as LaTeX in the Jupyter notebook

# Print the installed scikit-learn, sympy and scipy versions, in that order.
print(f"{sklearn.__version__}")
print(f"{sympy.__version__}")
print(f"{sp.__version__}")

1.1.3
1.11.1
1.10.0


In [3]:
import os

# Locations of the three raw files, all under the same data directory.
base_src = "./data/"
u_user_src = os.path.join(base_src, "u.user")
u_item_src = os.path.join(base_src, "u.item")
u_data_src = os.path.join(base_src, "u.data")

# users: pipe-separated, no header row; indexed by user_id after loading.
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(
    u_user_src,
    sep="|",
    names=u_cols,
    encoding="latin-1",
).set_index("user_id")

# items: pipe-separated, no header row; five descriptive columns followed by
# one column per genre.
i_cols = [
    "movie_id",
    "title",
    "release_date",
    "video_release_date",
    "imdb_url",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-fi",
    "Thriller",
    "War",
    "Western",
]
items = pd.read_csv(
    u_item_src,
    sep="|",
    names=i_cols,
    encoding="latin-1",
)

# ratings: tab-separated, no header row.
# NOTE: u_cols is rebound here, so from this point on it holds the ratings
# column names rather than the user column names.
u_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    u_data_src,
    sep="\t",
    names=u_cols,
    encoding="latin-1",
)

# Displayed as the cell output: (rows, columns) of each table.
users.shape, items.shape, ratings.shape

((943, 4), (1682, 24), (100000, 4))

In [4]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally shaped sequences.

    Both arguments are converted to NumPy arrays, so lists, tuples, arrays and
    pandas Series are all accepted; the comparison is positional.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return np.sqrt(np.square(residuals).mean())

def score(model, test_set=None):
    """Evaluate a rating predictor on held-out ratings and return its RMSE.

    RMSE is a loss, so a smaller value means a better model.

    Parameters
    ----------
    model : callable
        Called as ``model(user_id, movie_id)``; must return the predicted rating.
    test_set : optional
        Table with ``"user_id"``, ``"movie_id"`` and ``"rating"`` columns.
        When omitted, the notebook-level ``x_test`` is used. It is looked up
        at call time, so the train/test split cell must have been run before
        calling ``score(model)``.

    Returns
    -------
    The RMSE between the true ratings and the model's predictions.
    """
    data = x_test if test_set is None else test_set

    # zip yields one (user_id, movie_id) pair per test row, in row order.
    id_pairs = zip(data["user_id"], data["movie_id"])
    y_pred = np.array([model(user, movie) for user, movie in id_pairs])
    y_true = np.array(data["rating"])
    return RMSE(y_true, y_pred)

In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run produces a different split, so the scores
# computed further down could not be reproduced after Restart & Run All.
SPLIT_SEED = 42

x = ratings.copy()
y = ratings["rating"]

# Hold out 26% of the ratings for evaluation; stratifying on the rating value
# keeps the rating distribution the same in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.26,
    stratify=y,
    random_state=SPLIT_SEED,
)

# user x movie rating matrix built from the training ratings only;
# (user, movie) pairs without a training rating are NaN.
ratings_matrix = x_train.pivot(
    index="user_id",
    columns="movie_id",
    values="rating",
)

ratings_matrix.head(1)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1672,1673,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,4.0,3.0,,5.0,4.0,1.0,5.0,,...,,,,,,,,,,


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Missing ratings are replaced by 0 so the similarity can be computed on a
# dense matrix (fillna returns a new frame; ratings_matrix keeps its NaNs).
matrix_dummy = ratings_matrix.fillna(0)

# u-u sim matrix: rows and columns are both labelled by user_id.
user_ids = ratings_matrix.index
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy),
    index=user_ids,
    columns=user_ids,
)

# Similarity and id of the user most similar to user_id 1. The columns are
# sliced from label 2 onward, which leaves out user 1's own column.
sim_to_user_1 = user_similarity.loc[1, 2:]
sim_to_user_1.max(), sim_to_user_1.idxmax()

(0.43920055355556414, 864)

In [7]:

def CF_simple(user_id, movie_id, *, ui_matrix=None, uu_similarity=None,
              default_rating=3.0):
    """Predict a user's rating for a movie as a similarity-weighted mean.

    The user-user similarity matrix supplies the weights: the ratings other
    users gave to ``movie_id`` are averaged, each weighted by that user's
    similarity to ``user_id``.

    Parameters
    ----------
    user_id
        Label of the target user (a column of the similarity matrix).
    movie_id
        Label of the target movie (a column of the rating matrix).
    ui_matrix : pandas.DataFrame, optional
        user x movie rating matrix with NaN where no rating exists.
        Defaults to the notebook-level ``ratings_matrix``.
    uu_similarity : pandas.DataFrame, optional
        user x user similarity matrix labelled with the same user ids as
        ``ui_matrix``. Defaults to the notebook-level ``user_similarity``.
    default_rating : float, optional
        Prediction returned when no weighted mean can be computed.

    Returns
    -------
    The weighted mean rating, or ``default_rating`` when the movie or the
    user is unknown, or when the similarities of all raters sum to zero.
    """
    ui_matrix = ratings_matrix if ui_matrix is None else ui_matrix
    uu_similarity = user_similarity if uu_similarity is None else uu_similarity

    # The movie must exist in the rating matrix and the user in the similarity
    # matrix; otherwise there is nothing to average (and the column lookups
    # below would raise KeyError).
    if movie_id not in ui_matrix.columns or user_id not in uu_similarity.columns:
        return default_rating

    # Keep only the users who actually rated this movie ...
    movie_ratings = ui_matrix[movie_id].dropna()
    # ... and pick their similarities to the target user, aligned by user id.
    sim_scores = uu_similarity[user_id].loc[movie_ratings.index]

    # If every rater has zero similarity the weighted mean is 0/0 (NaN);
    # fall back to the default instead of propagating NaN into the metric.
    weight_total = sim_scores.sum()
    if weight_total == 0:
        return default_rating

    # Similarity-weighted mean of the ratings.
    return np.dot(sim_scores.to_numpy(), movie_ratings.to_numpy()) / weight_total

In [8]:
# Evaluate the simple user-based CF predictor on the held-out ratings (RMSE; lower is better).
score(CF_simple)

1.019149753982244