In [3]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt # https://matplotlib.org/stable/api/pyplot_summary.html#module-matplotlib.pyplot

%matplotlib inline

# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats("retina") 
%config InlineBackend.figure_format='retina'

import warnings

# Ignore every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings(action='ignore')

# Record the library versions this notebook ran with, one per line
# (numpy, pandas, matplotlib - in that order).
print(*(f"{lib.__version__}" for lib in (np, pd, mpl)), sep="\n")

1.23.5
1.5.3
3.7.1


In [4]:
import scipy as sp
import sympy
import sklearn

# Needed so that math expressions are rendered as LaTeX (through MathJax)
# inside the Jupyter notebook.
sympy.init_printing(use_latex='mathjax')

# Record the library versions this notebook ran with, one per line
# (scikit-learn, sympy, scipy - in that order).
print(*(f"{lib.__version__}" for lib in (sklearn, sympy, sp)), sep="\n")

1.1.3
1.11.1
1.10.0


In [5]:
import os

# Directory that contains the u.user / u.item / u.data files read below.
base_src = "./data/"

# users: pipe-separated file, indexed by user_id after loading
u_user_src = os.path.join(base_src, "u.user")
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(
    u_user_src, sep="|", names=u_cols, encoding="latin-1"
).set_index("user_id")

# items: pipe-separated file with 24 positional fields, indexed by movie_id
u_item_src = os.path.join(base_src, "u.item")
i_cols = [
    "movie_id",
    "title",
    "release_date",
    "video_release_date",
    "imdb_url",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-fi",
    "Thriller",
    "War",
    "Western",
]
items = pd.read_csv(
    u_item_src, sep="|", names=i_cols, encoding="latin-1"
).set_index("movie_id")

# ratings: tab-separated file, indexed by user_id. The index is not unique
# because a user_id appears once per rating row.
# Note that `u_cols` is rebound here and no longer holds the user columns.
u_data_src = os.path.join(base_src, "u.data")
u_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    u_data_src, sep="\t", names=u_cols, encoding="latin-1"
).set_index("user_id")

# Cell output: (rows, columns) of each table as a quick sanity check.
users.shape, items.shape, ratings.shape

((943, 4), (1682, 23), (100000, 3))

In [49]:
def recom_movie(n_items: int) -> pd.Series:
    """
    Recommend the top-k movies ranked by their plain average rating.

    Reads the module-level ``ratings`` and ``items`` DataFrames. No minimum
    number of ratings is required, so every rated movie competes on its
    unweighted mean alone.

    Parameters
    ----------
    n_items : int
        How many of the best-rated movies to return.

    Returns
    -------
    pd.Series
        Titles of the selected movies, indexed by movie_id and ordered from
        the highest mean rating downwards.
    """
    # Mean rating per movie: index is movie_id, values are the averages.
    avg_by_movie = ratings.groupby("movie_id")["rating"].mean()

    # Rank by average (best first) and keep the ids of the first n_items.
    top_ids = avg_by_movie.sort_values(ascending=False).iloc[:n_items].index

    # Look the selected ids up in `items` and return only their titles.
    return items.loc[top_ids, "title"]

In [51]:
# Display the 12 titles with the highest plain average rating.
recom_movie(12)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
1293                                      Star Kid (1997)
1500                            Santa with Muscles (1996)
1189                                   Prefontaine (1997)
1536                                 Aiqing wansui (1994)
1467                 Saint of Fort Washington, The (1993)
1449                               Pather Panchali (1955)
119                Maya Lin: A Strong Clear Vision (1994)
Name: title, dtype: object

In [52]:
def RMSE(y_true, y_pred):
    """
    Root mean squared error between observed and predicted values.

    Both arguments are converted with ``np.array`` and subtracted
    element-wise, so they must be the same length (or broadcastable).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    The square root of the mean of the squared differences.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

In [55]:
# Baseline evaluation: predict every rating with that movie's overall mean
# rating, then score the predictions separately for each user.
movie_mean = ratings.groupby("movie_id")["rating"].mean()  # Series indexed by movie_id

rmse = []  # one RMSE value per user

# `ratings` is indexed by a non-unique user_id, so a single groupby pass over
# the index hands us each user's rows. This replaces two `ratings.loc[user]`
# lookups per user and always yields a DataFrame, even for a single row.
for user, user_ratings in ratings.groupby(level=0):
    y_true = user_ratings["rating"]
    # Label-based lookup of the mean rating of every movie this user rated,
    # in the same row order as y_true.
    y_pred = movie_mean.loc[user_ratings["movie_id"]]

    rmse.append(RMSE(y_true, y_pred))

# Mean of the per-user RMSEs (about 1 rating point in the recorded run).
# NOTE(review): the original note described the rating scale as 0-10, but
# MovieLens 100K ratings are 1-5 stars - confirm against the dataset README.
np.mean(rmse)

0.996007224010567