In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt # https://matplotlib.org/stable/api/pyplot_summary.html#module-matplotlib.pyplot

%matplotlib inline

# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats("retina") 
%config InlineBackend.figure_format='retina'

import warnings
warnings.filterwarnings('ignore')

print(f"{np.__version__}")
print(f"{pd.__version__}")
print(f"{mpl.__version__}")

1.23.5
1.5.3
3.7.1


In [2]:
import scipy as sp
import sympy
import sklearn

sympy.init_printing(use_latex='mathjax') # Juypter 노트북에서 수학식의 LaTeX 표현을 위해 필요함

print(f"{sklearn.__version__}")
print(f"{sympy.__version__}")
print(f"{sp.__version__}")

1.1.3
1.11.1
1.10.0


In [6]:
import os

base_src = "./data/"

# users
u_user_src = os.path.join(base_src, "u.user")
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(u_user_src, sep="|", names=u_cols, encoding="latin-1")
users = users.set_index("user_id")

# items
u_item_src = os.path.join(base_src, "u.item")
i_cols = [
    "movie_id", "title", "release_date", "video_release_date", "imdb_url", 
    "unknown", "Action", "Adventure", "Animation", "Children", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-fi", "Thriller", "War", "Western"
    ]
items = pd.read_csv(u_item_src, sep="|", names=i_cols, encoding="latin-1")
items.set_index("movie_id")

# ratings
u_data_src = os.path.join(base_src, "u.data")
u_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(u_data_src, sep="\t", names=u_cols, encoding="latin-1")

users.shape, items.shape, ratings.shape

((943, 4), (1682, 24), (100000, 4))

In [7]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

def score(model, neighbor_size=0):
    """
    이 섹션에서는 neighbor를 고려해야 하므로 여기서 모델에 주입해서 사용하기로 함.
    """
    id_pairs:tuple[int, int] = zip(x_test["user_id"], x_test["movie_id"])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test["rating"])
    return RMSE(y_true, y_pred)

In [13]:
from sklearn.model_selection import train_test_split

x = ratings.copy()
y = ratings["rating"]

x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size=0.26, 
                                                    stratify=y)

ratings_matrix = x_train.pivot(
    index="user_id",
    columns="movie_id",
    values="rating"
)

ratings_matrix.head(1)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1672,1674,1675,1676,1678,1679,1680
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy) # u-u sim
user_similarity = pd.DataFrame(data=user_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)

user_similarity.head(1)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.150902,0.04837,0.056544,0.282698,0.330634,0.307801,0.262602,0.076983,0.217574,...,0.235556,0.07782,0.220987,0.118935,0.119576,0.109512,0.245605,0.078027,0.130232,0.283038


In [22]:
def CF_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in ratings_matrix.columns:
        sim_score = user_similarity[user_id].copy() # 유사도 점수
        movie_ratings = ratings_matrix[movie_id].copy() # 영화 평점
        
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index # 평점이 없는 사람들의 인덱스
        movie_ratings = movie_ratings.dropna()
        sim_score = sim_score.drop(none_rating_idx)
        
        if neighbor_size == 0: # 0이면 일반 CF
            mean_rating = np.dot(sim_score, movie_ratings) / sim_score.sum()
        
        else:
            if len(sim_score) > 1: # 일단 비슷한 유저 자체가 자기 자신 외에 더 있어야 하니까.
                # 지정한 이웃 숫자보다 유사도 점수 대상 유저가 적을 수도 있으니까. 10 이웃 목표했는데 5명 밖에 없는 경우.
                neighbor_size = min(neighbor_size, len(sim_score))
                
                sim_score = np.array(sim_score)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_score)
                
                sim_score = sim_score[user_idx][-neighbor_size:] # 오름차순으로 정렬 했으니까... 뒤에서부터 뽑아야 함.
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                
                mean_rating = np.dot(sim_score, movie_ratings) / sim_score.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
        
    return mean_rating

In [23]:
# 모든 유저를 활용한 CF보다 미세하게 좋아졌다!
score(CF_knn, neighbor_size=30)

1.0091443487598184

In [29]:
def recom_movie(user_id, n_items, neighbor_size=0):
    user_movie = ratings_matrix.loc[user_id].copy() # u-i mat중 uid에 해당하는 행
    # <user_id, movie_id, rating>

    for movie in ratings_matrix.columns:
        if pd.notnull(user_movie[movie]): # 이미 평점을 매긴 영화는 제외
            user_movie[movie] = 0 
        else: # 평점이 없다면 CF_knn을 적용하여 값을 채워 넣기.
            user_movie.loc[movie] = CF_knn(user_id, movie, neighbor_size) # CF_knn을 통해 예측한 평점을 반영
    
    movie_sort = user_movie.sort_values(ascending=False)[:n_items] # 평점이 높은 순으로 정렬
    recom_movies = items.loc[movie_sort.index] # 영화 정보를 가져옴.
    return recom_movies

In [31]:
recom_movie(1, 10, neighbor_size=30)["title"] # 이런 것들 추천해줄거야

movie_id
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1467                                     Cure, The (1995)
1189                              That Old Feeling (1997)
1656                                        Target (1995)
173                        Raiders of the Lost Ark (1981)
1449                               Golden Earrings (1947)
1443                                That Darn Cat! (1965)
1594                                      Shopping (1994)
318                       Everyone Says I Love You (1996)
64                     What's Eating Gilbert Grape (1993)
Name: title, dtype: object

In [33]:
# 대강 neighbor_size를 10~100까지 바꿔가면서 RMSE를 측정해보면 어느 정도로 해야 할지 감이 옴.
# 해보니까 35~45 구간 사이가 가장 좋은 것 같다.
for size in [*range(10, 100, 10)]:
    print(score(CF_knn, neighbor_size=size))

1.02869478287237
1.0122489273614521
1.0091443487598184
1.0083936390808643
1.0087410376036225
1.0091861884650746
1.0100676052990607
1.0106995316339382
1.0114986313231975
