In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
# Packages for computing statistics and basic data analysis
import scipy as sp
import scipy.stats

# Visualization packages
import matplotlib as mpl
import matplotlib.pyplot as plt

# Font configuration so that Korean text can be used in plots
import platform
from matplotlib import font_manager, rc

# macOS
if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
    
# Windows
elif platform.system() == 'Windows':
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
# NOTE(review): there is no branch for other platforms (e.g. Linux); the font setting is left untouched there.

# Setting so that negative numbers (the minus sign) display correctly in plots
import matplotlib

matplotlib.rcParams['axes.unicode_minus'] = False 

# Limit Jupyter Notebook output to 3 decimal places
%precision 3

# Display pandas floating-point values with 3 digits after the decimal point
pd.options.display.precision = 3

# Suppress all warnings - use only for competitions
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Requires Python >= 3.5
import sys
assert sys.version_info >= (3, 5)

# Common module imports
import numpy as np
import pandas as pd
import os

# For clean inline plot output
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

mpl.rcParams['axes.unicode_minus'] = False

# Limit Jupyter Notebook output to 3 decimal places
%precision 3

import seaborn as sns

import scipy as sp
from scipy import stats

# Requires scikit-learn >= 0.20
# Compare the numeric (major, minor) parts: comparing the raw version strings is
# lexicographic, so e.g. "0.3" would wrongly pass and "0.100" would wrongly fail.
import re
import sklearn
_sk_ver = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_ver is not None and tuple(map(int, _sk_ver.groups())) >= (0, 20), \
    "scikit-learn >= 0.20 is required"

# 노트북 실행 결과를 동일하게 유지하기 위해
# np.random.seed(42)

# 아이템 기반 추천 시스템

In [6]:
# Load the MovieLens CSV files (paths are relative to the working directory)
movies = pd.read_csv('data_ML/movielens/movies.csv')
ratings = pd.read_csv('data_ML/movielens/ratings.csv')
tags = pd.read_csv('data_ML/movielens/tags.csv')

In [11]:
# DataFrame.info() prints its summary itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB
None


In [12]:
# Remove timestamp from ratings by keeping only the three columns used below
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]
ratings.head(5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [13]:
# Data layout for a recommender system:
# either item ids are spread out per user id, or user ids are spread out per item id.
# Here userId is the index and each movieId is a column, so ratings can be read per movie.
ratings_matrix = ratings.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
)
ratings_matrix.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [14]:
# The columns show movieId rather than the movie title - switch to titles.
# Join (merge) ratings with movies on the movieId column.
ratings_movies = ratings.merge(movies, on='movieId')
ratings_movies.head(5)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [16]:
# Rebuild the user x movie rating matrix with movie titles as the columns
ratings_matrix = ratings_movies.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)
ratings_matrix.head(5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [18]:
# Fill missing values with 0 (later cells treat a rating > 0 as "already seen")
ratings_matrix = ratings_matrix.fillna(value=0)
ratings_matrix.head(5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Whether similarity is computed between users or between movies decides
# which way the matrix has to be laid out:
# the entities being compared must be on the index (rows).
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# With the second argument omitted, the rows of the input are compared with each other
item_sim = cosine_similarity(ratings_matrix_T)
print(item_sim)

[[1.    0.    0.    ... 0.327 0.    0.   ]
 [0.    1.    0.707 ... 0.    0.    0.   ]
 [0.    0.707 1.    ... 0.    0.    0.   ]
 ...
 [0.327 0.    0.    ... 1.    0.    0.   ]
 [0.    0.    0.    ... 0.    1.    0.   ]
 [0.    0.    0.    ... 0.    0.    1.   ]]


In [22]:
# Attach the movie titles to both axes of the similarity matrix
item_sim_df = pd.DataFrame(
    item_sim,
    index=ratings_matrix.columns,
    columns=ratings_matrix.columns,
)
item_sim_df.head(5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142,0.0,...,0.0,0.342,0.543,0.707,0.0,0.0,0.139,0.327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707,1.0,0.0,0.0,0.0,0.177,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,1.0,0.857,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.857,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Extract the 5 movies most similar to 'Godfather, The (1972)'.
# The movie itself is dropped by label rather than by skipping position 0:
# another title tied at similarity 1.0 could otherwise sort ahead of it.
(item_sim_df['Godfather, The (1972)']
 .drop('Godfather, The (1972)')
 .sort_values(ascending = False)
 .head(5))

title
Godfather: Part II, The (1974)               0.822
Goodfellas (1990)                            0.665
One Flew Over the Cuckoo's Nest (1975)       0.621
Star Wars: Episode IV - A New Hope (1977)    0.595
Fargo (1996)                                 0.589
Name: Godfather, The (1972), dtype: float64

In [None]:
# 아이템 기반 최근접 이웃 협업 필터링으로 개인화 된 영화 추천
# 아이템 기반의 영화 유사도 데이터는 모든 사용자의 평점을 기준으로 영화의
# 유사도를 생성했고 이를 이용해서 영화를 추천할 수는 있지만 
# 이는 개인의 취향을 전혀 반영하지 않음
# 개인화된 영화 추천은 유저가 아직 관람하지 않은 영화를 추천해야합니다.
# 아직 관람하지 않은 영화에 대해서 아이템 유사도와 기존에 관람한 영화의 평점 데이터를 기반으로 해서
# 모든 영화의 평점을 예측하고 그 중에서 높은 평점을 가진 영화를 추천 

# 계산식 : 사용자가 본 영화들의 실제 평점을, 예측 대상 영화와의 코사인 유사도로
# 가중합(내적)하고 그 값을 유사도 절댓값의 합으로 나눔

In [27]:
# Predict ratings for every user from the item-item similarities
def predict_rating(ratings_arr, item_sim_arr) :
    """Predict each user's rating of each item as a similarity-weighted average.

    For user u and item j the prediction is
    sum_i(ratings_arr[u, i] * item_sim_arr[i, j]) / sum_i(|item_sim_arr[i, j]|).

    Parameters
    ----------
    ratings_arr : array of shape (n_users, n_items)
        User-item ratings.
    item_sim_arr : array of shape (n_items, n_items)
        Item-item similarities.

    Returns
    -------
    Array of predicted ratings with one column per item. An item whose
    similarity column is all zeros produces a division by zero (NaN/inf).
    """
    weighted_sum = ratings_arr.dot(item_sim_arr)
    # Normalise by the weights that were actually used for item j, i.e. column j
    # of the similarity matrix (axis = 0). Row sums only coincide with this when
    # the matrix is symmetric. Kept 2-D, shape (1, n_items), as before.
    sim_totals = np.array([np.abs(item_sim_arr).sum(axis = 0)])
    ratings_pred = weighted_sum / sim_totals
    return ratings_pred

In [28]:
# Predict from all items, then label the result with user ids and movie titles
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix.head(5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.07,0.578,0.322,0.227,0.207,0.195,0.25,0.103,0.157,0.178,...,0.114,0.182,0.134,0.129,0.006,0.212,0.193,0.136,0.293,0.72
2,0.018,0.043,0.019,0.0,0.0,0.036,0.013,0.002,0.032,0.015,...,0.016,0.021,0.02,0.016,0.05,0.015,0.022,0.025,0.018,0.0
3,0.012,0.03,0.064,0.004,0.004,0.003,0.015,0.002,0.006,0.006,...,0.007,0.012,0.012,0.012,0.0,0.008,0.007,0.009,0.01,0.085
4,0.049,0.278,0.16,0.207,0.31,0.042,0.13,0.116,0.1,0.097,...,0.051,0.076,0.056,0.054,0.008,0.159,0.101,0.062,0.146,0.231
5,0.007,0.067,0.042,0.014,0.025,0.018,0.026,0.019,0.022,0.019,...,0.01,0.022,0.013,0.012,0.0,0.026,0.024,0.018,0.028,0.052


In [29]:
# 실제 데이터와의 차이
# 평가
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual) :
    """Mean squared error between predictions and actual ratings, on rated entries only.

    Parameters
    ----------
    pred : ndarray
        Predicted ratings, same shape as ``actual``.
    actual : ndarray
        Actual ratings; entries equal to 0 are excluded from the error.

    Returns
    -------
    The value returned by ``mean_squared_error`` over the non-zero entries of ``actual``.
    """
    # Locate the non-zero entries once and reuse the index for both arrays;
    # indexing with it already yields 1-D arrays.
    rated = actual.nonzero()
    # mean_squared_error expects (y_true, y_pred)
    return mean_squared_error(actual[rated], pred[rated])
# MSE of the current predictions, evaluated only where an actual rating is non-zero
print('현재 MSE :', get_mse(ratings_pred, ratings_matrix.values))

현재 MSE : 9.895354759094706


In [None]:
# 추천 수정
# 현재는 모든 영화와의 유사도를 이용해서 평점을 예측했는데 모든 영화보다는 
# 유저가 본 영화 중 유사도가 가장 높은 영화 몇개를 이용해서 예측하는 것이 더 나을 가능성이 높음


In [33]:
# Revised rating prediction:
# the number of most-similar movies to use is an extra parameter
def predict_rating_topsim(ratings_arr, item_sim_arr, n = 20) :
    """Predict ratings using only the ``n`` items most similar to each target item.

    For every item ``col`` the ``n`` largest entries of ``item_sim_arr[:, col]``
    are selected, and each user's prediction is the average of their ratings on
    those items, weighted by the same similarities.

    Parameters
    ----------
    ratings_arr : ndarray of shape (n_users, n_items)
        User-item ratings.
    item_sim_arr : ndarray of shape (n_items, n_items)
        Item-item similarities.
    n : int, default 20
        Number of most-similar items to use; if it exceeds the number of
        items, all items are used.

    Returns
    -------
    ndarray with the same shape as ``ratings_arr``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (no neighbours to average over).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Start from an all-zero array shaped like the user-item rating matrix
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]) :
        col_sim = item_sim_arr[:, col]
        # argsort sorts ascending, so reverse it and keep the first n positions
        # to get the most similar items. A plain 1-D index array is used
        # (wrapping it in a list would turn it into a multi-dimensional index).
        top_n_items = np.argsort(col_sim)[::-1][:n]
        # Rank and weight with the same similarity column
        top_sim = col_sim[top_n_items]

        # One matrix-vector product scores every user for this item at once
        pred[:, col] = ratings_arr[:, top_n_items].dot(top_sim) / np.sum(np.abs(top_sim))
    return pred

In [34]:
# Re-predict using only the 20 most similar movies per item, then re-evaluate
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values,
                                    n = 20)
print('MSE:', get_mse(ratings_pred, ratings_matrix.values))

MSE: 3.695009387428144


In [36]:
# The MSE was in the 9s before and has improved to the 3s.
# When predicting from items or users, using only a few of the most similar
# entries often performs better than using all of the data.

# Build a DataFrame from the improved predictions
ratings_pred_matrix = pd.DataFrame(data = ratings_pred, index = ratings_matrix.index,
                                  columns = ratings_matrix.columns)
ratings_pred_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.221,0.0,0.0,1.677,0.284
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.221,0.0,0.0,0.195,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Pick one user (userId 9) and list the movies they actually rated highest
user_rating_id = ratings_matrix.loc[9]
user_rating_id[user_rating_id > 0].sort_values(ascending = False).head(10)

title
Adaptation (2002)                                                                 5.0
Citizen Kane (1941)                                                               5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Producers, The (1968)                                                             5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Back to the Future (1985)                                                         5.0
Austin Powers in Goldmember (2002)                                                5.0
Minority Report (2002)                                                            4.0
Witness (1985)                                                                    4.0
Name: 9, dtype: float64

In [40]:
# Return the list of movies the user has not seen
def get_unseen_movies(ratings_matrix, userId) :
    """List the movies (column labels) that ``userId`` has not rated.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Rating matrix indexed by user id with one column per movie.
    userId
        Index label of the user's row.

    Returns
    -------
    list
        Column labels, in column order, whose rating for this user is not > 0.
    """
    user_rating = ratings_matrix.loc[userId, :]
    # A rating above 0 means the movie was already seen; a set keeps the
    # membership test below O(1) instead of scanning a list for every movie.
    already_seen = set(user_rating[user_rating > 0].index)
    # Keep only the movies that are not in already_seen
    unseen_list = [movie for movie in ratings_matrix.columns.tolist()
                   if movie not in already_seen]

    return unseen_list

In [41]:
# Return the unseen movies with the highest predicted ratings for a user
def recomm_movie_by_userId(pred_df, userId, unseen_list, top_n = 10) :
    """Select the ``top_n`` highest predicted ratings among a user's unseen movies.

    ``pred_df`` is indexed by user id with one column per movie; the result is
    the user's row restricted to ``unseen_list``, sorted in descending order
    and truncated to ``top_n`` entries.
    """
    unseen_scores = pred_df.loc[userId, unseen_list]
    ranked = unseen_scores.sort_values(ascending = False)
    return ranked.iloc[: top_n]
    

In [44]:
# Get the list of movies that user 9 has not seen
unseen_list = get_unseen_movies(ratings_matrix, 9)
# unseen_list

# Show the 10 unseen movies with the highest predicted ratings
recomm_movies = recomm_movie_by_userId(ratings_pred_matrix, 9,
                                       unseen_list, top_n = 10)
print(recomm_movies)

title
Shrek (2001)                                                                                      0.866
Spider-Man (2002)                                                                                 0.858
Last Samurai, The (2003)                                                                          0.817
Indiana Jones and the Temple of Doom (1984)                                                       0.817
Matrix Reloaded, The (2003)                                                                       0.801
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)    0.765
Gladiator (2000)                                                                                  0.741
Matrix, The (1999)                                                                                0.733
Pirates of the Caribbean: The Curse of the Black Pearl (2003)                                     0.690
Lord of the Rings: The Return of the King, The (2003)     

In [None]:
# 작업 과정
# 1. 각 영화 간의 유사도를 측정
# 2. 유저가 매긴 평점을 기반으로 해서 유사도가 높은 20개의 영화를 추출해서
# 그 영화들의 평점을 가지고 보지 않은 영화의 평점을 예측한 것


# 행렬 분해

## SVD

In [45]:
from numpy.linalg import svd

A = np.array([[3, -1], [1, 3], [1, 0]])

# The third returned matrix comes back already transposed, so its variable name conventionally ends in T
U, S, VT = svd(A)
print(U) # A has 3 rows, so this is a 3x3 matrix
print(S) # the 2 singular values
print(VT) # A has 2 columns, so this is a 2x2 matrix

[[-9.045e-01  3.162e-01 -2.860e-01]
 [-3.015e-01 -9.487e-01 -9.535e-02]
 [-3.015e-01 -1.082e-17  9.535e-01]]
[3.317 3.162]
[[-1. -0.]
 [-0. -1.]]


In [49]:
# print(U @ S @ VT) # does not work: S is a 1-D array of singular values, so the shapes do not match
# To rebuild the original matrix, place S on the diagonal of a zero matrix of
# shape (rows of U, rows of VT) so it can be multiplied between U and VT.
# Unlike slicing np.diag(S, 1), this works for any shape of the original matrix.
temp = np.zeros((U.shape[0], VT.shape[0]))
temp[:S.size, :S.size] = np.diag(S)
print( U @ temp @ VT) # the last 0 of A comes back as a tiny non-zero value (floating-point round-off)

[[ 3.00e+00 -1.00e+00]
 [ 1.00e+00  3.00e+00]
 [ 1.00e+00  3.42e-17]]


In [None]:
# 