## 프로젝트 - Movielens 영화 추천 실습

### 1. 데이터 준비와 전처리

In [1]:
import pandas as pd
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
using_cols = ['user_id', 'movie_id', 'ratings']
ratings = ratings[using_cols]
ratings.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
movies['title'] = movies['title'].str.lower() # 검색을 쉽게 하기 위해 아티스트 문자열을 소문자로 바꿔줍시다.
movies['genre'] = movies['genre'].str.lower()
movies.head(10)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy
2,3,grumpier old men (1995),comedy|romance
3,4,waiting to exhale (1995),comedy|drama
4,5,father of the bride part ii (1995),comedy
5,6,heat (1995),action|crime|thriller
6,7,sabrina (1995),comedy|romance
7,8,tom and huck (1995),adventure|children's
8,9,sudden death (1995),action
9,10,goldeneye (1995),action|adventure|thriller


### 2. 분석해보기

In [8]:
# ratings에 있는 유니크한 영화 개수
ratings['movie_id'].nunique()

3628

In [9]:
# ratings에 있는 유니크한 사용자 수
ratings['user_id'].nunique()

6039

In [10]:
# dataFrame 합치기
ratings = pd.merge(ratings, movies)
ratings

Unnamed: 0,user_id,movie_id,counts,title,genre
0,1,1193,5,one flew over the cuckoo's nest (1975),drama
1,2,1193,5,one flew over the cuckoo's nest (1975),drama
2,12,1193,4,one flew over the cuckoo's nest (1975),drama
3,15,1193,4,one flew over the cuckoo's nest (1975),drama
4,17,1193,5,one flew over the cuckoo's nest (1975),drama
...,...,...,...,...,...
836473,5851,3607,5,one little indian (1973),comedy|drama|western
836474,5854,3026,4,slaughterhouse (1987),horror
836475,5854,690,3,"promise, the (versprechen, das) (1994)",romance
836476,5938,2909,4,"five wives, three secretaries and me (1998)",documentary


In [11]:
# 가장 인기 있는 영화 15개(인기순)
movie_count = ratings.groupby('title')['user_id'].count()
movie_count.sort_values(ascending=False).head(15)

title
american beauty (1999)                                   3211
star wars: episode iv - a new hope (1977)                2910
star wars: episode v - the empire strikes back (1980)    2885
star wars: episode vi - return of the jedi (1983)        2716
saving private ryan (1998)                               2561
terminator 2: judgment day (1991)                        2509
silence of the lambs, the (1991)                         2498
raiders of the lost ark (1981)                           2473
back to the future (1985)                                2460
matrix, the (1999)                                       2434
jurassic park (1993)                                     2413
sixth sense, the (1999)                                  2385
fargo (1996)                                             2371
braveheart (1995)                                        2314
men in black (1997)                                      2297
Name: user_id, dtype: int64

### 3. 내가 선호하는 영화를 5가지 골라서 ratings에 추가해 주기.

In [12]:
favorite_movie = ['men in black (1997)', 'terminator 2: judgment day (1991)', 'sixth sense, the (1999)', 'matrix, the (1999)', 'jumanji (1995)']

my_playlist = pd.DataFrame({'user_id': ['jjanggu']*5, 'title':favorite_movie, 'counts':[5]*5})

if not ratings.isin({'user_id':['jjanggu']})['user_id'].any():
    ratings = ratings.append(my_playlist)
    
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,title,genre
836473,5851,3607.0,5,one little indian (1973),comedy|drama|western
836474,5854,3026.0,4,slaughterhouse (1987),horror
836475,5854,690.0,3,"promise, the (versprechen, das) (1994)",romance
836476,5938,2909.0,4,"five wives, three secretaries and me (1998)",documentary
836477,5948,1360.0,5,identification of a woman (identificazione di ...,drama
0,jjanggu,,5,men in black (1997),
1,jjanggu,,5,terminator 2: judgment day (1991),
2,jjanggu,,5,"sixth sense, the (1999)",
3,jjanggu,,5,"matrix, the (1999)",
4,jjanggu,,5,jumanji (1995),


In [13]:
# 결측치가 있는 movie_id, genre 제거해주기
using_cols = ['user_id', 'title','counts']
ratings = ratings[using_cols]
ratings.tail(10)

Unnamed: 0,user_id,title,counts
836473,5851,one little indian (1973),5
836474,5854,slaughterhouse (1987),4
836475,5854,"promise, the (versprechen, das) (1994)",3
836476,5938,"five wives, three secretaries and me (1998)",4
836477,5948,identification of a woman (identificazione di ...,5
0,jjanggu,men in black (1997),5
1,jjanggu,terminator 2: judgment day (1991),5
2,jjanggu,"sixth sense, the (1999)",5
3,jjanggu,"matrix, the (1999)",5
4,jjanggu,jumanji (1995),5


In [14]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = ratings['user_id'].unique()
title_unique = ratings['title'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
title_to_idx = {v:k for k,v in enumerate(title_unique)}

In [15]:
# user_id의 jjanggu가 잘 들어왔는지 확인하기. -> 원래는 6039였음.
ratings['user_id'].nunique()

6040

In [16]:
# 인덱싱이 잘 되었는지 확인해 봅니다. 
print(user_to_idx['jjanggu'])    # 6040명의 유저 중 마지막으로 추가된 유저이니 6039가 나와야 합니다. 
print(title_to_idx['matrix, the (1999)'])

6039
124


In [17]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_title_data = ratings['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) == len(ratings):
    print('title column indexing OK!!')
    ratings['title'] = temp_title_data
else:
    print('title column indexing Fail!!')

ratings

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,title,counts
0,0,0,5
1,1,0,5
2,2,0,4
3,3,0,4
4,4,0,5
...,...,...,...
0,6039,175,5
1,6039,92,5
2,6039,38,5
3,6039,124,5


### 4. CSR matrix 직접 만들어보기

In [18]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_title = ratings['title'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.title)), shape= (num_user, num_title))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### 5. als_model = AlternatingLeastSquares 모델을 구성하여 훈련하기

In [19]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [22]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [23]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [24]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

### 6. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해보자

In [26]:
jjanggu, matrix = user_to_idx['jjanggu'], title_to_idx['matrix, the (1999)']
jjanggu_vector, matrix_vector = als_model.user_factors[jjanggu], als_model.item_factors[matrix]

print('슝=3')

슝=3


In [27]:
jjanggu_vector

array([ 0.7754529 ,  0.09502576, -0.26743177, -0.00837536, -0.4278872 ,
       -0.6109361 , -1.3638098 ,  1.1258903 ,  1.2325747 , -0.16108908,
        0.31450543,  0.38356262, -0.82050467,  0.0159059 ,  0.22194964,
        0.2405438 , -0.8852333 , -0.8176474 ,  0.6586105 , -0.7490114 ,
       -0.31721303,  0.40842494, -0.18071318,  0.8986854 ,  0.21735537,
        0.32902712, -0.46692285, -0.18102188, -0.02323308,  0.373259  ,
       -0.01217461, -0.6008305 ,  0.65991133,  0.60673356,  0.22948346,
        0.65252304, -0.24385121,  0.44585353, -0.10261388, -0.3716366 ,
        0.01556433, -0.60477346,  0.48621804, -0.0488282 , -0.65127796,
        0.13387106,  0.8745431 , -1.2199664 ,  0.40283108,  0.32235518,
       -0.09064235, -0.5169021 ,  0.34680828,  0.89028805,  0.20698193,
        0.3549837 ,  0.35942265,  0.3795863 ,  0.3810695 , -0.45530593,
        0.24872287, -0.06946388,  0.5040696 ,  0.5699538 , -0.11583844,
       -0.43322754,  0.14376794, -0.832143  ,  0.9630044 , -0.55

In [28]:
matrix_vector

array([ 0.03260789,  0.01699837,  0.00661135,  0.01896729,  0.00740881,
        0.00939128, -0.02247234,  0.04833619,  0.0333858 , -0.00749802,
        0.00931608,  0.01357657,  0.00116186, -0.0211261 ,  0.0246095 ,
       -0.00560935, -0.01656874,  0.00183353,  0.02019135, -0.00591024,
        0.020101  ,  0.0223523 , -0.00570514,  0.03229266,  0.02446358,
        0.01580289, -0.01432711,  0.01552579, -0.02001198,  0.01688506,
        0.01029136, -0.02113801,  0.01301845,  0.01563139, -0.00777215,
        0.02319167, -0.0011389 , -0.00304867,  0.00145579, -0.00396361,
        0.00636566, -0.0290603 ,  0.01685455,  0.01585375, -0.0167153 ,
       -0.00228183,  0.01711491, -0.02873446,  0.02384799,  0.015565  ,
        0.00455681, -0.00065147, -0.00168897,  0.02893121,  0.02826177,
        0.02360362,  0.02361261,  0.02520653,  0.01631624,  0.02656337,
        0.00476165,  0.01550716,  0.00609755,  0.00748627,  0.00056069,
        0.00011827,  0.00631454, -0.02005043,  0.04253076, -0.01

In [29]:
# jjanggu와 matrix를 내적하는 코드
np.dot(jjanggu_vector, matrix_vector)

0.6746999

In [30]:
predict = title_to_idx['slaughterhouse (1987)']
predict_vector = als_model.item_factors[predict]
np.dot(jjanggu_vector, predict_vector)

0.0038817262

호러물이라 그런지 예측률이 굉장히 떨어진다.

In [31]:
predict = title_to_idx['terminator 2: judgment day (1991)']
predict_vector = als_model.item_factors[predict]
np.dot(jjanggu_vector, predict_vector)

0.73318833

액션과 관련된 영화다보니 정확도가 높게 나오는 것을 볼 수 있다.

### 7. 내가 좋아하는 영화와 비슷한 영화를 추천받기.

In [32]:
# `jumanji`로 찾아보기
favorite_title = 'jumanji (1995)'
title_id = title_to_idx[favorite_title]
similar_title = als_model.similar_items(title_id, N=15)
similar_title

[(513, 1.0000001),
 (596, 0.83387876),
 (1130, 0.80527145),
 (173, 0.7396748),
 (828, 0.696585),
 (1982, 0.67672384),
 (545, 0.65824836),
 (1736, 0.654529),
 (1733, 0.65421283),
 (1985, 0.65345156),
 (561, 0.6170842),
 (458, 0.60230774),
 (2017, 0.59314555),
 (576, 0.5915305),
 (1238, 0.5903835)]

In [33]:
#artist_to_idx 를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성합니다. 
idx_to_title = {v:k for k,v in title_to_idx.items()}
[idx_to_title[i[0]] for i in similar_title]

['jumanji (1995)',
 'hook (1991)',
 'indian in the cupboard, the (1995)',
 'dragonheart (1996)',
 'flubber (1997)',
 'space jam (1996)',
 'santa clause, the (1994)',
 'small soldiers (1998)',
 'borrowers, the (1997)',
 'neverending story ii: the next chapter, the (1990)',
 'neverending story, the (1984)',
 'mask, the (1994)',
 'pagemaster, the (1994)',
 'escape to witch mountain (1975)',
 'mighty joe young (1998)']

In [34]:
def get_similar_title(title_name: str):
    title_id = title_to_idx[title_name]
    similar_title = als_model.similar_items(title_id)
    similar_title = [idx_to_title[i[0]] for i in similar_title]
    return similar_title

print("슝=3")

슝=3


In [35]:
get_similar_title('sudden death (1995)')

['sudden death (1995)',
 'terminal velocity (1994)',
 'money train (1995)',
 'glimmer man, the (1996)',
 'maximum risk (1996)',
 'firestorm (1998)',
 'fire down below (1997)',
 'substitute, the (1996)',
 'fair game (1995)',
 'surviving the game (1994)']

### 8. 내가 가장 좋아할 만한 영화들을 추천받아 보자.

In [36]:
user = user_to_idx['jjanggu']
# recommend에서는 user*item CSR Matrix를 받습니다.
title_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
title_recommended

[(107, 0.6262224),
 (60, 0.46990258),
 (62, 0.4607547),
 (141, 0.32683367),
 (87, 0.30060068),
 (200, 0.29581335),
 (75, 0.28717735),
 (121, 0.28568807),
 (458, 0.27995265),
 (145, 0.2768397),
 (233, 0.26918936),
 (44, 0.2583399),
 (1090, 0.2579115),
 (64, 0.25779283),
 (670, 0.24156149),
 (82, 0.23829117),
 (51, 0.23746763),
 (150, 0.23419665),
 (375, 0.2333191),
 (220, 0.22510773)]

In [37]:
[idx_to_title[i[0]] for i in title_recommended]

['jurassic park (1993)',
 'star wars: episode i - the phantom menace (1999)',
 'total recall (1990)',
 'fugitive, the (1993)',
 'braveheart (1995)',
 'terminator, the (1984)',
 'hunt for red october, the (1990)',
 'silence of the lambs, the (1991)',
 'mask, the (1994)',
 'fifth element, the (1997)',
 'usual suspects, the (1995)',
 'star wars: episode iv - a new hope (1977)',
 'nikita (la femme nikita) (1990)',
 'star wars: episode vi - return of the jedi (1983)',
 'galaxy quest (1999)',
 'lost world: jurassic park, the (1997)',
 'fargo (1996)',
 'independence day (id4) (1996)',
 'face/off (1997)',
 'seven (se7en) (1995)']

In [38]:
jurassic = title_to_idx['jurassic park (1993)']
explain = als_model.explain(user, csr_data, itemid=jurassic)

In [39]:
# 추천한 콘텐츠 점수에 기여한 다른 콘텐츠와 기여도를 반환.

[(idx_to_title[i[0]], i[1]) for i in explain[1]]

[('men in black (1997)', 0.2941572409607435),
 ('terminator 2: judgment day (1991)', 0.20975682922784875),
 ('matrix, the (1999)', 0.09967301609578036),
 ('jumanji (1995)', 0.016251777925469366),
 ('sixth sense, the (1999)', -0.004177837097566334)]

### 위의 모델 학습 후 예측한 선호도를 보니 0.6746999으로 생각보다 낮음을 볼 수 있다. 

### 하이퍼파라미터 factors, iterations 값을 수정(늘리기)을 좀 더 해봐야겠다..!

factors = 200, iterations = 30  ---> 0.79080987  |  예측: `terminator 2: judgment day (1991)` --> 0.8394091  <br/>
factors = 200, iterations = 50  ---> 0.79016745  |  예측: `terminator 2: judgment day (1991)` --> 0.8427951 <br/>
factors = 150, iterations = 50  ---> 0.7498393  |  예측: `terminator 2: judgment day (1991)` --> 0.8011485 <br/>
factors = 300, iterations = 50  ---> 0.89354235  |  예측: `terminator 2: judgment day (1991)` --> 0.85713214 <br/>
factors = 300, iterations = 100  ---> 0.8826167  |  예측: `terminator 2: judgment day (1991)` --> 0.8675973 <br/>
factors = 400, iterations = 100  ---> 0.9234716  |  예측: `terminator 2: judgment day (1991)` --> 0.91244197 <br/>
factors = 400, iterations = 150  ---> 0.92792326  |  예측: `terminator 2: judgment day (1991)` --> 0.90877223 <br/>
factors = 500, iterations = 100  ---> 0.95318717  |  예측: `terminator 2: judgment day (1991)` --> 0.9394018 <br/>
factors = 600, iterations = 100  ---> 0.97024214  |  예측: `terminator 2: judgment day (1991)` --> 0.9554564 <br/>
factors = 700, iterations = 100  ---> 0.97785896  |  예측: `terminator 2: judgment day (1991)` --> 0.9655837 <br/>
factors = 800, iterations = 100  ---> 0.9817859  |  예측: `terminator 2: judgment day (1991)` --> 0.9758792 <br/>
factors = 1000, iterations = 100  ---> 0.98794925  |  예측: `terminator 2: judgment day (1991)` --> 0.9887543 <br/>
factors = 1200, iterations = 100  ---> 0.99171257  |  예측: `terminator 2: judgment day (1991)` --> 0.9924315

In [104]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=1200, regularization=0.01, use_gpu=False, iterations=100, dtype=np.float32)

In [105]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [106]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/100 [00:00<?, ?it/s]

In [107]:
jjanggu, matrix = user_to_idx['jjanggu'], title_to_idx['matrix, the (1999)']
jjanggu_vector, matrix_vector = als_model.user_factors[jjanggu], als_model.item_factors[matrix]

print('슝=3')

슝=3


In [108]:
# jjanggu와 matrix를 내적하는 코드
np.dot(jjanggu_vector, matrix_vector)

0.99171257

In [109]:
predict = title_to_idx['slaughterhouse (1987)']
predict_vector = als_model.item_factors[predict]
np.dot(jjanggu_vector, predict_vector)

-0.00018347634

In [110]:
predict = title_to_idx['terminator 2: judgment day (1991)']
predict_vector = als_model.item_factors[predict]
np.dot(jjanggu_vector, predict_vector)

0.9924315

In [119]:
predict = title_to_idx['star wars: episode iv - a new hope (1977)']
predict_vector = als_model.item_factors[predict]
np.dot(jjanggu_vector, predict_vector)

0.006912405

In [111]:
# `jumanji`로 찾아보기
favorite_title = 'jumanji (1995)'
title_id = title_to_idx[favorite_title]
similar_title = als_model.similar_items(title_id, N=15)
similar_title

[(513, 1.0),
 (3626, 0.28837717),
 (3592, 0.28696677),
 (3617, 0.28227794),
 (3559, 0.2822112),
 (3373, 0.28105974),
 (3581, 0.28102311),
 (3479, 0.28026178),
 (3472, 0.27973837),
 (3518, 0.2796763),
 (3590, 0.27959472),
 (3618, 0.27889743),
 (3564, 0.27881297),
 (3578, 0.27859724),
 (3573, 0.27858725)]

In [112]:
#artist_to_idx 를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성합니다. 
idx_to_title = {v:k for k,v in title_to_idx.items()}
[idx_to_title[i[0]] for i in similar_title]

['jumanji (1995)',
 'five wives, three secretaries and me (1998)',
 'dangerous game (1993)',
 'slappy and the stinkers (1998)',
 'zachariah (1971)',
 'railroaded! (1947)',
 'death in the garden (mort en ce jardin, la) (1956)',
 'yankee zulu (1994)',
 'show, the (1995)',
 'napoleon and samantha (1972)',
 'alley cats, the (1968)',
 'nemesis 2: nebula (1995)',
 'impact (1949)',
 'number seventeen (1932)',
 'male and female (1919)']

In [113]:
def get_similar_title(title_name: str):
    title_id = title_to_idx[title_name]
    similar_title = als_model.similar_items(title_id)
    similar_title = [idx_to_title[i[0]] for i in similar_title]
    return similar_title

print("슝=3")

슝=3


In [114]:
get_similar_title('sudden death (1995)')

['sudden death (1995)',
 'tough and deadly (1995)',
 'double team (1997)',
 "in god's hands (1998)",
 'power 98 (1995)',
 'make them die slowly (cannibal ferox) (1980)',
 'aiqing wansui (1994)',
 "heaven's burning (1997)",
 'quest, the (1996)',
 'project moon base (1953)']

In [115]:
user = user_to_idx['jjanggu']
# recommend에서는 user*item CSR Matrix를 받습니다.
title_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
title_recommended

[(1130, 0.06728478),
 (2017, 0.05040644),
 (1518, 0.042066798),
 (2520, 0.041966695),
 (828, 0.04061906),
 (2184, 0.04049965),
 (1369, 0.040300846),
 (2452, 0.039353944),
 (1731, 0.03917837),
 (1355, 0.038359087),
 (1907, 0.036780108),
 (1749, 0.036056366),
 (1416, 0.035977375),
 (324, 0.035880435),
 (1463, 0.03583949),
 (2196, 0.03536327),
 (62, 0.03485647),
 (616, 0.03481446),
 (1095, 0.03479961),
 (1563, 0.034284692)]

In [116]:
[idx_to_title[i[0]] for i in title_recommended]

['indian in the cupboard, the (1995)',
 'pagemaster, the (1994)',
 'free willy (1993)',
 'it came from hollywood (1982)',
 'flubber (1997)',
 'thinner (1996)',
 '101 dalmatians (1996)',
 'drop dead fred (1991)',
 'picnic at hanging rock (1975)',
 'king and i, the (1956)',
 'three days of the condor (1975)',
 'iron eagle (1986)',
 'of mice and men (1992)',
 'dead man (1995)',
 'bananas (1971)',
 'giant (1956)',
 'total recall (1990)',
 'willow (1988)',
 'lifeboat (1944)',
 'love letter, the (1999)']

In [117]:
jurassic = title_to_idx['jurassic park (1993)']
explain = als_model.explain(user, csr_data, itemid=jurassic)

In [118]:
# 추천한 콘텐츠 점수에 기여한 다른 콘텐츠와 기여도를 반환.

[(idx_to_title[i[0]], i[1]) for i in explain[1]]

[('men in black (1997)', 0.006914259566398002),
 ('terminator 2: judgment day (1991)', 0.006381819634605416),
 ('sixth sense, the (1999)', 0.004026112956231499),
 ('matrix, the (1999)', 0.0033332964534698705),
 ('jumanji (1995)', 0.001311303700204055)]

### 회고
위에서 보면 알겠지만 하이퍼파라미터 factors와 iterations 값 변경으로 인해 모델이 예측한 선호도가 상승하는 모습을 볼 수 있었다. <br/> iterations는 epoch와 같은 의미였는데 일정 수치를 넘어가니 overfitting이 발생하는 모습을 볼 수 있었다. <br/> 그리고 factors를 계속 크게 주었는데 계속 선호도가 상승하는 모습을 볼 수 있었다. <br/> user와 item 벡터의 차원을 늘리니 상승했다는건데 어떤 이유로 상승했는지는 잘 모르겠다. 앞으로 공부를 통해 점차 알아가보도록 해야겠다.