# Exp 14. 영화 추천

### 1) 라이브러리 및 데이터 불러오기

In [1]:
import os
import pandas as pd
import numpy as np

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [4]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [5]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings = pd.merge(ratings, movies)
ratings = ratings[["user_id", "movie_id", "title", "counts"]]
ratings.sort_values('user_id')

Unnamed: 0,user_id,movie_id,title,counts
0,1,1193,One Flew Over the Cuckoo's Nest (1975),5
31113,1,2294,Antz (1998),4
31674,1,3186,"Girl, Interrupted (1999)",4
32044,1,1566,Hercules (1997),4
32415,1,588,Aladdin (1992),4
...,...,...,...,...
657728,6040,334,Vanya on 42nd Street (1994),4
393446,6040,1294,M*A*S*H (1970),4
253075,6040,994,Big Night (1996),3
127665,6040,2396,Shakespeare in Love (1998),3


### 2) 분석

In [7]:
print('ratings에 있는 유니크한 영화 개수 : {}'.format(ratings['movie_id'].nunique()))
print('ratings에 있는 유니크한 사용자 수 : {}'.format(ratings['user_id'].nunique()))

ratings에 있는 유니크한 영화 개수 : 3628
ratings에 있는 유니크한 사용자 수 : 6039


In [8]:
movie_cnt = ratings.groupby('title')['user_id'].count()
movie_cnt.sort_values(ascending=False).head(30)

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

### 3) 선호하는 영화

In [9]:
favorite_movie_genre = movies.loc[movies['genre'].str.contains('Fantasy')].copy()
favorite_movie_genre['title'].unique()

array(['Jumanji (1995)', 'Kids of the Round Table (1995)',
       'Indian in the Cupboard, The (1995)',
       'NeverEnding Story III, The (1994)', 'Heavenly Creatures (1994)',
       "Kid in King Arthur's Court, A (1995)",
       'Star Wars: Episode IV - A New Hope (1977)',
       'Santa Clause, The (1994)', 'Mask, The (1994)',
       'Pagemaster, The (1994)', 'Dragonheart (1996)', 'Space Jam (1996)',
       'Nutty Professor, The (1996)', 'Hungarian Fairy Tale, A (1987)',
       'Kazaam (1996)', 'Bogus (1996)', 'Escape to Witch Mountain (1975)',
       '20,000 Leagues Under the Sea (1954)',
       'Willy Wonka and the Chocolate Factory (1971)',
       'E.T. the Extra-Terrestrial (1982)', 'Drop Dead Fred (1991)',
       'Warriors of Virtue (1997)', 'Simple Wish, A (1997)',
       'FairyTale: A True Story (1997)', 'Flubber (1997)',
       'Star Kid (1997)', 'Borrowers, The (1997)',
       'Quest for Camelot (1998)', 'Small Soldiers (1998)',
       'Labyrinth (1986)', 'Goonies, The (1985

In [10]:
favorite_movie_title = ['Jumanji (1995)',
                        'Star Wars: Episode IV - A New Hope (1977)',
                        'Dragonheart (1996)',
                        'Star Wars: Episode I - The Phantom Menace (1999)',
                        'Mask, The (1994)']
favorite_movies = pd.DataFrame({'user_id': ['6039']*5, 'title': favorite_movie_title, 
                                'movie_id': movies[movies['title'].isin(favorite_movie_title)]['movie_id'], 'counts':[5]*5})
favorite_movies

Unnamed: 0,user_id,title,movie_id,counts
1,6039,Jumanji (1995),2,5
257,6039,Star Wars: Episode IV - A New Hope (1977),260,5
363,6039,Dragonheart (1996),367,5
647,6039,Star Wars: Episode I - The Phantom Menace (1999),653,5
2559,6039,"Mask, The (1994)",2628,5


In [11]:
if not ratings.isin({'user_id':['6039']})['user_id'].any():
    ratings = ratings.append(favorite_movies, ignore_index=True)

ratings.tail(10)

Unnamed: 0,user_id,movie_id,title,counts
836473,5851,3607,One Little Indian (1973),5
836474,5854,3026,Slaughterhouse (1987),4
836475,5854,690,"Promise, The (Versprechen, Das) (1994)",3
836476,5938,2909,"Five Wives, Three Secretaries and Me (1998)",4
836477,5948,1360,Identification of a Woman (Identificazione di ...,5
836478,6039,2,Jumanji (1995),5
836479,6039,260,Star Wars: Episode IV - A New Hope (1977),5
836480,6039,367,Dragonheart (1996),5
836481,6039,653,Star Wars: Episode I - The Phantom Menace (1999),5
836482,6039,2628,"Mask, The (1994)",5


In [12]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['title'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [13]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data
else:
    print('user_id column indexing Fail!!')

temp_movie_data = ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

ratings

user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,title,counts
0,0,0,One Flew Over the Cuckoo's Nest (1975),5
1,1,0,One Flew Over the Cuckoo's Nest (1975),5
2,2,0,One Flew Over the Cuckoo's Nest (1975),4
3,3,0,One Flew Over the Cuckoo's Nest (1975),4
4,4,0,One Flew Over the Cuckoo's Nest (1975),5
...,...,...,...,...
836478,6039,513,Jumanji (1995),5
836479,6039,44,Star Wars: Episode IV - A New Hope (1977),5
836480,6039,173,Dragonheart (1996),5
836481,6039,60,Star Wars: Episode I - The Phantom Menace (1999),5


### 4) CSR matrix

In [14]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings['counts'], (ratings['user_id'], ratings['movie_id'])), shape=(num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### 5) 모델 생성 및 훈련

In [15]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [16]:
als_model = AlternatingLeastSquares(factors=200, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)

In [17]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [18]:
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

### 6) 선호도 파악

In [19]:
my, jumanji = user_to_idx[6039], movie_to_idx['Jumanji (1995)']
my_vector, jumanji_vector = als_model.user_factors[my], als_model.item_factors[jumanji]

In [20]:
my_vector

array([-0.31249332,  0.12718166,  0.26611   , -0.18086724,  0.07412572,
        1.6267239 , -1.6363617 , -0.40628105,  1.3532284 , -1.1189134 ,
        0.88072914, -0.7983218 , -1.3761437 , -0.13419305, -0.92796296,
        0.254639  ,  0.24736312, -2.4984725 , -0.48466623, -0.49043107,
        0.7694245 , -2.806112  ,  0.06562494, -0.4589703 ,  0.6034861 ,
        2.8480961 , -1.8063004 ,  0.24308814,  0.27633858, -0.97311014,
       -1.8330153 ,  0.7777671 ,  2.666883  ,  1.406802  , -0.48624295,
        1.6862243 , -1.2554765 ,  1.1107799 , -0.02632294, -0.70589083,
       -0.03626778, -0.38931218, -0.02005378,  2.1779695 ,  1.8284215 ,
        0.01565927,  1.9333401 ,  0.63492703,  0.54141885, -0.8414308 ,
       -2.5609214 , -0.61087275,  1.2628998 ,  2.189325  , -2.936416  ,
       -1.9410176 ,  0.58689576,  0.19538976,  1.9985044 ,  0.9040436 ,
       -0.8568584 , -2.3671799 ,  1.6594874 , -0.9862287 ,  0.21653551,
        1.9603248 ,  0.12585112, -1.254277  , -3.4754791 , -0.76

In [21]:
jumanji_vector

array([ 1.10664954e-02, -1.60373573e-03,  2.53787283e-02, -3.64773674e-03,
        8.78028758e-03, -4.15557949e-03,  8.30163714e-03,  1.15698995e-02,
       -1.24081634e-02, -5.15020499e-03,  6.87358016e-03, -3.36710154e-03,
        1.31974956e-02, -2.96138357e-02, -1.62001029e-02,  9.62160435e-03,
       -2.93142591e-02,  6.76875142e-03,  9.52741317e-03, -2.20155008e-02,
        9.24298633e-03,  3.95818241e-03,  5.29975863e-03, -5.16238669e-03,
       -1.00114918e-03,  4.67356807e-03,  4.90128202e-03,  6.25450676e-03,
        2.88378284e-03,  1.15115130e-02,  8.07188731e-03,  8.40836763e-03,
        2.33238190e-02,  7.31994864e-03, -1.22715840e-02,  1.69586726e-02,
        1.39475856e-02, -1.85197871e-03, -6.00552943e-04,  1.29291480e-02,
        1.87520944e-02,  7.82776158e-03,  9.32002813e-03,  1.77944638e-03,
       -1.32733220e-02, -1.80650852e-03, -3.03523857e-02,  3.21450233e-02,
        1.88847762e-02, -1.34796964e-03,  8.40489566e-03,  2.02775467e-02,
       -1.69746075e-02,  

In [22]:
np.dot(my_vector, jumanji_vector)

0.06560924

### 7) 선택한 영화와 비슷한 영화 추천

In [23]:
favorite_movie = 'Jumanji (1995)'
movie_id = movie_to_idx['Jumanji (1995)']
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(513, 0.99999994),
 (596, 0.6781126),
 (1130, 0.6020307),
 (173, 0.58874166),
 (828, 0.5225777),
 (1736, 0.48039496),
 (545, 0.4610989),
 (1982, 0.46020842),
 (1985, 0.45647198),
 (935, 0.450898),
 (576, 0.4475639),
 (942, 0.4297157),
 (1733, 0.42957133),
 (687, 0.42790833),
 (2017, 0.42218864)]

In [24]:
idx_to_title = {v:k for k,v in movie_to_idx.items()}
[idx_to_title[i[0]] for i in similar_movie]

['Jumanji (1995)',
 'Hook (1991)',
 'Indian in the Cupboard, The (1995)',
 'Dragonheart (1996)',
 'Flubber (1997)',
 'Small Soldiers (1998)',
 'Santa Clause, The (1994)',
 'Space Jam (1996)',
 'NeverEnding Story II: The Next Chapter, The (1990)',
 'Casper (1995)',
 'Escape to Witch Mountain (1975)',
 'Legend (1985)',
 'Borrowers, The (1997)',
 'Simple Wish, A (1997)',
 'Pagemaster, The (1994)']

In [25]:
def get_similar_movie(movie_title: str):
    movie_id = movie_to_idx[movie_title]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_title[i[0]] for i in similar_movie]
    return similar_movie

In [26]:
get_similar_movie('Dragonheart (1996)')

['Dragonheart (1996)',
 'Jumanji (1995)',
 'Hook (1991)',
 'Borrowers, The (1997)',
 'Indian in the Cupboard, The (1995)',
 'NeverEnding Story II: The Next Chapter, The (1990)',
 'Legend (1985)',
 'Flubber (1997)',
 'Space Jam (1996)',
 'Small Soldiers (1998)']

### 8) 좋아할 만한 영화 추천

In [27]:
user = user_to_idx[6039]

movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(1054, 1.3170713),
 (1455, 1.0216215),
 (8, 0.9308334),
 (658, 0.91855264),
 (117, 0.9154941),
 (14, 0.7917276),
 (1398, 0.7850543),
 (2032, 0.74985003),
 (619, 0.7352221),
 (1284, 0.715415),
 (1094, 0.69715947),
 (1096, 0.66916096),
 (46, 0.6584852),
 (536, 0.6570815),
 (19, 0.6551836),
 (911, 0.6548914),
 (1385, 0.65431565),
 (161, 0.65396047),
 (10, 0.6457759),
 (29, 0.64138055)]

In [28]:
[idx_to_title[i[0]] for i in movie_recommended]

['Some Like It Hot (1959)',
 'Holiday Inn (1942)',
 'Snow White and the Seven Dwarfs (1937)',
 'White Christmas (1954)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Sound of Music, The (1965)',
 'South Pacific (1958)',
 'On the Town (1949)',
 'Alice in Wonderland (1951)',
 'It Happened One Night (1934)',
 'Double Indemnity (1944)',
 'Manchurian Candidate, The (1962)',
 'Dumbo (1941)',
 'Jungle Book, The (1967)',
 'Big (1988)',
 'Sleeper (1973)',
 'Spellbound (1945)',
 'Graduate, The (1967)',
 'Beauty and the Beast (1991)',
 'Close Shave, A (1995)']

### 9) 기여도 확인

In [29]:
movie_id = movie_to_idx['Hook (1991)']

explain = als_model.explain(user, csr_data, itemid=movie_id)
[(idx_to_title[i[0]], i[1]) for i in explain[1]]

[('Guys and Dolls (1955)', 0.053160190207180026),
 ('Close Encounters of the Third Kind (1977)', 0.040466521369167126),
 ('Wings of Desire (Der Himmel über Berlin) (1987)', 0.040413250798667075),
 ('Cool Hand Luke (1967)', 0.03865806974876154),
 ('Mister Roberts (1955)', 0.036200749538347614),
 ('Pocahontas (1995)', 0.035543804770057925),
 ('Little Shop of Horrors (1986)', 0.03371443987726105),
 ('Galaxy Quest (1999)', 0.032943629600979435),
 ('Producers, The (1968)', 0.03208598758019823),
 ('Beetlejuice (1988)', 0.03191043046986807)]

### 10) 회고

- 정확도가 엄청 뛰어나진 않음
- 그럭저럭 추천해 주는 것 같음