# word2vec, SVM을 이용한 영화 추천시스템

In [1]:
import json
from collections import Counter
import numpy as np
import random

# keras -> Embedding
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot

# sklearn -> LinearRegression
from sklearn.linear_model import LinearRegression

# SVM 분류기
from sklearn import svm

# 1. 데이터 불러오기

In [2]:
# Load the movie dump: one JSON document per line (NDJSON).
# The encoding is pinned to UTF-8 because the records contain non-ASCII names
# (e.g. 'Stefan Kapičić'); relying on the platform default encoding can fail
# or mis-decode them on systems whose default is not UTF-8.
with open('wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(l) for l in fin]

## 데이터 확인 (총 10,000개의 영화, list와 딕셔너리 구조)

In [3]:
print(len(movies))
print(type(movies))

10000
<class 'list'>


In [4]:
movies[0]

['Deadpool (film)',
 {'image': 'Deadpool poster.jpg',
  'name': 'Deadpool',
  'cinematography': 'Ken Seng',
  'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'distributor': '20th Century Fox',
  'caption': 'Theatrical release poster',
  'gross': '$783.1 million',
  'country': 'United States',
  'director': 'Tim Miller',
  'runtime': '108 minutes',
  'editing': 'Julian Clarke',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'budget': '$58 million'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [5]:
movies[:3]

[['Deadpool (film)',
  {'image': 'Deadpool poster.jpg',
   'name': 'Deadpool',
   'cinematography': 'Ken Seng',
   'Software Used': 'Adobe Premier Pro',
   'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
   'distributor': '20th Century Fox',
   'caption': 'Theatrical release poster',
   'gross': '$783.1 million',
   'country': 'United States',
   'director': 'Tim Miller',
   'runtime': '108 minutes',
   'editing': 'Julian Clarke',
   'language': 'English',
   'music': 'Tom Holkenborg',
   'budget': '$58 million'},
  ['Tim Miller (director)',
   'Simon Kinberg',
   'Ryan Reynolds',
   'Lauren Shuler Donner',
   'Rhett Reese',
   'Paul Wernick',
   'Deadpool',
   'Fabian Nicieza',
   'Rob Liefeld',
   'Morena Baccarin',
   'Ed Skrein',
   'T.J. Miller',
   'Gina Carano',
   'Leslie Uggams',
   'Brianna Hildebrand'

## 첫번째 영화 Deadpool 구조 분석

In [6]:
# 첫번째 영화는 Deadpool
movies[0]

['Deadpool (film)',
 {'image': 'Deadpool poster.jpg',
  'name': 'Deadpool',
  'cinematography': 'Ken Seng',
  'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'distributor': '20th Century Fox',
  'caption': 'Theatrical release poster',
  'gross': '$783.1 million',
  'country': 'United States',
  'director': 'Tim Miller',
  'runtime': '108 minutes',
  'editing': 'Julian Clarke',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'budget': '$58 million'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [7]:
# Deadpool의 영화 제목
movies[0][0]

'Deadpool (film)'

In [8]:
# Deadpool의 영화 개요
movies[0][1]

{'image': 'Deadpool poster.jpg',
 'name': 'Deadpool',
 'cinematography': 'Ken Seng',
 'Software Used': 'Adobe Premier Pro',
 'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
 'distributor': '20th Century Fox',
 'caption': 'Theatrical release poster',
 'gross': '$783.1 million',
 'country': 'United States',
 'director': 'Tim Miller',
 'runtime': '108 minutes',
 'editing': 'Julian Clarke',
 'language': 'English',
 'music': 'Tom Holkenborg',
 'budget': '$58 million'}

In [9]:
# Deadpool의 외부링크(태그)들 - 총 405개
print(len(movies[0][2]))
movies[0][2]

405


['Tim Miller (director)',
 'Simon Kinberg',
 'Ryan Reynolds',
 'Lauren Shuler Donner',
 'Rhett Reese',
 'Paul Wernick',
 'Deadpool',
 'Fabian Nicieza',
 'Rob Liefeld',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller',
 'Gina Carano',
 'Leslie Uggams',
 'Brianna Hildebrand',
 'Stefan Kapičić',
 'Junkie XL',
 'Julian Clarke',
 'Marvel Entertainment',
 'Kinberg Genre',
 'Lauren Shuler Donner',
 'TSG Entertainment',
 '20th Century Fox',
 'Le Grand Rex',
 'Variety (magazine)',
 'Box Office Mojo',
 'superhero film',
 'Tim Miller (director)',
 'Rhett Reese',
 'Paul Wernick',
 'Marvel Comics',
 'Deadpool',
 'X-Men (film series)',
 'Ryan Reynolds',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller',
 'Gina Carano',
 'Leslie Uggams',
 'Brianna Hildebrand',
 'Stefan Kapičić',
 'antihero',
 'New Line Cinema',
 '20th Century Fox',
 'X-Men Origins: Wolverine',
 'principal photography',
 'Vancouver',
 'IMAX',
 'Digital Light Processing',
 'D-Box Technologies',
 'List of accolades received by Deadpool (

In [10]:
# 평점
movies[0][4]

'6.9/10'

## 외부 링크(태그) 소개

In [11]:
# Tally how often each outgoing link occurs across all movies.
# Every movie's link list (element 2 of the record) is flattened into a single
# stream and counted in one pass; repeated links inside one movie count each time.
link_counts = Counter(link for movie in movies for link in movie[2])
        

In [12]:
link_counts

Counter({'Tim Miller (director)': 9,
         'Simon Kinberg': 33,
         'Ryan Reynolds': 114,
         'Lauren Shuler Donner': 37,
         'Rhett Reese': 12,
         'Paul Wernick': 11,
         'Deadpool': 9,
         'Fabian Nicieza': 1,
         'Rob Liefeld': 2,
         'Morena Baccarin': 16,
         'Ed Skrein': 18,
         'T.J. Miller': 12,
         'Gina Carano': 26,
         'Leslie Uggams': 9,
         'Brianna Hildebrand': 8,
         'Stefan Kapičić': 6,
         'Junkie XL': 26,
         'Julian Clarke': 11,
         'Marvel Entertainment': 28,
         'Kinberg Genre': 6,
         'TSG Entertainment': 37,
         '20th Century Fox': 1147,
         'Le Grand Rex': 12,
         'Variety (magazine)': 5450,
         'Box Office Mojo': 4186,
         'superhero film': 91,
         'Marvel Comics': 258,
         'X-Men (film series)': 16,
         'antihero': 18,
         'New Line Cinema': 408,
         'X-Men Origins: Wolverine': 15,
         'principal photography'

In [13]:
# 가장 수가 많은 태그 순으로 정렬
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

# 2. 데이터 전처리 및 피쳐 엔지니어링

In [14]:
# Keep only the links that occur at least 3 times in total; rarer links are dropped.
top_links = [name for name in link_counts if link_counts[name] >= 3]

# Preview the first ten retained links.
top_links[:10]

['Tim Miller (director)',
 'Simon Kinberg',
 'Ryan Reynolds',
 'Lauren Shuler Donner',
 'Rhett Reese',
 'Paul Wernick',
 'Deadpool',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller']

## 나중에 조회를 위해 (링크, 영화) 쌍  작성

In [15]:
# Build the link -> integer index lookup: each retained link maps to its
# position in top_links.
link_to_idx = dict(zip(top_links, range(len(top_links))))

print(len(link_to_idx))
link_to_idx

66913


{'Tim Miller (director)': 0,
 'Simon Kinberg': 1,
 'Ryan Reynolds': 2,
 'Lauren Shuler Donner': 3,
 'Rhett Reese': 4,
 'Paul Wernick': 5,
 'Deadpool': 6,
 'Morena Baccarin': 7,
 'Ed Skrein': 8,
 'T.J. Miller': 9,
 'Gina Carano': 10,
 'Leslie Uggams': 11,
 'Brianna Hildebrand': 12,
 'Stefan Kapičić': 13,
 'Junkie XL': 14,
 'Julian Clarke': 15,
 'Marvel Entertainment': 16,
 'Kinberg Genre': 17,
 'TSG Entertainment': 18,
 '20th Century Fox': 19,
 'Le Grand Rex': 20,
 'Variety (magazine)': 21,
 'Box Office Mojo': 22,
 'superhero film': 23,
 'Marvel Comics': 24,
 'X-Men (film series)': 25,
 'antihero': 26,
 'New Line Cinema': 27,
 'X-Men Origins: Wolverine': 28,
 'principal photography': 29,
 'Vancouver': 30,
 'IMAX': 31,
 'D-Box Technologies': 32,
 'Golden Globe Award': 33,
 'Golden Globe Award for Best Motion Picture – Musical or Comedy': 34,
 'Golden Globe Award for Best Actor – Motion Picture Musical or Comedy': 35,
 'Producers Guild of America Award': 36,
 "Critics' Choice Movie Awards

In [16]:
# Build the movie title -> integer index lookup: the title is element 0 of each
# record and the index is the record's position in movies.
movie_to_idx = dict((record[0], position) for position, record in enumerate(movies))

print(len(movie_to_idx))
movie_to_idx

10000


{'Deadpool (film)': 0,
 'The Revenant (2015 film)': 1,
 'Suicide Squad (film)': 2,
 'Spectre (2015 film)': 3,
 'Rebel Without a Cause': 4,
 'Warcraft (film)': 5,
 'The Martian (film)': 6,
 'List of Marvel Cinematic Universe films': 7,
 'X-Men (film series)': 8,
 'The Hateful Eight': 9,
 'The Jungle Book (2016 film)': 10,
 'The Big Short (film)': 11,
 '10 Cloverfield Lane': 12,
 'Spotlight (film)': 13,
 'Room (2015 film)': 14,
 'Creed (film)': 15,
 'DC Universe Animated Original Movies': 16,
 'Star Trek Beyond': 17,
 'Star Wars (film)': 18,
 'Interstellar (film)': 19,
 'Ant-Man (film)': 20,
 'Everest (2015 film)': 21,
 'Jurassic World': 22,
 'Joy (film)': 23,
 'Gods of Egypt (film)': 24,
 'Star Wars sequel trilogy': 25,
 'The Conjuring 2': 26,
 'The Danish Girl (film)': 27,
 'Sicario (2015 film)': 28,
 'Rogue One': 29,
 'Finding Dory': 30,
 'Black Mass (film)': 31,
 'Blade Runner': 32,
 'Harry Potter (film series)': 33,
 'Doctor Strange (film)': 34,
 'Titanic (1997 film)': 35,
 'Furious

In [17]:
# Collect one (link index, movie index) pair per retained link occurrence.
# Links that are not in link_to_idx (i.e. did not reach the frequency cut-off)
# are skipped.
pairs = [
    (link_to_idx[link], movie_to_idx[record[0]])
    for record in movies
    for link in record[2]
    if link in link_to_idx
]

# Inspect the boundary between the pairs of movie 0 and movie 1.
pairs[350:360]

# In the recorded output the first pair of movie 1 sits at index 356, so movie 0
# contributes 356 pairs (a link repeated inside the movie is counted every time).

[(215, 0),
 (216, 0),
 (217, 0),
 (218, 0),
 (219, 0),
 (220, 0),
 (221, 1),
 (222, 1),
 (223, 1),
 (224, 1)]

In [18]:
# 중복값 제거
pairs_set = set(pairs)

In [19]:
print(len(pairs))          # pairs : "링크:영화"쌍 94만개 리스트
print(len(top_links))      # top_links : "링크" 수 6만개(빈도수 3이상) 리스트
print(len(movie_to_idx))   # movie_to_idx : 영화 수 1만개 딕셔너리

949544
66913
10000


## -> 데이터 전처리, 피쳐 엔지니어링 끝

# 3. 케라스 모델링 : 임베딩 학습

In [20]:
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot

# 임베딩 모델링하는 함수
def movie_embedding_model(embedding_size=50):
    """Build and compile the link/movie embedding model.

    The model takes one link index and one movie index, looks each of them up
    in its own embedding table, and outputs the normalized dot product of the
    two embedding vectors as a single value per sample.

    Args:
        embedding_size: Number of dimensions of both embedding tables.

    Returns:
        A compiled Keras ``Model`` (optimizer 'nadam', loss 'mse') with the
        inputs named 'link' and 'movie'.
    """
    # Table sizes come from the notebook-level vocabularies.
    n_links = len(top_links)
    n_movies = len(movie_to_idx)

    link_input = Input(shape=(1,), name='link')
    movie_input = Input(shape=(1,), name='movie')

    # One trainable vector per link and one per movie.
    link_vectors = Embedding(input_dim=n_links,
                             output_dim=embedding_size,
                             name='link_embedding')(link_input)
    movie_vectors = Embedding(input_dim=n_movies,
                              output_dim=embedding_size,
                              name='movie_embedding')(movie_input)

    # Dot product along the embedding axis; normalize=True asks Keras to
    # L2-normalize both vectors first, which makes the result a cosine similarity.
    similarity = Dot(axes=2, normalize=True, name='dot_product')([link_vectors, movie_vectors])

    # Flatten to a single value per sample.
    prediction = Reshape((1,))(similarity)

    model = Model(inputs=[link_input, movie_input], outputs=[prediction])
    model.compile(loss='mse', optimizer='nadam')
    return model

model = movie_embedding_model()
model.summary()
# top_links x 임베딩 차원수 = 66913 x 50 = 3345650
# 영화 수 x 임베딩 차원수 = 10000 x 50 = 500000 

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 link (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 link_embedding (Embedding)     (None, 1, 50)        3345650     ['link[0][0]']                   
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 50)        500000      ['movie[0][0]']                  
                                                                                              

# 4. 모델 훈련

In [21]:
# Seed both random number generators used by the batch generator: `random`
# drives the sampling and `np.random` drives the in-batch shuffle, so seeding
# only one of them leaves the batches irreproducible.
random.seed(5)
np.random.seed(5)

# batchifier is a generator that feeds training data to the model.
# Each batch mixes positive examples (label 1) with random negative examples
# (label -1).

# (Note) `yield` hands a value back without ending the function, so the
# generator can keep producing batches; each one is fetched with next().
def batchifier(pairs, positive_samples=50, negative_ratio=5):
    """Endlessly yield training batches of positive and negative (link, movie) pairs.

    Each batch holds ``positive_samples`` pairs sampled from ``pairs`` (label 1)
    plus ``positive_samples * negative_ratio`` random (link, movie) combinations
    that are not in the notebook-level ``pairs_set`` (label -1), shuffled together.

    Args:
        pairs: Sequence of (link index, movie index) tuples to sample positives from.
        positive_samples: Number of positive examples per batch.
        negative_ratio: Number of negative examples generated per positive example.

    Yields:
        A tuple ``({'link': link_ids, 'movie': movie_ids}, labels)`` of three
        1-D arrays of length ``positive_samples * (1 + negative_ratio)``.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    while True:
        # Allocate a fresh buffer for every batch: the yielded arrays are views
        # into it, so reusing one buffer would overwrite batches the consumer
        # still holds (for example in a prefetch queue).
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        # Rejection sampling: draw random combinations until enough of them are
        # confirmed not to be known positive pairs.
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        # Shuffle rows so positives and negatives are interleaved.
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

In [22]:
# 실행할 때마다 batch를 내보냄
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([20558., 31254., 32318., 22418., 32643., 48731., 13365.,  1313.,
          3801.]),
  'movie': array([ 849., 5530., 7685., 1529., 7628., 1854., 6238., 7236., 5874.])},
 array([-1.,  1., -1.,  1., -1., -1., -1.,  1., -1.]))

### 딥러닝 학습

In [23]:
%%time
# 학습시간 약 8분

positive_samples_per_batch = 512

# Train the embedding model defined above on batches from batchifier.
# Model.fit accepts Python generators directly; fit_generator is deprecated
# and is no longer available in newer Keras releases.
# The generator never ends, so steps_per_epoch defines the epoch length: roughly
# one pass over the positive pairs.
model.fit(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=5),
    epochs=10,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2)

Epoch 1/10


2022-07-22 10:56:42.004082: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


1854/1854 - 17s - loss: 0.7341 - 17s/epoch - 9ms/step
Epoch 2/10
1854/1854 - 19s - loss: 0.3614 - 19s/epoch - 10ms/step
Epoch 3/10
1854/1854 - 20s - loss: 0.3362 - 20s/epoch - 11ms/step
Epoch 4/10
1854/1854 - 20s - loss: 0.3259 - 20s/epoch - 11ms/step
Epoch 5/10
1854/1854 - 20s - loss: 0.3209 - 20s/epoch - 11ms/step
Epoch 6/10
1854/1854 - 20s - loss: 0.3183 - 20s/epoch - 11ms/step
Epoch 7/10
1854/1854 - 20s - loss: 0.3160 - 20s/epoch - 11ms/step
Epoch 8/10
1854/1854 - 21s - loss: 0.3145 - 21s/epoch - 11ms/step
Epoch 9/10
1854/1854 - 20s - loss: 0.3137 - 20s/epoch - 11ms/step
Epoch 10/10
1854/1854 - 20s - loss: 0.3128 - 20s/epoch - 11ms/step
CPU times: user 9min 52s, sys: 5min 44s, total: 15min 37s
Wall time: 3min 16s


<keras.callbacks.History at 0x28fc4e700>

In [24]:
# 영화 임베딩 레이어 확인
movie = model.get_layer('movie_embedding')
movie

<keras.layers.embeddings.Embedding at 0x28d9824c0>

### 코사인 유사도 계산

In [25]:
# 영화 별 가중치
movie_weights = movie.get_weights()[0]

In [26]:
print(len(movie_weights))   # 영화 10000개 각각에 대한 가중치
movie_weights      

10000


array([[ 0.18828699,  0.0954465 ,  0.02271672, ..., -0.12425935,
         0.0759232 , -0.04206158],
       [ 0.14639054,  0.11691933,  0.03478591, ..., -0.03161107,
        -0.01138462,  0.07413036],
       [ 0.19277011,  0.09314025,  0.06160346, ..., -0.08692326,
         0.06950577,  0.01022679],
       ...,
       [ 0.04801939,  0.10767315,  0.05221514, ..., -0.0363376 ,
         0.02186033,  0.0786611 ],
       [ 0.06611461,  0.08381408,  0.05248664, ...,  0.00575512,
         0.03067704,  0.0590263 ],
       [ 0.06965446,  0.09446813,  0.06586546, ...,  0.00319912,
         0.02938722,  0.10359237]], dtype=float32)

In [27]:
# 1번째 영화 가중치 50개(50차원) 확인
print(len(movie_weights[0]))   # embedding_size = 50
movie_weights[0]

50


array([ 0.18828699,  0.0954465 ,  0.02271672,  0.03166546,  0.01037227,
        0.01709345, -0.04894662, -0.03953864, -0.08079079, -0.03462999,
       -0.02708721, -0.0923497 , -0.03873277, -0.04508706, -0.11291026,
        0.0255024 ,  0.06899352,  0.04648621, -0.12489656, -0.11561649,
        0.11165787, -0.05208099,  0.07327088, -0.07581313, -0.1083717 ,
       -0.0114217 ,  0.07078306, -0.05842604,  0.13036528, -0.12408369,
       -0.08509454, -0.052406  , -0.01392199,  0.02290837,  0.08302635,
       -0.16821112, -0.03381428,  0.05915373,  0.01530839, -0.15198536,
        0.07198103,  0.00606495,  0.0403566 ,  0.02727726, -0.03049658,
        0.07734001, -0.02998924, -0.12425935,  0.0759232 , -0.04206158],
      dtype=float32)

### 가중치 정규화

In [28]:
movie_lengths = np.linalg.norm(movie_weights, axis=1)

print(len(movie_lengths))
movie_lengths

10000


array([0.5581048 , 0.54736435, 0.5591471 , ..., 0.46584463, 0.4428032 ,
       0.4675038 ], dtype=float32)

# ★임베딩 공간(normalized_movies) 생성!

In [29]:
normalized_movies = (movie_weights.T / movie_lengths).T

# 5. 특정 영화를 입력하면 그 영화와 유사한 영화 추천

In [30]:
def similar_movies(movie, top_n=10):
    """Print the movies whose embeddings are most similar to ``movie``.

    Similarity is the dot product between rows of ``normalized_movies``; because
    those rows were divided by their norms, this is the cosine similarity.
    Results are printed from most to least similar as
    ``<index> <title> <similarity>``; the query movie itself is included.

    Args:
        movie: Title of the query movie; must be a key of ``movie_to_idx``.
        top_n: Maximum number of movies to print (default 10).

    Raises:
        KeyError: If ``movie`` is not a key of ``movie_to_idx``.
    """
    query_vector = normalized_movies[movie_to_idx[movie]]
    similarities = np.dot(normalized_movies, query_vector)
    # Sort ascending, flip to descending, then keep the first top_n entries.
    closest = np.argsort(similarities)[::-1][:top_n]
    for c in closest:
        print(c, movies[c][0], similarities[c])

In [42]:
movie_to_idx

{'Deadpool (film)': 0,
 'The Revenant (2015 film)': 1,
 'Suicide Squad (film)': 2,
 'Spectre (2015 film)': 3,
 'Rebel Without a Cause': 4,
 'Warcraft (film)': 5,
 'The Martian (film)': 6,
 'List of Marvel Cinematic Universe films': 7,
 'X-Men (film series)': 8,
 'The Hateful Eight': 9,
 'The Jungle Book (2016 film)': 10,
 'The Big Short (film)': 11,
 '10 Cloverfield Lane': 12,
 'Spotlight (film)': 13,
 'Room (2015 film)': 14,
 'Creed (film)': 15,
 'DC Universe Animated Original Movies': 16,
 'Star Trek Beyond': 17,
 'Star Wars (film)': 18,
 'Interstellar (film)': 19,
 'Ant-Man (film)': 20,
 'Everest (2015 film)': 21,
 'Jurassic World': 22,
 'Joy (film)': 23,
 'Gods of Egypt (film)': 24,
 'Star Wars sequel trilogy': 25,
 'The Conjuring 2': 26,
 'The Danish Girl (film)': 27,
 'Sicario (2015 film)': 28,
 'Rogue One': 29,
 'Finding Dory': 30,
 'Black Mass (film)': 31,
 'Blade Runner': 32,
 'Harry Potter (film series)': 33,
 'Doctor Strange (film)': 34,
 'Titanic (1997 film)': 35,
 'Furious

In [43]:
similar_movies('V for Vendetta (film)')

167 V for Vendetta (film) 1.0
4511 Doomsday (2008 film) 0.84467274
1274 The Imaginarium of Doctor Parnassus 0.840706
1754 The Brothers Grimm (film) 0.8372403
323 Cloud Atlas (film) 0.82835996
1048 Alexander (2004 film) 0.8265726
499 Sherlock Holmes (2009 film) 0.8239833
2575 Ninja Assassin 0.82383156
1029 The League of Extraordinary Gentlemen (film) 0.82243913
666 Valkyrie (film) 0.82238734


In [63]:
similar_movies('300 (film)')

171 300 (film) 1.0
37 Avatar (2009 film) 0.92007726
101 Prometheus (2012 film) 0.91874343
118 Watchmen (film) 0.91744864
19 Interstellar (film) 0.9115615
1706 Speed Racer (film) 0.90926826
181 Pacific Rim (film) 0.90657055
757 Wanted (2008 film) 0.9039165
85 Inception 0.90318435
413 Superman Returns 0.90287733


In [41]:
similar_movies('Doctor Strange (film)')

34 Doctor Strange (film) 1.0000001
39 Guardians of the Galaxy (film) 0.9877984
42 The Avengers (2012 film) 0.986025
7 List of Marvel Cinematic Universe films 0.9836577
162 Thor (film) 0.9801706
1364 Captain America: Civil War 0.9718782
182 The Amazing Spider-Man 2 0.9702531
200 The Incredible Hulk (film) 0.96582836
0 Deadpool (film) 0.9652128
20 Ant-Man (film) 0.9625648


# 6. 영화 추천시스템 만들기 with SVM

In [80]:
best = ['Doctor Strange (film)', 'Guardians of the Galaxy (film)', '300 (film)', 'Interstellar (film)',
        'V for Vendetta (film)', 'Ninja Assassin', 'Deadpool (film)']
# 레이블 1

worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
# 레이블 0

In [81]:
print(len(best))
print(len(worst))

7
9


In [82]:
y = np.asarray([1 for _ in best] + [0 for _ in worst])

print(len(y))
y

16


array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [83]:
# 영화 16개의 임베딩 벡터들
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])

print(X.shape)
X

(16, 50)


array([[ 3.44489008e-01,  1.74630359e-01,  4.30203080e-02,
         4.80438322e-02,  3.00323199e-02,  6.44130856e-02,
        -3.79585959e-02, -1.26262471e-01, -1.26207486e-01,
         4.43839394e-02, -1.53646143e-02, -2.11081922e-01,
        -3.04684951e-03, -2.98171472e-02, -1.67504415e-01,
         7.68497959e-02,  9.24959406e-02,  1.01707295e-01,
        -1.86316490e-01, -2.11675286e-01,  2.53309399e-01,
        -1.42148957e-01,  1.38957232e-01, -8.37395862e-02,
        -1.81000948e-01, -1.19985789e-02,  1.04705945e-01,
        -8.79282504e-02,  2.54161298e-01, -2.34076723e-01,
        -1.85815975e-01, -1.16164066e-01, -4.78845909e-02,
         9.86093655e-02,  1.40332922e-01, -2.88345844e-01,
        -5.56377247e-02,  1.07346699e-01, -1.73079744e-02,
        -2.41161034e-01,  1.34753272e-01,  8.60243812e-02,
         5.97874150e-02,  6.39533922e-02, -7.31249824e-02,
         1.07616656e-01,  1.52715966e-02, -2.26673692e-01,
         6.24272935e-02, -7.04835579e-02],
       [ 3.61

## SVM 분류기를 구성하고 훈련하기

In [37]:
clf = svm.SVC(kernel='linear')
clf.fit(X, y)

In [84]:
# Score every movie embedding with the trained SVM; a higher decision score
# means the movie falls further on the side of the liked (label 1) examples.
estimated_movie_ratings = clf.decision_function(normalized_movies)

# Movie indices ordered from lowest to highest score. The ranking gets its own
# name: binding it to `best` would overwrite the list of liked titles that the
# label/feature cells build on, so re-running those cells would fail.
ranked_movie_idx = np.argsort(estimated_movie_ratings)

print('best:')
for c in reversed(ranked_movie_idx[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in ranked_movie_idx[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])

best:
481 The Devil Wears Prada (film) 1.0165867384997302
70 Carol (film) 0.9997925124906684
939 Changeling (film) 0.9887091264852005
458 Hugo (film) 0.9778940647594019
3349 Star Wars: The Force Awakens 0.9656437209916298
worst:
1859 Police Academy (franchise) -1.8362179354697121
7889 The Comebacks -1.8284330730012066
4442 New York Minute (film) -1.7746604163695368
4487 Tremors (franchise) -1.771869352744374
5902 Pauly Shore Is Dead -1.7619351863357864


In [None]:
이전에 케라스로 학습한 임베딩 공간을 기반으로 좋은 영화와 나쁜 영화를 구별한다.
SVM은 positive 예제와 negative 예제를 분리하는 하나 이상의 초평면(hyperplane)을 찾는다.
초평면으로부터 positive 쪽으로 가장 멀리 떨어진 영화가 가장 좋아할 것으로 예측되는 영화이다.