# word2vec, SVM을 이용한 영화 추천시스템

In [1]:
import json
from collections import Counter
import numpy as np
import random

# keras -> Embedding
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot

# sklearn -> LinearRegression
from sklearn.linear_model import LinearRegression

# SVM 분류기
from sklearn import svm



# 1. 데이터 불러오기

In [2]:
# Load the movie dump: one JSON document per line (NDJSON).
# Decode explicitly as UTF-8 so that non-ASCII names in the data load the same
# way on every platform instead of depending on the locale's default encoding.
with open('wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin]

## 데이터 확인 (총 10,000개의 영화, list와 딕셔너리 구조)

In [3]:
# Sanity check after loading: number of records and the container type.
print(len(movies))
print(type(movies))

10000
<class 'list'>


In [4]:
# Show the first record in full.
movies[0]

['Deadpool (film)',
 {'image': 'Deadpool poster.jpg',
  'name': 'Deadpool',
  'cinematography': 'Ken Seng',
  'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'distributor': '20th Century Fox',
  'caption': 'Theatrical release poster',
  'gross': '$783.1 million',
  'country': 'United States',
  'director': 'Tim Miller',
  'runtime': '108 minutes',
  'editing': 'Julian Clarke',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'budget': '$58 million'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [5]:
# Show the first three records.
movies[:3]

[['Deadpool (film)',
  {'image': 'Deadpool poster.jpg',
   'name': 'Deadpool',
   'cinematography': 'Ken Seng',
   'Software Used': 'Adobe Premier Pro',
   'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
   'distributor': '20th Century Fox',
   'caption': 'Theatrical release poster',
   'gross': '$783.1 million',
   'country': 'United States',
   'director': 'Tim Miller',
   'runtime': '108 minutes',
   'editing': 'Julian Clarke',
   'language': 'English',
   'music': 'Tom Holkenborg',
   'budget': '$58 million'},
  ['Tim Miller (director)',
   'Simon Kinberg',
   'Ryan Reynolds',
   'Lauren Shuler Donner',
   'Rhett Reese',
   'Paul Wernick',
   'Deadpool',
   'Fabian Nicieza',
   'Rob Liefeld',
   'Morena Baccarin',
   'Ed Skrein',
   'T.J. Miller',
   'Gina Carano',
   'Leslie Uggams',
   'Brianna Hildebrand'

## 첫번째 영화 Deadpool 구조 분석

In [6]:
# The first movie is Deadpool.
movies[0]

['Deadpool (film)',
 {'image': 'Deadpool poster.jpg',
  'name': 'Deadpool',
  'cinematography': 'Ken Seng',
  'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'distributor': '20th Century Fox',
  'caption': 'Theatrical release poster',
  'gross': '$783.1 million',
  'country': 'United States',
  'director': 'Tim Miller',
  'runtime': '108 minutes',
  'editing': 'Julian Clarke',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'budget': '$58 million'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [6]:
# Element 0 of a record: the movie title (Deadpool here).
movies[0][0]

'Deadpool (film)'

In [7]:
# Element 1 of a record: the movie's overview dictionary (Deadpool here).
movies[0][1]

{'image': 'Deadpool poster.jpg',
 'name': 'Deadpool',
 'cinematography': 'Ken Seng',
 'Software Used': 'Adobe Premier Pro',
 'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
 'distributor': '20th Century Fox',
 'caption': 'Theatrical release poster',
 'gross': '$783.1 million',
 'country': 'United States',
 'director': 'Tim Miller',
 'runtime': '108 minutes',
 'editing': 'Julian Clarke',
 'language': 'English',
 'music': 'Tom Holkenborg',
 'budget': '$58 million'}

In [8]:
# Element 2 of a record: Deadpool's outgoing links (used as tags) - 405 in total.
print(len(movies[0][2]))
movies[0][2]

405


['Tim Miller (director)',
 'Simon Kinberg',
 'Ryan Reynolds',
 'Lauren Shuler Donner',
 'Rhett Reese',
 'Paul Wernick',
 'Deadpool',
 'Fabian Nicieza',
 'Rob Liefeld',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller',
 'Gina Carano',
 'Leslie Uggams',
 'Brianna Hildebrand',
 'Stefan Kapičić',
 'Junkie XL',
 'Julian Clarke',
 'Marvel Entertainment',
 'Kinberg Genre',
 'Lauren Shuler Donner',
 'TSG Entertainment',
 '20th Century Fox',
 'Le Grand Rex',
 'Variety (magazine)',
 'Box Office Mojo',
 'superhero film',
 'Tim Miller (director)',
 'Rhett Reese',
 'Paul Wernick',
 'Marvel Comics',
 'Deadpool',
 'X-Men (film series)',
 'Ryan Reynolds',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller',
 'Gina Carano',
 'Leslie Uggams',
 'Brianna Hildebrand',
 'Stefan Kapičić',
 'antihero',
 'New Line Cinema',
 '20th Century Fox',
 'X-Men Origins: Wolverine',
 'principal photography',
 'Vancouver',
 'IMAX',
 'Digital Light Processing',
 'D-Box Technologies',
 'List of accolades received by Deadpool (

In [10]:
# Element 4 of a record: the rating.
movies[0][4]

'6.9/10'

## 외부 링크(태그) 소개

In [None]:
# Tally how often each outgoing link appears across all movies.
link_counts = Counter()

for movie in movies:
    # movie[2] is the movie's list of links. Counter.update adds one per
    # occurrence, so a link repeated within one movie is counted each time.
    link_counts.update(movie[2])
        

In [16]:
# Display the complete tally (large output).
link_counts

Counter({'Tim Miller (director)': 9,
         'Simon Kinberg': 33,
         'Ryan Reynolds': 114,
         'Lauren Shuler Donner': 37,
         'Rhett Reese': 12,
         'Paul Wernick': 11,
         'Deadpool': 9,
         'Fabian Nicieza': 1,
         'Rob Liefeld': 2,
         'Morena Baccarin': 16,
         'Ed Skrein': 18,
         'T.J. Miller': 12,
         'Gina Carano': 26,
         'Leslie Uggams': 9,
         'Brianna Hildebrand': 8,
         'Stefan Kapičić': 6,
         'Junkie XL': 26,
         'Julian Clarke': 11,
         'Marvel Entertainment': 28,
         'Kinberg Genre': 6,
         'TSG Entertainment': 37,
         '20th Century Fox': 1147,
         'Le Grand Rex': 12,
         'Variety (magazine)': 5450,
         'Box Office Mojo': 4186,
         'superhero film': 91,
         'Marvel Comics': 258,
         'X-Men (film series)': 16,
         'antihero': 18,
         'New Line Cinema': 408,
         'X-Men Origins: Wolverine': 15,
         'principal photography'

In [17]:
# The ten most frequent links, most common first.
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

# 2. 데이터 전처리 및 피쳐 엔지니어링

In [18]:
# Keep only the links that occur at least 3 times and store them in top_links.
top_links = [link for link, c in link_counts.items() if c >= 3]

top_links[:10]

['Tim Miller (director)',
 'Simon Kinberg',
 'Ryan Reynolds',
 'Lauren Shuler Donner',
 'Rhett Reese',
 'Paul Wernick',
 'Deadpool',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller']

## 나중에 조회를 위해 (링크, 영화) 쌍 작성

In [19]:
# Build link_to_idx: maps every kept link to its position in top_links
# (66,913 entries in the recorded run).
link_to_idx = {link: idx for idx, link in enumerate(top_links)}

print(len(link_to_idx))
link_to_idx

66913


{'Tim Miller (director)': 0,
 'Simon Kinberg': 1,
 'Ryan Reynolds': 2,
 'Lauren Shuler Donner': 3,
 'Rhett Reese': 4,
 'Paul Wernick': 5,
 'Deadpool': 6,
 'Morena Baccarin': 7,
 'Ed Skrein': 8,
 'T.J. Miller': 9,
 'Gina Carano': 10,
 'Leslie Uggams': 11,
 'Brianna Hildebrand': 12,
 'Stefan Kapičić': 13,
 'Junkie XL': 14,
 'Julian Clarke': 15,
 'Marvel Entertainment': 16,
 'Kinberg Genre': 17,
 'TSG Entertainment': 18,
 '20th Century Fox': 19,
 'Le Grand Rex': 20,
 'Variety (magazine)': 21,
 'Box Office Mojo': 22,
 'superhero film': 23,
 'Marvel Comics': 24,
 'X-Men (film series)': 25,
 'antihero': 26,
 'New Line Cinema': 27,
 'X-Men Origins: Wolverine': 28,
 'principal photography': 29,
 'Vancouver': 30,
 'IMAX': 31,
 'D-Box Technologies': 32,
 'Golden Globe Award': 33,
 'Golden Globe Award for Best Motion Picture – Musical or Comedy': 34,
 'Golden Globe Award for Best Actor – Motion Picture Musical or Comedy': 35,
 'Producers Guild of America Award': 36,
 "Critics' Choice Movie Awards

In [21]:
# Build movie_to_idx: maps every movie title to its position in movies
# (10,000 entries in the recorded run).
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}

print(len(movie_to_idx))
movie_to_idx

10000


{'Deadpool (film)': 0,
 'The Revenant (2015 film)': 1,
 'Suicide Squad (film)': 2,
 'Spectre (2015 film)': 3,
 'Rebel Without a Cause': 4,
 'Warcraft (film)': 5,
 'The Martian (film)': 6,
 'List of Marvel Cinematic Universe films': 7,
 'X-Men (film series)': 8,
 'The Hateful Eight': 9,
 'The Jungle Book (2016 film)': 10,
 'The Big Short (film)': 11,
 '10 Cloverfield Lane': 12,
 'Spotlight (film)': 13,
 'Room (2015 film)': 14,
 'Creed (film)': 15,
 'DC Universe Animated Original Movies': 16,
 'Star Trek Beyond': 17,
 'Star Wars (film)': 18,
 'Interstellar (film)': 19,
 'Ant-Man (film)': 20,
 'Everest (2015 film)': 21,
 'Jurassic World': 22,
 'Joy (film)': 23,
 'Gods of Egypt (film)': 24,
 'Star Wars sequel trilogy': 25,
 'The Conjuring 2': 26,
 'The Danish Girl (film)': 27,
 'Sicario (2015 film)': 28,
 'Rogue One': 29,
 'Finding Dory': 30,
 'Black Mass (film)': 31,
 'Blade Runner': 32,
 'Harry Potter (film series)': 33,
 'Doctor Strange (film)': 34,
 'Titanic (1997 film)': 35,
 'Furious

In [22]:
# Collect (link index, movie index) pairs for every kept link of every movie.
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)

# Peek at the boundary between the pairs of movie 0 and those of movie 1.
pairs[350:360]

# In the recorded output movie 0 owns pairs[0:356], i.e. 356 entries (links
# repeated within the movie are included); movie 1 starts at index 356.

[(215, 0),
 (216, 0),
 (217, 0),
 (218, 0),
 (219, 0),
 (220, 0),
 (221, 1),
 (222, 1),
 (223, 1),
 (224, 1)]

In [23]:
# Set of all pairs: removes duplicate entries.
pairs_set = set(pairs)

In [25]:
print(len(pairs))          # pairs: list of (link, movie) index pairs (949,544 in the recorded run)
print(len(top_links))      # top_links: list of links occurring at least 3 times (66,913)
print(len(movie_to_idx))   # movie_to_idx: dictionary covering the 10,000 movies

949544
66913
10000


## -> 데이터 전처리, 피쳐 엔지니어링 끝

# 3. 케라스 모델링 : 임베딩 학습

In [31]:
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot

# 임베딩 모델링하는 함수
def movie_embedding_model(embedding_size=50):
    """Build and compile the link/movie embedding model.

    The model takes a link index and a movie index, looks each one up in its
    own embedding table and outputs the dot product of the two vectors. The
    Dot layer is created with ``normalize=True``.

    Reads the module-level ``top_links`` and ``movie_to_idx`` to size the two
    embedding tables.

    Args:
        embedding_size: Dimension of both embedding tables (default 50).

    Returns:
        A Keras ``Model`` with inputs ``link`` and ``movie``, compiled with
        the ``nadam`` optimizer and ``mse`` loss.
    """
    link_input = Input(name='link', shape=(1,))
    movie_input = Input(name='movie', shape=(1,))

    # Embedding table 1: one row per kept link.
    link_layer = Embedding(name='link_embedding',
                           input_dim=len(top_links),
                           output_dim=embedding_size)
    link_vec = link_layer(link_input)

    # Embedding table 2: one row per movie.
    movie_layer = Embedding(name='movie_embedding',
                            input_dim=len(movie_to_idx),
                            output_dim=embedding_size)
    movie_vec = movie_layer(movie_input)

    # Dot product of the two embedding vectors along the embedding axis.
    similarity = Dot(name='dot_product', normalize=True, axes=2)([link_vec, movie_vec])

    # Collapse the dot-product output to a single value per sample.
    output = Reshape((1,))(similarity)
    net = Model(inputs=[link_input, movie_input], outputs=[output])
    net.compile(optimizer='nadam', loss='mse')
    return net

model = movie_embedding_model()
model.summary()
# Expected parameter counts with the default embedding size of 50:
# links:  len(top_links) x embedding size = 66913 x 50 = 3345650
# movies: number of movies x embedding size = 10000 x 50 = 500000

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 link (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 link_embedding (Embedding)     (None, 1, 50)        3345650     ['link[0][0]']                   
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 50)        500000      ['movie[0][0]']                  
                                                                                            

# 4. 모델 훈련

In [52]:
# Seed both random sources used by the generator below: `random` draws the
# samples, while NumPy's global RNG drives np.random.shuffle. Seeding only one
# of them leaves the batches irreproducible.
random.seed(5)
np.random.seed(5)


def batchifier(pairs, positive_samples=50, negative_ratio=5):
    """Endlessly generate training batches mixing positive and negative pairs.

    Every batch contains `positive_samples` pairs sampled from `pairs`
    (label 1) plus `positive_samples * negative_ratio` randomly drawn
    (link, movie) index pairs that are not in the module-level `pairs_set`
    (label -1). The rows are shuffled before the batch is yielded.

    Being a generator, the function hands out one batch per `next()` call and
    keeps its state between calls instead of terminating like a `return`.

    Reads the module-level `movie_to_idx`, `top_links` and `pairs_set`.

    Args:
        pairs: Sequence of (link index, movie index) positive examples.
        positive_samples: Number of positive rows per batch.
        negative_ratio: Number of negative rows generated per positive row.

    Yields:
        A tuple `({'link': link_ids, 'movie': movie_ids}, labels)` of three
        1-D float arrays of length `positive_samples * (1 + negative_ratio)`.

    Raises:
        ValueError: from `random.sample` when `positive_samples` exceeds
            `len(pairs)`.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    while True:
        # Allocate a fresh buffer per batch. The yielded arrays are views into
        # it, so reusing one buffer would overwrite batches the consumer still
        # holds (for example batches waiting in a prefetch queue).
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            # Only accept random pairs that are not known positives.
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

In [60]:
# Every next() call on the generator emits one batch.
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([12068., 63411., 42155., 50485., 44559., 41523.,  5440., 19512.,
         23262.]),
  'movie': array([4516., 5067., 4105., 2478., 2428., 5119., 5357., 1281., 5016.])},
 array([-1., -1.,  1.,  1., -1., -1.,  1., -1., -1.]))

### 딥러닝 학습

In [33]:
%%time
# 학습시간 약 8분
positive_samples_per_batch = 512

# 위에서 정의한 임베딩 모델에 딥러닝 학습시키기
model.fit_generator(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=5),
    epochs=10,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2)



Epoch 1/10
1854/1854 - 50s - loss: 0.5243 - 50s/epoch - 27ms/step
Epoch 2/10
1854/1854 - 50s - loss: 0.2326 - 50s/epoch - 27ms/step
Epoch 3/10
1854/1854 - 50s - loss: 0.2218 - 50s/epoch - 27ms/step
Epoch 4/10
1854/1854 - 50s - loss: 0.2180 - 50s/epoch - 27ms/step
Epoch 5/10
1854/1854 - 50s - loss: 0.2159 - 50s/epoch - 27ms/step
Epoch 6/10
1854/1854 - 51s - loss: 0.2147 - 51s/epoch - 28ms/step
Epoch 7/10
1854/1854 - 53s - loss: 0.2140 - 53s/epoch - 29ms/step
Epoch 8/10
1854/1854 - 51s - loss: 0.2134 - 51s/epoch - 28ms/step
Epoch 9/10
1854/1854 - 51s - loss: 0.2131 - 51s/epoch - 27ms/step
Epoch 10/10
1854/1854 - 53s - loss: 0.2127 - 53s/epoch - 28ms/step
CPU times: user 33min 21s, sys: 10min 9s, total: 43min 31s
Wall time: 8min 29s


<keras.callbacks.History at 0x7fcb11676bb0>

In [34]:
# Fetch the movie embedding layer from the model by its name.
movie = model.get_layer('movie_embedding')
movie

<keras.layers.embeddings.Embedding at 0x7fcb12859a90>

### 코사인 유사도 계산

In [40]:
# Per-movie weights: the first weight array of the movie embedding layer.
movie_weights = movie.get_weights()[0]

In [46]:
print(len(movie_weights))   # one weight vector for each of the 10,000 movies
movie_weights

10000


array([[ 0.06617867,  0.18108194,  0.17160226, ...,  0.15171473,
        -0.03601742, -0.06914794],
       [ 0.03340227,  0.15683025,  0.09986706, ...,  0.08520829,
        -0.01812719,  0.01802831],
       [ 0.02019274,  0.17459679,  0.16942517, ...,  0.1625094 ,
        -0.03954628, -0.03014349],
       ...,
       [ 0.06485631,  0.16336462,  0.1005133 , ...,  0.10422818,
         0.0778826 , -0.07411923],
       [ 0.07528549,  0.15119506,  0.06203917, ...,  0.05846865,
         0.1132488 , -0.06544591],
       [ 0.08835103,  0.1420988 ,  0.07070374, ...,  0.04315339,
         0.14374654, -0.08310198]], dtype=float32)

In [44]:
# Inspect the 50 weights (50 dimensions) of the first movie.
print(len(movie_weights[0]))   # embedding_size = 50
movie_weights[0]

50


array([ 0.06617867,  0.18108194,  0.17160226, -0.19826612, -0.091964  ,
        0.03082942, -0.06895228, -0.04160095, -0.0370773 ,  0.13127455,
        0.05556502, -0.07941396,  0.09337802, -0.04915227,  0.04522948,
        0.07133889, -0.07393897,  0.03048602, -0.07705442, -0.05933971,
        0.11114539, -0.11812776, -0.09116898, -0.03535625,  0.02121077,
        0.03433691, -0.14234959,  0.07730503,  0.03745128, -0.13375758,
       -0.0275453 ,  0.00768218,  0.06205039,  0.05171747, -0.05121757,
        0.11151362,  0.03235075,  0.08642712, -0.02445087,  0.13204262,
       -0.04039172, -0.13175978, -0.08591748, -0.14841698, -0.0043093 ,
        0.08087662, -0.07444651,  0.15171473, -0.03601742, -0.06914794],
      dtype=float32)

### 가중치 정규화

In [47]:
# Norm of each movie's weight vector, computed along axis 1 (one value per row).
movie_lengths = np.linalg.norm(movie_weights, axis=1)

print(len(movie_lengths))
movie_lengths

10000


array([0.63681114, 0.63134366, 0.6612358 , ..., 0.570116  , 0.55660385,
       0.5602568 ], dtype=float32)

# ★임베딩 공간(normalized_movies) 생성!

In [48]:
# Divide every row by its own norm so each movie vector has unit length; the
# two transposes let the per-row lengths broadcast across the columns.
normalized_movies = (movie_weights.T / movie_lengths).T

# 5. 특정 영화를 입력하면 그 영화와 유사한 영화 추천

In [49]:
def similar_movies(movie, top_n=10):
    """Print the movies whose embeddings are most similar to the given movie's.

    Similarity is the dot product between the row of `normalized_movies` that
    belongs to `movie` and every row of `normalized_movies`; for unit-length
    rows this equals the cosine similarity. One line is printed per result,
    most similar first: the movie index, its title and the similarity.

    Reads the module-level `normalized_movies`, `movie_to_idx` and `movies`.

    Args:
        movie: Title of the query movie, exactly as keyed in `movie_to_idx`.
        top_n: Number of results to print (default 10, the previously fixed
            value). Values below 1 print nothing.

    Raises:
        KeyError: if `movie` is not a key of `movie_to_idx`.
    """
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    # Sort descending first and slice afterwards: `[-top_n:]` would return the
    # whole array for top_n == 0.
    closest = np.argsort(dists)[::-1][:max(top_n, 0)]
    for c in closest:
        print(c, movies[c][0], dists[c])

In [50]:
# Movies closest to Iron Man in the embedding space.
similar_movies('Iron Man (2008 film)')

195 Iron Man (2008 film) 1.0
353 Iron Man 2 0.9912686
478 Spider-Man 3 0.9897937
143 Iron Man 3 0.9862234
200 The Incredible Hulk (film) 0.9840802
1159 Cowboys & Aliens 0.97663164
22 Jurassic World 0.976036
162 Thor (film) 0.97574747
118 Watchmen (film) 0.97490364
613 Terminator Salvation 0.9748526


In [51]:
# Movies closest to Avatar in the embedding space.
similar_movies('Avatar (2009 film)')

37 Avatar (2009 film) 1.0000001
154 Star Trek (film) 0.9681996
613 Terminator Salvation 0.96175003
1159 Cowboys & Aliens 0.95987195
1085 Planet of the Apes (2001 film) 0.95745754
19 Interstellar (film) 0.95408636
413 Superman Returns 0.9540835
225 Jurassic Park 0.95303965
195 Iron Man (2008 film) 0.9509183
353 Iron Man 2 0.9508791


# 6. 영화 추천시스템 만들기 with SVM

In [67]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
# These titles get label 1.

worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
# These titles get label 0.

In [68]:
# Number of label-1 and label-0 titles.
print(len(best))
print(len(worst))

7
9


In [69]:
# Target vector: a 1 for every title in `best` followed by a 0 for every title
# in `worst`, i.e. the same order as `best + worst`.
y = np.asarray([1] * len(best) + [0] * len(worst))

print(len(y))
y

16


array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [70]:
# Embedding vectors of the 16 labelled movies, in `best + worst` order.
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])

print(X.shape)
X

(16, 50)


array([[ 7.03025982e-02,  1.73560679e-01,  1.87281728e-01,
        -3.26460838e-01, -1.67390808e-01,  1.06128931e-01,
        -1.78782552e-01,  5.57120889e-02,  7.64613897e-02,
         2.00567007e-01,  1.72205850e-01, -1.52502999e-01,
         4.05397266e-02,  3.05538997e-03, -3.22131626e-03,
         9.13476795e-02, -2.20293850e-02,  8.00844282e-02,
        -9.35436711e-02, -1.83451787e-01,  1.08848974e-01,
        -1.02554984e-01, -6.29979894e-02, -5.89077435e-02,
         1.26792043e-01,  1.01444699e-01, -1.36161447e-01,
         1.62375599e-01,  1.42394006e-01, -2.68386811e-01,
        -1.02332786e-01,  2.58358791e-02,  1.07216395e-01,
         4.91527952e-02, -1.12892732e-01,  1.81596413e-01,
         1.37053907e-01,  2.46532798e-01,  1.38518251e-02,
         1.54243737e-01, -4.26526703e-02, -1.81437135e-01,
        -1.38374731e-01, -2.34150752e-01,  1.28977835e-01,
         1.60357311e-01, -6.65711835e-02,  2.20288053e-01,
        -1.35488763e-01, -2.35550813e-02],
       [ 6.71

## SVM 분류기를 구성하고 훈련하기

In [71]:
# Fit a linear-kernel SVM on the labelled embedding vectors.
clf = svm.SVC(kernel='linear')
clf.fit(X, y)

SVC(kernel='linear')

In [74]:
# Score every movie by feeding its 50-dimensional embedding vector to the
# fitted SVM's decision function.
estimated_movie_ratings = clf.decision_function(normalized_movies)
# Movie indices ordered by ascending score. Stored under its own name so the
# `best` title list is not overwritten by an index array, which would break
# the label and feature cells when they are re-run.
ranked = np.argsort(estimated_movie_ratings)

print('best:')
for c in reversed(ranked[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in ranked[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])

best:
66 Skyfall 1.3030655886531592
481 The Devil Wears Prada (film) 1.1506323103160825
458 Hugo (film) 1.0654984187433394
307 Les Misérables (2012 film) 1.0405691886180661
2660 Girl with a Pearl Earring (film) 1.0252827748959223
worst:
9595 Speed Zone -1.6581940760728378
1859 Police Academy (franchise) -1.6478712237527597
3073 Joe Dirt -1.6154952357874155
6565 Son in Law -1.5803621575945304
5069 Black Sheep (1996 film) -1.577984248084364


In [None]:
# Good and bad movies are told apart using the embedding space learned with Keras earlier.
# The SVM finds one or more hyperplanes that separate the positive examples from the negative ones.
# The movies lying farthest on the positive side of the hyperplane are the ones predicted to be liked most.