In [1]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

Using TensorFlow backend.


In [2]:
with open('data/wp_movies_10k.ndjson') as fin:
    movies = [json.loads(l) for l in fin]

In [3]:
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

In [4]:
top_links = [link for link, c in link_counts.items() if c >= 3]
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [5]:
link = Input(name='link', shape=(1,))
movie = Input(name='movie', shape=(1,))

In [6]:
link

<tf.Tensor 'link:0' shape=(?, 1) dtype=float32>

In [7]:
movie

<tf.Tensor 'movie:0' shape=(?, 1) dtype=float32>

In [8]:
embedding_size=50

In [9]:
link_embedding = Embedding(name='link_embedding', 
                               input_dim=len(top_links), 
                               output_dim=embedding_size)(link)

In [10]:
link_embedding

<tf.Tensor 'link_embedding/embedding_lookup:0' shape=(?, 1, 50) dtype=float32>

In [11]:
movie_embedding = Embedding(name='movie_embedding', 
                                input_dim=len(movie_to_idx), 
                                output_dim=embedding_size)(movie)

In [12]:
movie_embedding

<tf.Tensor 'movie_embedding/embedding_lookup:0' shape=(?, 1, 50) dtype=float32>

In [13]:
dot = Dot(name='dot_product', normalize=True, axes=2)([link_embedding, movie_embedding])

In [14]:
dot

<tf.Tensor 'dot_product/MatMul:0' shape=(?, 1, 1) dtype=float32>

In [15]:
merged = Reshape((1,))(dot)

In [16]:
merged

<tf.Tensor 'reshape_1/Reshape:0' shape=(?, 1) dtype=float32>

In [17]:
model = Model(inputs=[link, movie], outputs=[merged])

In [18]:
model

<keras.engine.training.Model at 0x1752878b908>

In [19]:
def movie_embedding_model(embedding_size=50):
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie', shape=(1,))
    link_embedding = Embedding(name='link_embedding', 
                               input_dim=len(top_links), 
                               output_dim=embedding_size)(link)
    movie_embedding = Embedding(name='movie_embedding', 
                                input_dim=len(movie_to_idx), 
                                output_dim=embedding_size)(movie)
    dot = Dot(name='dot_product', normalize=True, axes=2)([link_embedding, movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link, movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model

model = movie_embedding_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        3345650     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        500000      movie[0][0]                      
__________________________________________________________________________________________________
dot_produc

In [20]:
positive_samples=3
negative_ratio=2

In [21]:
batch_size = positive_samples * (1 + negative_ratio)
batch = np.zeros((batch_size, 3))

In [22]:
batch_size

9

In [23]:
batch

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [24]:
random.seed(5)
for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
    batch[idx, :] = (link_id, movie_id, 1)
idx = positive_samples
while idx < batch_size:
    movie_id = random.randrange(len(movie_to_idx))
    link_id = random.randrange(len(top_links))
    if not (link_id, movie_id) in pairs_set:
        batch[idx, :] = (link_id, movie_id, -1)
        idx += 1

In [25]:
random.seed(5)
random.sample(pairs, positive_samples)

[(31254, 5530), (22418, 1529), (1313, 7236)]

In [26]:
batch

array([[ 3.1254e+04,  5.5300e+03,  1.0000e+00],
       [ 2.2418e+04,  1.5290e+03,  1.0000e+00],
       [ 1.3130e+03,  7.2360e+03,  1.0000e+00],
       [ 3.8010e+03,  5.8740e+03, -1.0000e+00],
       [ 3.2643e+04,  7.6280e+03, -1.0000e+00],
       [ 2.0558e+04,  8.4900e+02, -1.0000e+00],
       [ 4.8731e+04,  1.8540e+03, -1.0000e+00],
       [ 3.2318e+04,  7.6850e+03, -1.0000e+00],
       [ 1.3365e+04,  6.2380e+03, -1.0000e+00]])

In [27]:
np.random.shuffle(batch)

In [28]:
batch

array([[ 3.1254e+04,  5.5300e+03,  1.0000e+00],
       [ 3.2643e+04,  7.6280e+03, -1.0000e+00],
       [ 3.2318e+04,  7.6850e+03, -1.0000e+00],
       [ 1.3365e+04,  6.2380e+03, -1.0000e+00],
       [ 4.8731e+04,  1.8540e+03, -1.0000e+00],
       [ 3.8010e+03,  5.8740e+03, -1.0000e+00],
       [ 2.2418e+04,  1.5290e+03,  1.0000e+00],
       [ 2.0558e+04,  8.4900e+02, -1.0000e+00],
       [ 1.3130e+03,  7.2360e+03,  1.0000e+00]])

In [36]:
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=10):
    batch_size = positive_samples * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if not (link_id, movie_id) in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([ 1077., 16606., 20919., 18188., 23688., 53497., 23865.,   232.,
          3718.]),
  'movie': array([1476., 7288., 6380., 1178., 4858., 3550., 4579., 2166.,   39.])},
 array([ 1., -1., -1., -1.,  1., -1., -1., -1.,  1.]))

In [123]:
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([53497.,  3718.,  1077., 23865., 18188., 23688., 16606., 20919.,
           232.]),
  'movie': array([3550.,   39., 1476., 4579., 1178., 4858., 7288., 6380., 2166.])},
 array([-1.,  1.,  1., -1., -1.,  1., -1., -1., -1.]))

In [124]:
positive_samples_per_batch = 512

model.fit_generator(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/15
 - 61s - loss: 0.3825
Epoch 2/15
 - 56s - loss: 0.2386
Epoch 3/15
 - 55s - loss: 0.2302
Epoch 4/15
 - 56s - loss: 0.2322
Epoch 5/15
 - 56s - loss: 0.2289
Epoch 6/15
 - 56s - loss: 0.2302
Epoch 7/15
 - 57s - loss: 0.2270
Epoch 8/15
 - 56s - loss: 0.2221
Epoch 9/15
 - 56s - loss: 0.2237
Epoch 10/15
 - 57s - loss: 0.2291
Epoch 11/15
 - 57s - loss: 0.2313
Epoch 12/15
 - 57s - loss: 0.2307
Epoch 13/15
 - 56s - loss: 0.2295
Epoch 14/15
 - 56s - loss: 0.2272
Epoch 15/15
 - 55s - loss: 0.2266


<keras.callbacks.History at 0x21dd3222240>

In [130]:
movie = model.get_layer('movie_embedding')
movie

<keras.layers.embeddings.Embedding at 0x21de9a125f8>

In [149]:
movie_weights = movie.get_weights()[0]
movie_weights

array([[ 0.26379415, -0.22333774, -0.0331103 , ...,  0.18464315,
        -0.02784295,  0.74615806],
       [ 0.09846909,  0.06137437,  0.16375858, ...,  0.25589433,
         0.11824913,  0.7178061 ],
       [ 0.30206013, -0.29089686,  0.10767684, ...,  0.18906458,
         0.05846483,  0.7559275 ],
       ...,
       [ 0.2775771 , -0.2711294 ,  0.39649475, ...,  0.14147674,
         0.04158561,  0.43394017],
       [ 0.17378002, -0.05511739,  0.30222908, ...,  0.15900685,
         0.04615842,  0.44815847],
       [ 0.04863823, -0.26323783,  0.47113296, ...,  0.21209224,
        -0.00139167,  0.4352032 ]], dtype=float32)

In [156]:
movie_weights.shape

(10000, 50)

In [157]:
movie_lengths = np.linalg.norm(movie_weights, axis=1)
movie_lengths

array([2.4923828, 2.265339 , 2.4108286, ..., 2.1687808, 2.0850282,
       2.1820896], dtype=float32)

In [158]:
movie_lengths.shape

(10000,)

In [141]:
normalized_movies = (movie_weights.T / movie_lengths).T
normalized_movies.shape

(10000, 50)

In [163]:
dists = np.dot(normalized_movies, normalized_movies[movie_to_idx['Rogue One']])
dists

array([0.8965303 , 0.86427283, 0.88250107, ..., 0.6462473 , 0.5763124 ,
       0.5632654 ], dtype=float32)

In [164]:
dists.shape

(10000,)

In [165]:
closest = np.argsort(dists)[-10:]
closest

array([  37,  181,  659,   86,  372,  245,   25, 3349,   19,   29],
      dtype=int64)

In [166]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie):
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, movies[c][0], dists[c])

similar_movies('Rogue One')

29 Rogue One 1.0
19 Interstellar (film) 0.9799465
3349 Star Wars: The Force Awakens 0.9721204
25 Star Wars sequel trilogy 0.9651754
245 Gravity (film) 0.9639311
372 The Amazing Spider-Man (2012 film) 0.95943654
86 Tomorrowland (film) 0.9583425
659 Rise of the Planet of the Apes 0.95569175
181 Pacific Rim (film) 0.95549285
37 Avatar (2009 film) 0.9552629


In [167]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link):
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, top_links[c], dists[c])

similar_links('George Lucas')

127 George Lucas 0.9999999
2707 Star Wars 0.9561926
4830 widescreen 0.9381799
3176 Star Wars (film) 0.9367047
976 Hugo Award for Best Dramatic Presentation 0.92060775
2778 Lucasfilm 0.9080174
2931 LaserDisc 0.8984472
2829 storyboard 0.89684695
1732 Academy Award for Best Visual Effects 0.89381385
4051 novelization 0.8928862


In [168]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
y = np.asarray([1 for _ in best] + [0 for _ in worst])
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])
X.shape

(16, 50)

In [169]:
y

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [172]:
clf = svm.SVC(kernel='linear')
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [175]:
estimated_movie_ratings = clf.decision_function(normalized_movies)
estimated_movie_ratings

array([-0.06298117,  0.52314811, -0.02402084, ..., -1.03483197,
       -0.87033811, -0.88188471])

In [176]:
estimated_movie_ratings.shape

(10000,)

In [177]:
best = np.argsort(estimated_movie_ratings)
best

array([1782, 6565, 5097, ...,  307,   66,  481], dtype=int64)

In [173]:
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])

best:
481 The Devil Wears Prada (film) 1.268666378510092
66 Skyfall 1.250339611102597
307 Les Misérables (2012 film) 1.1020438465534914
458 Hugo (film) 1.1013963012486419
18 Star Wars (film) 1.0508558368266598
worst:
1782 Scooby-Doo! WrestleMania Mystery -1.6151670621457392
6565 Son in Law -1.5452021357874368
5097 Ready to Rumble -1.5390608041800808
8559 Air Buddies -1.5285885448194856
9595 Speed Zone -1.5273252976931886


In [187]:
movies[0]

['Deadpool (film)',
 {'image': 'Deadpool poster.jpg',
  'name': 'Deadpool',
  'cinematography': 'Ken Seng',
  'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'distributor': '20th Century Fox',
  'caption': 'Theatrical release poster',
  'gross': '$783.1 million',
  'country': 'United States',
  'director': 'Tim Miller',
  'runtime': '108 minutes',
  'editing': 'Julian Clarke',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'budget': '$58 million'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [178]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_y

array([0.84, 0.82, 0.26, ..., 0.61, 0.88, 0.78])

In [179]:
rotten_y.shape

(5584,)

In [180]:
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])
rotten_X

array([[ 0.10584015, -0.08960813, -0.0132846 , ...,  0.07408299,
        -0.01117122,  0.29937539],
       [ 0.04346771,  0.0270928 ,  0.07228877, ...,  0.11296073,
         0.05219931,  0.3168648 ],
       [ 0.12529308, -0.12066261,  0.04466383, ...,  0.07842307,
         0.02425093,  0.31355506],
       ...,
       [ 0.14276731, -0.05943922,  0.26689276, ..., -0.00549476,
         0.01635706,  0.2739983 ],
       [-0.01881318, -0.00738083,  0.26481667, ...,  0.06428229,
         0.11268511,  0.28709304],
       [ 0.02228975, -0.12063567,  0.21590908, ...,  0.09719685,
        -0.00063777,  0.19944333]], dtype=float32)

In [181]:
rotten_X.shape

(5584, 50)

In [188]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

In [192]:
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
TRAINING_CUT_OFF

4467

In [193]:
rotten_X[:TRAINING_CUT_OFF]

array([[ 0.10584015, -0.08960813, -0.0132846 , ...,  0.07408299,
        -0.01117122,  0.29937539],
       [ 0.04346771,  0.0270928 ,  0.07228877, ...,  0.11296073,
         0.05219931,  0.3168648 ],
       [ 0.12529308, -0.12066261,  0.04466383, ...,  0.07842307,
         0.02425093,  0.31355506],
       ...,
       [ 0.04125948, -0.12770237,  0.21334249, ...,  0.14296196,
         0.00181898,  0.24280527],
       [ 0.12008058, -0.09832337,  0.23343502, ...,  0.05394334,
         0.0386282 ,  0.20845838],
       [ 0.12736519, -0.10409003,  0.20142396, ...,  0.06404588,
         0.00995082,  0.22289185]], dtype=float32)

In [194]:
rotten_X[:TRAINING_CUT_OFF].shape

(4467, 50)

In [189]:
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [197]:
rotten_y[:TRAINING_CUT_OFF].shape

(4467,)

In [362]:
rotten_y[TRAINING_CUT_OFF:].shape

(1117,)

In [363]:
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.06'

In [364]:
regr.predict(rotten_X[TRAINING_CUT_OFF:])

array([0.81741446, 0.5978348 , 0.36409116, ..., 0.77334124, 0.67653215,
       0.6384487 ], dtype=float32)

In [365]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
error

array([-0.27697784, -0.06697784,  0.19302216, ..., -0.00697784,
       -0.27697784, -0.17697784])

In [366]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.09'

In [367]:
def gross(movie):
    v = movie[1].get('gross')
    if not v or not ' ' in v:
        return None
    v, unit = v.split(' ', 1)
    unit = unit.lower()
    if not unit in ('million', 'billion'):
        return None
    if not v.startswith('$'):
        return None
    try:
        v = float(v[1:])
    except ValueError:
        return None
    if unit == 'billion':
        v *= 1000
    return v

movie_gross = [gross(m) for m in movies]
movie_gross = np.asarray([gr for gr in movie_gross if gr is not None])
highest = np.argsort(movie_gross)[-10:]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

6 The Martian (film) 10900.0
7 List of Marvel Cinematic Universe films 4300.0
49 Back to the Future 3900.0
71 The Conjuring 2932.0
162 Thor (film) 2464.0
36 Furious 7 2340.0
30 Finding Dory 2187.0
1906 Jane Eyre (2011 film) 2068.0
19 Interstellar (film) 1670.0
2251 An American Werewolf in London 1655.0


In [368]:
gross_y = np.asarray([gr for gr in movie_gross if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie, gr in zip(movies, movie_gross) if gr])

In [369]:
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [370]:
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 9061.02'

In [371]:
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 14115.59'