In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time
try:
  from urllib.request import urlretrieve
except ImportError:
  from urllib import urlretrieve
import xml.sax
from sklearn import svm
import subprocess
import mwparserfromhell
import json
from collections import Counter
from itertools import chain
import numpy as np
import random
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression

Using TensorFlow backend.


In [2]:
# Load the movie dump: one JSON document per line.
# Later cells index each record positionally: [0] is used as the title key,
# [1] is queried with .get('gross'), [2] is iterated as the outgoing links and
# [-2] is parsed as a percentage string.
with open('data/wp_movies_10k.ndjson') as fin:
  movies = list(map(json.loads, fin))

In [3]:
# Tally the outgoing links (record[2]) across every movie, then show the ten
# most frequent ones.
# presumably record[2] is a list of link names; verify against the data file
link_counts = Counter()
for outgoing_links in (record[2] for record in movies):
  link_counts.update(outgoing_links)
link_counts.most_common(10)

[(u'Rotten Tomatoes', 9393),
 (u'Category:English-language films', 5882),
 (u'Category:American films', 5867),
 (u'Variety (magazine)', 5450),
 (u'Metacritic', 5112),
 (u'Box Office Mojo', 4186),
 (u'The New York Times', 3818),
 (u'The Hollywood Reporter', 3553),
 (u'Roger Ebert', 2707),
 (u'Los Angeles Times', 2454)]

In [4]:
# Keep only links that occur at least 3 times, and give every kept link and
# every movie title a dense integer id.
top_links = [name for name, count in link_counts.items() if count >= 3]
link_to_idx = dict((name, position) for position, name in enumerate(top_links))
movie_to_idx = dict((record[0], position) for position, record in enumerate(movies))

# One (link_id, movie_id) pair for every kept link found on a movie's page.
pairs = [
    (link_to_idx[name], movie_to_idx[record[0]])
    for record in movies
    for name in record[2]
    if name in link_to_idx
]
# Set copy of the pairs for fast membership tests during negative sampling.
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [5]:
def movie_embedding_model(embedding_size=30):
    """Build and compile the link/movie embedding network.

    The model takes two integer inputs named 'link' and 'movie' (one id each),
    looks each up in its own embedding table and outputs the normalized dot
    product of the two vectors, reshaped to a single value per sample.

    Args:
        embedding_size: Dimensionality of both embedding tables.

    Returns:
        A Keras ``Model`` compiled with the 'nadam' optimizer and 'mse' loss.
        The table sizes are read from the module-level ``top_links`` and
        ``movie_to_idx`` at call time.
    """
    link_input = Input(name='link', shape=(1,))
    movie_input = Input(name='movie', shape=(1,))

    link_vectors = Embedding(
        name='link_embedding',
        input_dim=len(top_links),
        output_dim=embedding_size,
    )(link_input)
    movie_vectors = Embedding(
        name='movie_embedding',
        input_dim=len(movie_to_idx),
        output_dim=embedding_size,
    )(movie_input)

    # normalize=True L2-normalizes both vectors before the dot product.
    similarity = Dot(name='dot_product', normalize=True, axes=2)([link_vectors, movie_vectors])
    score = Reshape((1,))(similarity)

    network = Model(inputs=[link_input, movie_input], outputs=[score])
    network.compile(optimizer='nadam', loss='mse')
    return network

model = movie_embedding_model()

In [6]:
import random
# Seed Python's `random` module, which batchifier uses for sampling pairs.
# NOTE(review): batchifier also calls np.random.shuffle, which this seed does
# not control, so the row order inside a batch can differ between runs.
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=5):
    """Endlessly generate training batches of positive and negative pairs.

    Every batch holds ``positive_samples`` pairs drawn from ``pairs`` (label 1)
    and ``positive_samples * negative_ratio`` random (link, movie) id pairs
    that are not in the module-level ``pairs_set`` (label -1), in shuffled
    order.

    Args:
        pairs: Sequence of (link_id, movie_id) tuples; ``random.sample``
            requires it to hold at least ``positive_samples`` items.
        positive_samples: Number of positive rows per batch.
        negative_ratio: Number of negative rows generated per positive row.

    Yields:
        ``({'link': link_ids, 'movie': movie_ids}, labels)`` where each value
        is a 1-D float array of length
        ``positive_samples * (1 + negative_ratio)``.

    Note:
        Negative ids are drawn from ``range(len(top_links))`` and
        ``range(len(movie_to_idx))`` (module-level). The rejection loop only
        advances when it finds a pair outside ``pairs_set``.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    while True:
        # Allocate a fresh buffer per batch. The yielded arrays are views into
        # it, so reusing one buffer would overwrite batches that a consumer is
        # still holding (e.g. in a prefetch queue) while the generator runs on.
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

# Smoke test: peek at one tiny batch (3 positives, 6 negatives).
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([ 36384.,  31386.,   1940.,  49508.,  22831.,  62614.,  63122.,
          60281.,  44562.]),
  'movie': array([ 2465.,  1132.,  9223.,  9424.,  4734.,  6207.,  4656.,  6489.,
          6901.])},
 array([-1., -1., -1., -1.,  1.,  1., -1., -1.,  1.]))

In [7]:
# Train on generated batches. Each step draws 256 positives plus 5 negatives
# per positive; steps_per_epoch is chosen so that one epoch draws roughly
# len(pairs) positive samples.
positive_samples_per_batch = 256
training_batches = batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=5)
batches_per_epoch = len(pairs) // positive_samples_per_batch

model.fit_generator(
    training_batches,
    epochs=10,
    steps_per_epoch=batches_per_epoch,
    verbose=2,
)

Epoch 1/10
83s - loss: 0.5052
Epoch 2/10
129s - loss: 0.3660
Epoch 3/10
139s - loss: 0.3530
Epoch 4/10
143s - loss: 0.3418
Epoch 5/10
107s - loss: 0.3348
Epoch 6/10
102s - loss: 0.3316
Epoch 7/10
92s - loss: 0.3274
Epoch 8/10
95s - loss: 0.3269
Epoch 9/10
82s - loss: 0.3251
Epoch 10/10
82s - loss: 0.3258


<keras.callbacks.History at 0x120119590>

In [8]:
# Pull the trained weight matrix out of the 'movie_embedding' layer; its rows
# are addressed by the integer ids stored in movie_to_idx.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
# Scale every row to unit length so that the dot product of two rows is their
# cosine similarity.
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie, n=10):
    """Print the movies whose embeddings are most similar to ``movie``.

    Similarity is the dot product between rows of ``normalized_movies``
    (unit-length rows, so this is cosine similarity). The query movie itself
    is part of the ranking.

    Args:
        movie: Title of the query movie; must be a key of ``movie_to_idx``.
        n: How many results to print, most similar first. Expected to be a
            non-negative integer; defaults to 10.

    Raises:
        KeyError: If ``movie`` is not a key of ``movie_to_idx``.
    """
    similarities = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    # Reverse the full ascending ranking before slicing: unlike a `[-n:]`
    # slice, this yields nothing (rather than everything) for n == 0.
    closest = np.argsort(similarities)[::-1][:n]
    for c in closest:
        print(c, movies[c][0], similarities[c])

similar_movies('Rogue One')

(29, u'Rogue One', 1.0000001)
(19, u'Interstellar (film)', 0.95177352)
(78, u'The Dark Knight Rises', 0.94208515)
(37, u'Avatar (2009 film)', 0.93488532)
(245, u'Gravity (film)', 0.93028849)
(101, u'Prometheus (2012 film)', 0.92305899)
(6, u'The Martian (film)', 0.91487736)
(85, u'Inception', 0.91336185)
(35, u'Titanic (1997 film)', 0.91242737)
(3349, u'Star Wars: The Force Awakens', 0.91157913)


In [9]:
# Pull the trained weight matrix out of the 'link_embedding' layer; its rows
# are addressed by the integer ids stored in link_to_idx.
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
# Scale every row to unit length so that the dot product of two rows is their
# cosine similarity.
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link, n=10):
    """Print the links whose embeddings are most similar to ``link``.

    Similarity is the dot product between rows of ``normalized_links``
    (unit-length rows, so this is cosine similarity). The query link itself
    is part of the ranking.

    Args:
        link: Name of the query link; must be a key of ``link_to_idx``.
        n: How many results to print, most similar first. Expected to be a
            non-negative integer; defaults to 10.

    Raises:
        KeyError: If ``link`` is not a key of ``link_to_idx``.
    """
    similarities = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    # Reverse the full ascending ranking before slicing: unlike a `[-n:]`
    # slice, this yields nothing (rather than everything) for n == 0.
    closest = np.argsort(similarities)[::-1][:n]
    for c in closest:
        print(c, top_links[c], similarities[c])

#similar_links('George Lucas')
similar_links('Ridley Scott')

(28849, u'Ridley Scott', 1.0)
(43281, u'Michael Mann (director)', 0.9273653)
(3572, u'Russell Crowe', 0.90101695)
(648, u'Central Intelligence Agency', 0.89117438)
(61820, u'Christian Bale', 0.88127202)
(55035, u'Category:Films about death', 0.87207139)
(13972, u'John Malkovich', 0.86916643)
(42123, u'Gary Oldman', 0.86848545)
(66371, u'Pietro Scalia', 0.86190081)
(34769, u'Tom Hardy', 0.85981435)


In [10]:
# Small hand-labelled training set: titles in `best` get label 1, titles in
# `worst` get label 0. Features are the unit-length movie embeddings.
best = [
    'Star Wars: The Force Awakens',
    'The Martian (film)',
    'Tangerine (film)',
    'Straight Outta Compton (film)',
    'Brooklyn (film)',
    'Carol (film)',
    'Spotlight (film)',
]
worst = [
    'American Ultra',
    'The Cobbler (2014 film)',
    'Entourage (film)',
    'Fantastic Four (2015 film)',
    'Get Hard',
    'Hot Pursuit (2015 film)',
    'Mortdecai (film)',
    'Serena (2014 film)',
    'Vacation (2015 film)',
]
labelled_titles = best + worst
y = np.asarray([1] * len(best) + [0] * len(worst))
X = np.asarray([normalized_movies[movie_to_idx[title]] for title in labelled_titles])
X.shape

(16, 30)

In [11]:
# Fit a linear-kernel SVM on the labelled embeddings X and their 1/0 labels y.
clf = svm.SVC(kernel='linear')
clf.fit(X, y) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Score every movie by its signed distance to the SVM decision boundary and
# list the five highest- and five lowest-scoring titles.
estimated_movie_ratings = clf.decision_function(normalized_movies)
# NOTE: from here on `best` holds movie indices sorted by ascending score,
# replacing the list of titles it held in the labelling cell.
best = np.argsort(estimated_movie_ratings)
sections = (
    ('best:', best[-5:][::-1]),
    ('worst:', best[:5]),
)
for heading, indices in sections:
    print(heading)
    for c in indices:
        print(c, movies[c][0], estimated_movie_ratings[c])


best:
(134, u'Citizen Kane', 1.2534138896419882)
(18, u'Star Wars (film)', 1.1637167034149256)
(481, u'The Devil Wears Prada (film)', 1.0321495142367656)
(3349, u'Star Wars: The Force Awakens', 1.0004163085185958)
(70, u'Carol (film)', 0.9997918735374014)
worst:
(6388, u'Bring It On Again', -1.7589806427440513)
(8025, u"The Slammin' Salmon", -1.6625135207552726)
(3782, u'Air Bud (series)', -1.6249742562202758)
(4504, u'Teen Witch', -1.6248108506469687)
(4173, u'Countdown (2016 film)', -1.6172925286645881)


In [13]:
# Regression data set: only movies whose second-to-last field is non-empty.
# That field is parsed by dropping its final character and dividing by 100
# (presumably a percentage string such as "85%"; verify against the data).
rated_movies = [record for record in movies if record[-2]]
rotten_y = np.asarray([float(record[-2][:-1]) / 100 for record in rated_movies])
rotten_X = np.asarray([normalized_movies[movie_to_idx[record[0]]] for record in rated_movies])

In [14]:
# Fit on the first 80% of the rows; the remaining 20% are held out.
# NOTE(review): the split follows the stored order of `movies` without
# shuffling; if that order is not random the held-out set may be biased.
TRAINING_CUT_OFF = int(0.8 * len(rotten_X))
rotten_train_X = rotten_X[:TRAINING_CUT_OFF]
rotten_train_y = rotten_y[:TRAINING_CUT_OFF]
regr = LinearRegression()
regr.fit(rotten_train_X, rotten_train_y)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
# Mean squared error of the regression on the held-out rows.
held_out_predictions = regr.predict(rotten_X[TRAINING_CUT_OFF:])
error = held_out_predictions - rotten_y[TRAINING_CUT_OFF:]
'mean square error %2.2f' % np.mean(np.square(error))

'mean square error 0.06'

In [16]:
# Baseline for comparison: always predict the mean of the training targets.
baseline_prediction = np.mean(rotten_y[:TRAINING_CUT_OFF])
error = baseline_prediction - rotten_y[TRAINING_CUT_OFF:]
'mean square error %2.2f' % np.mean(np.square(error))

'mean square error 0.09'

In [40]:
def gross(movie):
  """Parse a movie's 'gross' field into a number of millions.

  The field is read from ``movie[1]`` and must look like ``'$<number> <unit>'``
  where the unit (case-insensitive) is exactly 'million' or 'billion'.

  Args:
    movie: Record whose element 1 is a mapping that may contain 'gross'.

  Returns:
    The amount as a float in millions (billions are multiplied by 1000), or
    None when the field is missing, empty or not in the expected format.
  """
  raw = movie[1].get('gross')
  if not raw or ' ' not in raw:
    return None
  amount, unit = raw.split(' ', 1)
  # Multiplier that converts each accepted unit into millions.
  factor = {'million': 1, 'billion': 1000}.get(unit.lower())
  if factor is None or not amount.startswith('$'):
    return None
  try:
    value = float(amount[1:])
  except ValueError:
    return None
  return value * factor

# Parsed gross (in millions) per movie; None where the field could not be parsed.
movie_gross = [gross(m) for m in movies]
# Rank on a copy in which missing values become -inf. Sorting the raw list
# would compare None with floats, which raises TypeError on Python 3.
sortable_gross = [float('-inf') if g is None else g for g in movie_gross]
# Indices of the ten largest values, printed from highest to lowest.
highest = np.argsort(sortable_gross)[-10:]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

(7, u'List of Marvel Cinematic Universe films', 10900.0)
(8, u'X-Men (film series)', 4300.0)
(63, u'The Fast and the Furious', 3900.0)
(91, u'The Hobbit (film series)', 2932.0)
(221, u'The Dark Knight Trilogy', 2464.0)
(45, u'The Hunger Games (film series)', 2340.0)
(35, u'Titanic (1997 film)', 2187.0)
(3349, u'Star Wars: The Force Awakens', 2068.0)
(22, u'Jurassic World', 1670.0)
(4518, u'Men in Black (film series)', 1655.0)


In [43]:
# Regression data set: movies with a truthy parsed gross, paired with their
# unit-length embeddings.
# NOTE(review): the truthiness filter drops a parsed gross of 0.0 as well as None.
movies_with_gross = [(record, amount) for record, amount in zip(movies, movie_gross) if amount]
gross_y = np.asarray([amount for _, amount in movies_with_gross])
gross_X = np.asarray([normalized_movies[movie_to_idx[record[0]]] for record, _ in movies_with_gross])

In [44]:
# Fit on the first 80% of the rows; the remaining 20% are held out.
# NOTE: this rebinds TRAINING_CUT_OFF and regr, which the Rotten Tomatoes
# cells also use; re-running those cells afterwards picks up these values.
TRAINING_CUT_OFF = int(0.8 * len(gross_X))
gross_train_X = gross_X[:TRAINING_CUT_OFF]
gross_train_y = gross_y[:TRAINING_CUT_OFF]
regr = LinearRegression()
regr.fit(gross_train_X, gross_train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [62]:
# Mean squared error of the regression on the held-out rows.
held_out_predictions = regr.predict(gross_X[TRAINING_CUT_OFF:])
error = held_out_predictions - gross_y[TRAINING_CUT_OFF:]
'mean square error %2.2f' % np.mean(np.square(error))

'mean square error 6758.32'

In [63]:
# Baseline for comparison: always predict the mean of the training targets.
baseline_prediction = np.mean(gross_y[:TRAINING_CUT_OFF])
error = baseline_prediction - gross_y[TRAINING_CUT_OFF:]
'mean square error %2.2f' % np.mean(np.square(error))

'mean square error 14115.59'