In [341]:
import requests
from bs4 import BeautifulSoup
import os
import time
try:
  from urllib.request import urlretrieve
except ImportError:
  from urllib import urlretrieve
import xml.sax
from sklearn import svm
import subprocess
import mwparserfromhell
import json
from collections import Counter
from itertools import chain
import numpy as np
import random
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression

In [15]:
index = requests.get('https://dumps.wikimedia.org/enwiki/').text

In [16]:
soup_index = BeautifulSoup(index, 'html.parser')

In [29]:
dumps = [a['href'] for a in soup_index.find_all('a') 
             if a.has_attr('href') and a.text[:-1].isdigit()]
dumps

[u'20170301/',
 u'20170320/',
 u'20170401/',
 u'20170420/',
 u'20170501/',
 u'20170520/',
 u'20170601/',
 u'20170620/']

In [35]:
for dump_url in sorted(dumps, reverse=True):
  print(dump_url)
  dump_html = index = requests.get('https://dumps.wikimedia.org/enwiki/' + dump_url).text
  soup_dump = BeautifulSoup(dump_html, 'html.parser')
  pages_xml = [a['href'] for a in soup_dump.find_all('a') 
             if a.has_attr('href') and a['href'].endswith('-pages-articles.xml.bz2')]
  if pages_xml:
    break
  time.sleep(0.8)

20170620/
20170601/


In [121]:
wikipedia_dump = pages_xml[0].rsplit('/')[-1]
wikipedia_dump = '/Users/douwe/Downloads/enwiki-20161201-pages-articles.xml.bz2'
if not os.path.isfile(wikipedia_dump):
  url = 'https://dumps.wikimedia.org/' + pages_xml[0]
  urlretrieve(url, wikipedia_dump)


In [153]:
def process_article(title, text):
  rotten = [(re.findall('\d\d?\d?%', p), re.findall('\d\.\d\/\d+|$', p), p.lower().find('rotten tomatoes')) for p in text.split('\n\n')]
  rating = next(((perc[0], rating[0]) for perc, rating, idx in rotten if len(perc) == 1 and idx > -1), (None, None))
  wikicode = mwparserfromhell.parse(text)
  film = next((template for template in wikicode.filter_templates() 
               if template.name.strip().lower() == 'infobox film'), None)
  if film:
    properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                  for param in film.params
                  if param.value.strip_code().strip()
                 }
    links = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
    return (title, properties, links) + rating

In [159]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
  def __init__(self):
    xml.sax.handler.ContentHandler.__init__(self)
    self._buffer = None
    self._values = {}
    self._movies = []
    self._curent_tag = None

  def characters(self, content):
    if self._curent_tag:
      self._buffer.append(content)

  def startElement(self, name, attrs):
    if name in ('title', 'text'):
      self._curent_tag = name
      self._buffer = []

  def endElement(self, name):
    if name == self._curent_tag:
      self._values[name] = ' '.join(self._buffer)

    if name == 'page':
      movie = process_article(**self._values)
      if movie:
        self._movies.append(movie)


In [160]:
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)
for line in subprocess.Popen(['bzcat'], stdin=open(wikipedia_dump), stdout=subprocess.PIPE).stdout:
  try:
    parser.feed(line)
  except StopIteration:
    break

ParserError: This is a bug and should be reported. Info: C tokenizer exited with BAD_ROUTE.

In [162]:
with open('wp_movies.ndjson', 'wt') as fout:
  for movie in handler._movies:
    fout.write(json.dumps(movie) + '\n')

In [290]:
with open('wp_movies.ndjson') as fin:
  movies = [json.loads(l) for l in fin]

In [242]:
link_counts = Counter()
for movie in movies:
  link_counts.update(movie[2])
link_counts.most_common(10)

[(u'Rotten Tomatoes', 9393),
 (u'Category:English-language films', 5882),
 (u'Category:American films', 5867),
 (u'Variety (magazine)', 5450),
 (u'Metacritic', 5112),
 (u'Box Office Mojo', 4186),
 (u'The New York Times', 3818),
 (u'The Hollywood Reporter', 3553),
 (u'Roger Ebert', 2707),
 (u'Los Angeles Times', 2454)]

In [244]:
top_links = [link for link, c in link_counts.items() if c >= 3]
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [245]:
def movie_embedding_model(embedding_size=30):
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie', shape=(1,))
    link_embedding = Embedding(name='link_embedding', input_dim=len(top_links), output_dim=embedding_size)(link)
    movie_embedding = Embedding(name='movie_embedding', input_dim=len(movie_to_idx), output_dim=embedding_size)(movie)
    dot = Dot(name='dot_product', normalize=True, axes=2)([link_embedding, movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link, movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model

model = movie_embedding_model()

In [246]:
import random
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=5):
    batch_size = positive_samples * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if not (link_id, movie_id) in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([ 36384.,  49508.,  63122.,   1940.,  22831.,  44562.,  31386.,
          60281.,  62614.]),
  'movie': array([ 2465.,  9424.,  4656.,  9223.,  4734.,  6901.,  1132.,  6489.,
          6207.])},
 array([-1., -1., -1., -1.,  1.,  1., -1., -1.,  1.]))

In [247]:
positive_samples_per_batch=256

model.fit_generator(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=5),
    epochs=25,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/25
116s - loss: 0.5132
Epoch 2/25
82s - loss: 0.3552
Epoch 3/25
81s - loss: 0.3495
Epoch 4/25
81s - loss: 0.3390
Epoch 5/25
82s - loss: 0.3349
Epoch 6/25
80s - loss: 0.3316
Epoch 7/25
113s - loss: 0.3340
Epoch 8/25
166s - loss: 0.3338
Epoch 9/25
149s - loss: 0.3327
Epoch 10/25
143s - loss: 0.3279
Epoch 11/25
92s - loss: 0.3231
Epoch 12/25
86s - loss: 0.3231
Epoch 13/25
83s - loss: 0.3226
Epoch 14/25
80s - loss: 0.3208
Epoch 15/25
86s - loss: 0.3226
Epoch 16/25
94s - loss: 0.3269
Epoch 17/25
84s - loss: 0.3212
Epoch 18/25
86s - loss: 0.3232
Epoch 19/25
92s - loss: 0.3229
Epoch 20/25
91s - loss: 0.3225
Epoch 21/25
86s - loss: 0.3197
Epoch 22/25
83s - loss: 0.3195
Epoch 23/25
79s - loss: 0.3217
Epoch 24/25
79s - loss: 0.3175
Epoch 25/25
88s - loss: 0.3209


<keras.callbacks.History at 0x14c90fd50>

In [253]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie):
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, movies[c][0], dists[c])

similar_movies('Rogue One')

(29, u'Rogue One', 1.0)
(3349, u'Star Wars: The Force Awakens', 0.97035921)
(19, u'Interstellar (film)', 0.96064037)
(101, u'Prometheus (2012 film)', 0.96027517)
(659, u'Rise of the Planet of the Apes', 0.95979834)
(25, u'Star Wars sequel trilogy', 0.95580703)
(85, u'Inception', 0.94961089)
(78, u'The Dark Knight Rises', 0.94925165)
(160, u'Jupiter Ascending', 0.94668275)
(372, u'The Amazing Spider-Man (2012 film)', 0.9456107)


In [257]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link):
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, top_links[c], dists[c])

#similar_links('George Lucas')
similar_links('Ridley Scott')

(28849, u'Ridley Scott', 1.0000001)
(31569, u'Academy Award for Best Sound Editing', 0.90983206)
(56826, u'List of films featuring surveillance', 0.86210239)
(28860, u'Time Out (company)', 0.86097997)
(62900, u'Christopher Nolan', 0.85351568)
(64214, u'American Cinematographer', 0.85347492)
(648, u'Central Intelligence Agency', 0.84286451)
(47167, u'Blade Runner', 0.84009099)
(44048, u'MSN', 0.83480686)
(12177, u'HD DVD', 0.83219337)


In [321]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
y = np.asarray([1 for _ in best] + [0 for _ in worst])
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])
X.shape

(16, 30)

In [324]:
clf = svm.SVC(kernel='linear')
clf.fit(X, y) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [330]:
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])


best:
(6870, u'Goodbye to Language', 1.24075226186855)
(6048, u'The Apu Trilogy', 1.2011876298842317)
(481, u'The Devil Wears Prada (film)', 1.1759994747169913)
(307, u'Les Mis\xe9rables (2012 film)', 1.1646775074857494)
(2106, u'A Separation', 1.1483743944891462)
worst:
(7889, u'The Comebacks', -1.5175929012505527)
(8837, u'The Santa Clause (film series)', -1.4651252650867073)
(2518, u'The Hot Chick', -1.464982008376793)
(6285, u'Employee of the Month (2006 film)', -1.4620595013243951)
(7339, u'Club Dread', -1.4593221506016203)


In [339]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

(5584, 30)

In [360]:
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [366]:
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.06'

In [367]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.09'

In [None]:
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, regr.predict(diabetes_X_test), color='blue',
         linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

In [348]:
ck = movies[movie_to_idx['Indiana_Jones_and_the_Kingdom_of_the_Crystal_Skull'.replace('_', ' ')]]
ck[0], ck[-2]

(u'Indiana Jones and the Kingdom of the Crystal Skull', u'77%')

In [350]:
regr.predict([normalized_movies[movie_to_idx['Indiana_Jones_and_the_Kingdom_of_the_Crystal_Skull'.replace('_', ' ')]]])

array([ 0.78593689])