In [2]:
import requests
from bs4 import BeautifulSoup
import os
import time
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import xml.sax
from sklearn import svm
import subprocess
import mwparserfromhell
import json
from collections import Counter
from itertools import chain
import numpy as np
import random
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
import pickle
import gensim
from sklearn.decomposition import TruncatedSVD
import psycopg2

In [2]:
# Load the movie dump: one JSON document per line (NDJSON).
# Judging by the record shown further down, each entry is a list whose first
# element is the title, second the infobox attributes and third the list of
# outgoing link names.
# The encoding is given explicitly because titles contain non-ASCII characters
# and the platform default is not guaranteed to be UTF-8; blank lines are
# skipped so that a stray empty line does not crash json.loads.
with open('data/wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin if line.strip()]

In [3]:
# Count in how many places every outgoing link is referenced across all movies.
link_counts = Counter(chain.from_iterable(entry[2] for entry in movies))

# Only links seen at least three times get an embedding row.
top_links = [name for name, seen in link_counts.items() if seen >= 3]
link_to_idx = dict(zip(top_links, range(len(top_links))))
movie_to_idx = {entry[0]: position for position, entry in enumerate(movies)}

# Every (link index, movie index) combination that actually occurs in the data.
pairs = [
    (link_to_idx[name], movie_to_idx[entry[0]])
    for entry in movies
    for name in entry[2]
    if name in link_to_idx
]
pairs_set = set(pairs)  # O(1) membership checks while sampling negatives
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [4]:
def movie_embedding_model(embedding_size=30):
    """Build and compile the link/movie embedding network.

    The model takes one link index and one movie index, looks each up in its
    own embedding table and outputs the normalised dot product of the two
    vectors, reshaped to a single value per sample.

    The table sizes are read from the module-level ``top_links`` and
    ``movie_to_idx``.

    Args:
        embedding_size: dimensionality of both embedding tables.

    Returns:
        A Keras ``Model`` compiled with the ``nadam`` optimizer and MSE loss.
    """
    link_in = Input(name='link', shape=(1,))
    movie_in = Input(name='movie', shape=(1,))

    embed_link = Embedding(name='link_embedding',
                           input_dim=len(top_links),
                           output_dim=embedding_size)
    embed_movie = Embedding(name='movie_embedding',
                            input_dim=len(movie_to_idx),
                            output_dim=embedding_size)

    # normalize=True makes the Dot layer L2-normalise both vectors first.
    similarity = Dot(name='dot_product', normalize=True, axes=2)(
        [embed_link(link_in), embed_movie(movie_in)])
    score = Reshape((1,))(similarity)

    network = Model(inputs=[link_in, movie_in], outputs=[score])
    network.compile(optimizer='nadam', loss='mse')
    return network

# Instantiate the network with the default embedding size (30).
model = movie_embedding_model()

In [5]:
# Seed BOTH random number generators used by batchifier: it samples with the
# stdlib `random` module but shuffles each batch with `np.random`, so seeding
# only one of them leaves the batches non-reproducible.
import random
random.seed(5)
np.random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=5):
    """Endlessly yield training batches of positive and negative examples.

    Each batch holds ``positive_samples`` pairs drawn from ``pairs`` (label 1)
    plus ``positive_samples * negative_ratio`` random (link, movie)
    combinations that do not occur in the module-level ``pairs_set``
    (label -1). Rows are shuffled before the batch is yielded.

    Reads the module-level ``movie_to_idx``, ``top_links`` and ``pairs_set``.

    Args:
        pairs: sequence of (link index, movie index) positive examples.
        positive_samples: number of positive rows per batch.
        negative_ratio: negative rows generated per positive row.

    Yields:
        ``({'link': link_ids, 'movie': movie_ids}, labels)`` where all three
        arrays are float columns of one freshly allocated batch array.

    Raises:
        ValueError: from ``random.sample`` if ``positive_samples`` exceeds
            ``len(pairs)``.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    while True:
        # Allocate a new buffer for every batch. The yielded arrays are views
        # into it, and a consumer that holds on to several batches (for
        # example a prefetch queue) must not see them overwritten by the next
        # fill.
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            # Rejection sampling: draw random combinations until one is not a
            # known positive pair.
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)  # shuffles rows in place
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

# Peek at one tiny batch: 3 positive rows plus 3 * 2 negative rows.
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([ 32643.,  32318.,  42152.,  13365.,  18175.,  20558.,   3801.,
          46728.,  48731.]),
  'movie': array([ 7628.,  7685.,  1529.,  6238.,  7236.,   849.,  5874.,  5530.,
          1854.])},
 array([-1., -1.,  1., -1.,  1., -1., -1.,  1., -1.]))

In [6]:
positive_samples_per_batch = 256

# Size an epoch so that it draws about as many positive examples as there are
# known (link, movie) pairs.
training_batches = batchifier(pairs,
                              positive_samples=positive_samples_per_batch,
                              negative_ratio=5)
batches_per_epoch = len(pairs) // positive_samples_per_batch

model.fit_generator(
    training_batches,
    steps_per_epoch=batches_per_epoch,
    epochs=10,
    verbose=2,
)

Epoch 1/10
148s - loss: 0.5145
Epoch 2/10
152s - loss: 0.3522
Epoch 3/10
172s - loss: 0.3374
Epoch 4/10
182s - loss: 0.3289
Epoch 5/10
172s - loss: 0.3256
Epoch 6/10
167s - loss: 0.3235
Epoch 7/10
158s - loss: 0.3210
Epoch 8/10
152s - loss: 0.3204
Epoch 9/10
153s - loss: 0.3196
Epoch 10/10
148s - loss: 0.3196


<keras.callbacks.History at 0x1281d6400>

In [7]:
# Take the learned movie embedding table out of the trained model and scale
# every row to unit length, so that a plain dot product between two rows is
# their cosine similarity.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = movie_weights / movie_lengths[:, np.newaxis]

def similar_movies(movie):
    """Print the ten movies whose embeddings are closest to ``movie``.

    Similarity is the dot product with the rows of the module-level
    ``normalized_movies``; results are printed best-first as
    ``index title score``. The queried movie itself is part of the ranking.

    Args:
        movie: title to look up in the module-level ``movie_to_idx``
            (a missing title raises ``KeyError``).
    """
    query = normalized_movies[movie_to_idx[movie]]
    dists = normalized_movies.dot(query)
    # argsort is ascending: keep the last ten entries and walk them backwards.
    best_first = np.argsort(dists)[-10:][::-1]
    for c in best_first:
        print(c, movies[c][0], dists[c])

# Sanity check: the nearest neighbours of a well-known title.
similar_movies('Rogue One')

29 Rogue One 1.0
101 Prometheus (2012 film) 0.95705
3349 Star Wars: The Force Awakens 0.955909
659 Rise of the Planet of the Apes 0.953989
25 Star Wars sequel trilogy 0.94565
61 Man of Steel (film) 0.943233
19 Interstellar (film) 0.942833
413 Superman Returns 0.940903
221 The Dark Knight Trilogy 0.94027
22 Jurassic World 0.938769


In [24]:
# Recompute the unit-length movie embeddings from the trained model.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = movie_weights / movie_lengths[:, np.newaxis]

# Index the embeddings for 10-nearest-neighbour lookups (fit returns the
# estimator itself, so fitting in a separate statement is equivalent).
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
nbrs.fit(normalized_movies)

# Persist everything needed to answer similarity queries without the model.
with open('data/movie_model.pkl', 'wb') as fout:
    pickle.dump(
        dict(nbrs=nbrs,
             normalized_movies=normalized_movies,
             movie_to_idx=movie_to_idx),
        fout)

In [29]:
# Reload the pickled lookup structures and query them.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('data/movie_model.pkl', 'rb') as fin:
    m = pickle.load(fin)

# Build the index -> title list from the *pickled* mapping rather than the
# in-memory global, so this cell also works in a session that only has the
# pickle file available.
movie_names = [x[0] for x in sorted(m['movie_to_idx'].items(), key=lambda t: t[1])]
distances, indices = m['nbrs'].kneighbors(
    [m['normalized_movies'][m['movie_to_idx']['Rogue One']]])
for idx in indices[0]:
    print(movie_names[idx])

Rogue One
Prometheus (2012 film)
Star Wars: The Force Awakens
Rise of the Planet of the Apes
Star Wars sequel trilogy
Man of Steel (film)
Interstellar (film)
Superman Returns
The Dark Knight Trilogy
Jurassic World


In [38]:
# Database settings are read from the environment (using the standard libpq
# variable names) so that no secret is stored in the notebook.
# NOTE(review): a real password used to be hard-coded here; treat it as leaked
# and rotate it.
DB_NAME = os.environ.get('PGDATABASE', 'douwe')
USER = os.environ.get('PGUSER', 'djangosite')
HOST = os.environ.get('PGHOST', '127.0.0.1')
PWD = os.environ.get('PGPASSWORD')
if PWD is None:
    raise RuntimeError('Set the PGPASSWORD environment variable before connecting')

# Keyword arguments avoid hand-building a quoted DSN string, which breaks when
# a value contains a quote or a backslash.
conn = psycopg2.connect(dbname=DB_NAME, user=USER, password=PWD, host=HOST)

In [44]:
# Trial run: store only the first movie and its embedding, then commit.
insert_sql = 'INSERT INTO movie (movie_name, embedding) VALUES (%s, %s)'
with conn.cursor() as cursor:
    cursor.execute(insert_sql, (movie_names[0], normalized_movies[0].tolist()))
conn.commit()

In [52]:
# Empty the movie table (removes every row) and commit.
with conn.cursor() as cursor:
    cursor.execute('DELETE FROM movie;')
conn.commit()

In [53]:
# Store every movie title together with its unit-length embedding; all rows
# are committed in one transaction.
insert_sql = ('INSERT INTO movie (movie_name, embedding)'
              ' VALUES (%s, %s)')
rows = ((entry[0], vector.tolist())
        for entry, vector in zip(movies, normalized_movies))
with conn.cursor() as cursor:
    for row in rows:
        cursor.execute(insert_sql, row)
conn.commit()

In [59]:
# Discard the current transaction on the connection.
# NOTE(review): presumably run by hand to recover after a failed statement — confirm it is still needed.
conn.rollback()

In [83]:
def recommend_movies(conn, q, limit=5):
    """Return the movies closest to the first title containing ``q``.

    The title is matched case-insensitively as a substring. The embedding of
    the first match is then compared against every stored embedding with the
    Postgres ``cube_distance`` function.

    Args:
        conn: open database connection whose ``cursor()`` works as a context
            manager (e.g. a psycopg2 connection).
        q: text to search for inside movie titles; it is matched literally.
        limit: maximum number of rows to return (default 5).

    Returns:
        A list of ``(movie_name, distance)`` rows ordered by increasing
        distance, or an empty list when no title matches.
    """
    # Escape LIKE metacharacters (backslash is the default escape character)
    # so that '%' and '_' typed by the user are not treated as wildcards.
    needle = (q.lower()
              .replace('\\', '\\\\')
              .replace('%', '\\%')
              .replace('_', '\\_'))
    with conn.cursor() as cursor:
        cursor.execute('SELECT movie_name, embedding FROM movie'
                       '    WHERE lower(movie_name) LIKE %s'
                       '    LIMIT 1',
                       ('%' + needle + '%',))
        match = cursor.fetchone()
        if match is None:
            # No title contains the search text.
            return []
        _, embedding = match
        cursor.execute('SELECT movie_name, '
                       '       cube_distance(cube(embedding), '
                       '                     cube(%s)) as distance '
                       '    FROM movie'
                       '    ORDER BY distance'
                       '    LIMIT %s',
                       (embedding, limit))
        return list(cursor.fetchall())
    
# Example lookup by partial title.
recommend_movies(conn, 'The Force Awakens')

[('Star Wars: The Force Awakens', 1.26221913570934e-15),
 ('Doctor Strange (film)', 0.294215885064129),
 ('Rogue One', 0.296954225761183),
 ('The Avengers (2012 film)', 0.299128019284761),
 ('Star Trek Into Darkness', 0.368348739947157)]

In [81]:
# `emb` was not defined by any cell in this notebook, so a fresh run failed
# with NameError. It is now derived from the in-memory embeddings for the
# title that the recorded output of this cell is centred on.
emb = normalized_movies[movie_to_idx['Star Wars: The Force Awakens']].tolist()

# Rank all stored movies by cube distance to that embedding.
with conn.cursor() as cursor:
    cursor.execute('SELECT movie_name, cube_distance(cube(embedding), cube(%s)) as distance '
                   '    FROM movie'
                   '    ORDER BY distance'
                   '    LIMIT 5',
                   (emb,))
    x = list(cursor)
x

[('Star Wars: The Force Awakens', 1.26221913570934e-15),
 ('Doctor Strange (film)', 0.294215885064129),
 ('Rogue One', 0.296954225761183),
 ('The Avengers (2012 film)', 0.299128019284761),
 ('Star Trek Into Darkness', 0.368348739947157)]

In [63]:
# Show the raw structure of one movie record.
movies[0]

['Deadpool (film)',
 {'Software Used': 'Adobe Premier Pro',
  'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
  'budget': '$58 million',
  'caption': 'Theatrical release poster',
  'cinematography': 'Ken Seng',
  'country': 'United States',
  'director': 'Tim Miller',
  'distributor': '20th Century Fox',
  'editing': 'Julian Clarke',
  'gross': '$783.1 million',
  'image': 'Deadpool poster.jpg',
  'language': 'English',
  'music': 'Tom Holkenborg',
  'name': 'Deadpool',
  'runtime': '108 minutes'},
 ['Tim Miller (director)',
  'Simon Kinberg',
  'Ryan Reynolds',
  'Lauren Shuler Donner',
  'Rhett Reese',
  'Paul Wernick',
  'Deadpool',
  'Fabian Nicieza',
  'Rob Liefeld',
  'Morena Baccarin',
  'Ed Skrein',
  'T.J. Miller',
  'Gina Carano',
  'Leslie Uggams',
  'Brianna Hildebrand',
  'Stefan Kapičić',
  'Junkie

In [3]:
# Path of the word2vec vectors file, loaded in binary format.
MODEL = 'GoogleNews-vectors-negative300.bin'
# NOTE: this rebinds the global `model`, which earlier cells use for the Keras
# network; cells above this point must not be re-run after it without
# rebuilding that model.
model = gensim.models.KeyedVectors.load_word2vec_format(MODEL, binary=True)

In [4]:
# Reference result from the library's own nearest-neighbour search.
model.most_similar(positive=['espresso'])

[('cappuccino', 0.688818633556366),
 ('mocha', 0.6686208248138428),
 ('coffee', 0.6616827249526978),
 ('latte', 0.6536752581596375),
 ('caramel_macchiato', 0.6491269469261169),
 ('ristretto', 0.6485545635223389),
 ('espressos', 0.6438629627227783),
 ('macchiato', 0.6428249478340149),
 ('chai_latte', 0.6308027505874634),
 ('espresso_cappuccino', 0.6280543804168701)]

In [18]:
def most_similar(norm, positive, topn=10):
    """Return the ``topn`` rows of ``norm`` most similar to a word.

    Similarity is the dot product between the word's row and every row of
    ``norm``. The word is resolved through the module-level ``model``
    (``model.vocab`` for the row index, ``model.index2word`` for the names),
    so ``norm`` must use the same row order as ``model``.

    Args:
        norm: 2-D array with one vector per vocabulary entry.
        positive: the query word; a word missing from ``model.vocab`` raises
            ``KeyError``.
        topn: number of results wanted (default 10). Fewer are returned when
            ``norm`` has fewer rows.

    Returns:
        A list of ``(word, score)`` tuples sorted by decreasing score. The
        query word itself is included.
    """
    vec = norm[model.vocab[positive].index]
    dists = np.dot(norm, vec)
    # Clamp to the number of rows: argpartition raises when kth is out of
    # bounds, which used to break vocabularies of ten entries or fewer.
    k = min(topn, len(dists))
    if k <= 0:
        return []
    # Partial selection of the k largest scores (unordered), sorted afterwards.
    most_extreme = np.argpartition(-dists, k - 1)[:k]
    res = ((model.index2word[idx], dists[idx]) for idx in most_extreme)
    return sorted(res, key=lambda t: t[1], reverse=True)

# Same query as the library call above, but computed by hand on the normalised vectors.
# NOTE(review): `model.syn0norm` is presumably only populated after the library's own
# most_similar has been called once (older gensim API) — confirm before reordering cells.
for word, score in most_similar(model.syn0norm, 'espresso'):
    print(word, score)

espresso 1.0
cappuccino 0.688819
mocha 0.668621
coffee 0.661683
latte 0.653675
caramel_macchiato 0.649127
ristretto 0.648555
espressos 0.643863
macchiato 0.642825
chai_latte 0.630803


In [22]:
# Project the word vectors down to 100 components with a fixed random_state.
svd = TruncatedSVD(n_components=100, random_state=42, n_iter=40)
reduced = svd.fit_transform(model.syn0norm)

In [21]:
# Re-normalise the projected vectors to unit length so that dot products are
# cosine similarities again.
reduced_lengths = np.linalg.norm(reduced, axis=1)
normalized_reduced = reduced / reduced_lengths[:, np.newaxis]
normalized_reduced.shape

(3000000, 100)

In [19]:
# Repeat the query in the reduced space to compare it with the full-size result.
for word, score in most_similar(normalized_reduced, 'espresso'):
    print(word, score)

espresso 1.0
cappuccino 0.856463080029
chai_latte 0.835657488972
latte 0.800340435865
macchiato 0.798796776324
espresso_machine 0.791469456128
Lavazza_coffee 0.790783985201
mocha 0.788645681469
espressos 0.78424218748
martini 0.784037414689


In [15]:
# `dists` and `most_extreme` exist only as locals inside most_similar, so this
# cell failed with NameError on a fresh kernel. Recompute them here to show
# the raw, unsorted result of the partial selection.
dists = np.dot(model.syn0norm, model.syn0norm[model.vocab['espresso'].index])
most_extreme = np.argpartition(-dists, 10)[:10]
for idx in most_extreme:
    print(model.index2word[idx], dists[idx])

espresso 1.0
mocha 0.668621
coffee 0.661683
cappuccino 0.688819
latte 0.653675
caramel_macchiato 0.649127
espressos 0.643863
ristretto 0.648555
macchiato 0.642825
chai_latte 0.630803
