## Libraries

In [1]:
import re
import os
import time
import json
import urllib
import random
import requests
import xml.sax
import subprocess
import mwparserfromhell
import torch
import torch.nn as nn 
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import Counter
from urllib.request import urlretrieve
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

## Fetch data from Wikipedia

In [None]:
# download the 2020-08-20 English Wikipedia dump (compressed XML file).
# Dumps are published under /<wiki>/<date>/ on the Wikimedia dump server.
# NOTE(review): older dumps may get removed from the server -- confirm that this
# date is still available or switch to a newer one.
url = 'https://dumps.wikimedia.org/enwiki/20200820/enwiki-20200820-pages-articles.xml.bz2'
dump_path = './data/enwiki-20200820-pages-articles.xml.bz2'

# urlretrieve does not create missing directories
os.makedirs(os.path.dirname(dump_path), exist_ok = True)

# the dump is a large download, so skip it when the file is already on disk
if not os.path.exists(dump_path):
    # download to a temporary name first so that an interrupted transfer is
    # never mistaken for a complete dump on the next run
    partial_path = dump_path + '.part'
    urlretrieve(url, partial_path)
    os.replace(partial_path, dump_path)

## Parse data

In [13]:
# patterns used by process_article; compiled once because the function is
# called for every page of the dump. Raw strings avoid invalid-escape warnings.
_PERCENT_RE = re.compile(r'\d\d?\d?%')      # e.g. "87%"
_SCORE_RE = re.compile(r'\d\.\d\/\d+|$')    # e.g. "7.5/10"; "|$" guarantees at least one (empty) match

def process_article(title, text):
    '''
    For each movie we want to extract the name, outgoing links
    and properties stored in the infobox.

    Parameters
    ----------
    title : str
        Title of the Wikipedia page.
    text : str
        Raw wikitext of the page.

    Returns
    -------
    tuple or None
        ``(title, properties, links, percentage, score)`` when the page contains an
        "Infobox film" template, otherwise ``None``.
        ``properties`` maps the non-empty infobox parameter names to their values,
        ``links`` lists the titles of all wikilinks on the page (duplicates kept).
        ``percentage`` and ``score`` come from the first paragraph that mentions
        "rotten tomatoes" and contains exactly one percentage; ``score`` is ``''``
        when that paragraph has no "x.y/z" pattern. Both are ``None`` when no
        such paragraph exists.
    '''
    wikicode = mwparserfromhell.parse(text)
    film = next((
        template
        for template in wikicode.filter_templates()
        if template.name.strip().lower() == 'infobox film')
    , None)
    if not film:
        # not a movie page: skip the (comparatively cheap) rating scan as well
        return None

    properties = {}
    for param in film.params:
        value = param.value.strip_code().strip()
        if value: # drop parameters that are empty after stripping the markup
            properties[param.name.strip_code().strip()] = value

    links = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]

    # paragraphs are separated by blank lines; take the first matching paragraph
    rating = (None, None)
    for paragraph in text.split('\n\n'):
        if paragraph.lower().find('rotten tomatoes') == -1:
            continue
        percentages = _PERCENT_RE.findall(paragraph)
        if len(percentages) == 1:
            rating = (percentages[0], _SCORE_RE.findall(paragraph)[0])
            break

    return (title, properties, links) + rating

In [15]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    '''
    For each <page> tag this collects the contents of the title and of the text into the
    self._values dictionary and calls process_article with the collected values.
    Every non-empty result of process_article is appended to self._movies.
    '''
    def __init__(self):
        super().__init__()
        self._buffer = None       # chunks of the element that is currently being read
        self._values = {}         # 'title' / 'text' of the page that is currently being read
        self._movies = []         # results of process_article for all pages seen so far
        self._current_tag = None  # 'title' or 'text' while inside that element, else None

    def characters(self, content):
        # only keep character data that belongs to a <title> or <text> element
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name == 'page':
            # start from a clean slate so a page never inherits values of the previous one
            self._values = {}
        elif name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._current_tag:
            # The parser may deliver the content of one element in several chunks,
            # split at arbitrary positions, so the chunks must be concatenated
            # without any separator.
            self._values[name] = ''.join(self._buffer)
            # stop buffering until the next <title>/<text> element opens
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            # skip pages where either the title or the text is missing
            if 'title' in self._values and 'text' in self._values:
                movie = process_article(**self._values)
                if movie:
                    self._movies.append(movie)

In [None]:
# feed the compressed dump into the parser:
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)

dump_path = './data/enwiki-20200820-pages-articles.xml.bz2'

# bzcat decompresses the dump on the fly and we stream its output line by line.
# The context manager closes the pipe and waits for the child process, so no
# file handle or zombie process is left behind.
stopped_early = False
with subprocess.Popen(['bzcat', dump_path], stdout = subprocess.PIPE) as bzcat:
    for line in bzcat.stdout:
        try:
            parser.feed(line)
        except StopIteration:
            # a content handler may raise StopIteration to end parsing early
            stopped_early = True
            break

# a failing bzcat (missing or corrupt file) would otherwise silently yield no movies
if not stopped_early and bzcat.returncode != 0:
    raise RuntimeError(f'bzcat exited with status {bzcat.returncode} for {dump_path}')

## Save data

In [18]:
# save as ndjson: one JSON-encoded movie record per line
with open('./data/wp_movies.ndjson', 'wt') as fout:
    fout.writelines(json.dumps(record) + '\n' for record in handler._movies)

## Create dataset from movie-link matches

In [2]:
# read the ndjson file back: every line holds one JSON-encoded movie record
with open('./data/wp_movies.ndjson') as file:
    movies = list(map(json.loads, file))

In [3]:
# count how often every link occurs over all movies
# (the outgoing links of a movie are stored as a list at index 2)
link_counts = Counter(
    link
    for record in movies
    for link in record[2]
)

link_counts.most_common(10)

[('Rotten Tomatoes', 4382),
 ('The New York Times', 3252),
 ('Category:American films', 3134),
 ('Variety (magazine)', 2921),
 ('Category:English-language films', 2905),
 ('Metacritic', 2178),
 ('Roger Ebert', 1863),
 ('Los Angeles Times', 1757),
 ('Box Office Mojo', 1756),
 ('American Film Institute', 1279)]

In [4]:
# keep only the links that occur at least 3 times
top_links = []
for link, count in link_counts.items():
    if count >= 3:
        top_links.append(link)

# vocabulary of links: link title -> position in top_links
link_to_idx = dict(zip(top_links, range(len(top_links))))

# vocabulary of movies: movie title -> position in movies
movie_to_idx = {record[0]: position for position, record in enumerate(movies)}

len(top_links), len(movie_to_idx)

(41105, 4443)

In [5]:
# build the positive examples: one (link index, movie index) pair for every
# outgoing link of every movie, as long as the link is part of the vocabulary
pairs = []
for record in movies:
    movie_idx = movie_to_idx[record[0]]
    for link in record[2]:
        link_idx = link_to_idx.get(link)
        if link_idx is not None: # links outside the vocabulary are skipped
            pairs.append((link_idx, movie_idx))

# the same pairs as a set, for fast membership tests
pairs_set = set(pairs)

# positives as a dataframe, labelled with target 1
pairs_df = pd.DataFrame(pairs).assign(target = 1)

pairs_df

Unnamed: 0,0,1,target
0,0,0,1
1,1,0,1
2,2,0,1
3,3,0,1
4,4,0,1
...,...,...,...
453827,17085,4442,1
453828,995,4442,1
453829,12778,4442,1
453830,126,4442,1


In [6]:
# 5 times more negative matches
# A dedicated, seeded generator makes the sampled negatives identical on every
# run (the train/val/test split below is seeded as well) without touching the
# global state of the random module.
rng = random.Random(2020)

n_links = len(top_links)
n_movies = len(movie_to_idx)
n_negatives = 5*len(pairs)

nonpairs = []
while len(nonpairs) < n_negatives:
    # generate random indices
    link_id = rng.randrange(n_links)
    movie_id = rng.randrange(n_movies)
    # keep the combination only if it is not a known match
    if (link_id, movie_id) not in pairs_set:
        nonpairs.append((link_id, movie_id))

# save also as a set for efficient look-up
nonpairs_set = set(nonpairs)

# create dataframe
nonpairs_df = pd.DataFrame(nonpairs)
nonpairs_df['target'] = -1

nonpairs_df

Unnamed: 0,0,1,target
0,31705,2509,-1
1,34568,2836,-1
2,38494,2077,-1
3,2419,3973,-1
4,24667,3795,-1
...,...,...,...
2269155,37615,1510,-1
2269156,4163,1968,-1
2269157,7403,3646,-1
2269158,18800,1941,-1


In [7]:
# stack positives on top of negatives and renumber the rows from 0
df = pd.concat(
    [pairs_df, nonpairs_df],
    axis = 0,
    ignore_index = True
)
df.columns = ['link_id', 'movie_id', 'target']

df

Unnamed: 0,link_id,movie_id,target
0,0,0,1
1,1,0,1
2,2,0,1
3,3,0,1
4,4,0,1
...,...,...,...
2722987,37615,1510,-1
2722988,4163,1968,-1
2722989,7403,3646,-1
2722990,18800,1941,-1


## Split data

In [8]:
# both splits are stratified on the target and use the same settings
split_settings = dict(test_size = 0.2, shuffle = True, random_state = 2020)

# 1. hold out 20% of all rows as test set
train_val_df, test_df = train_test_split(
    df,
    stratify = df['target'],
    **split_settings
)

# 2. hold out 20% of the remaining rows as validation set
train_df, val_df = train_test_split(
    train_val_df,
    stratify = train_val_df['target'],
    **split_settings
)

# renumber the rows of every part from 0
train_df, val_df, test_df = (
    part.reset_index(drop = True)
    for part in (train_df, val_df, test_df)
)

# separate the model inputs from the labels
input_columns = ['link_id', 'movie_id']

x_train, y_train = train_df[input_columns], train_df['target']
x_val, y_val = val_df[input_columns], val_df['target']
x_test, y_test = test_df[input_columns], test_df['target']

for part_name, part in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
    print(f'{part_name}: {part.shape}')

Train: (1742714, 3)
Val: (435679, 3)
Test: (544599, 3)


## Create dataloaders

In [9]:
# data loading
BATCH_SIZE = 64        # samples per batch of the train / validation loaders

# training schedule
NUM_EPOCHS = 15        # upper bound for the number of training epochs
PLATEAU_PATIENCE = 1   # patience passed to the ReduceLROnPlateau scheduler
EARLY_PATIENCE = 3     # epochs without a new best validation loss before early stopping

In [10]:
class MovieDataset(Dataset):
    '''
    Dataset of (link_id, movie_id) index pairs with an optional label per pair.

    Parameters
    ----------
    inputs : pandas.DataFrame
        Frame with the columns 'link_id' and 'movie_id'.
    targets : pandas.Series or array-like or None
        One label per row of `inputs`; pass None for unlabeled data.

    Samples are addressed by position (0 .. len - 1), independent of the index
    labels of `inputs` and `targets`.
    '''
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets
        # Plain numpy copies of the columns: positional indexing into an array is
        # far cheaper than a pandas look-up per sample, and it does not depend on
        # the frame having a 0..n-1 index.
        self._link_ids = inputs['link_id'].to_numpy()
        self._movie_ids = inputs['movie_id'].to_numpy()
        self._targets = None if targets is None else np.asarray(targets)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        x_1, x_2 = self._link_ids[index], self._movie_ids[index]

        if self._targets is None:
            return (x_1, x_2)

        # values keep the dtype of the source columns; the training loop casts
        # them to the dtypes it needs
        return (x_1, x_2), self._targets[index]

# wrap every split into a dataset ...
dataset_train, dataset_val, dataset_test = (
    MovieDataset(inputs, targets)
    for inputs, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

# ... and every dataset into a loader: train and validation are served in
# shuffled batches, the test set one sample at a time in its original order
train_gen = DataLoader(dataset_train, shuffle = True, batch_size = BATCH_SIZE)
val_gen = DataLoader(dataset_val, shuffle = True, batch_size = BATCH_SIZE)
test_gen = DataLoader(dataset_test, shuffle = False, batch_size = 1)

## Train embeddings using outgoing links

The model’s task is to determine whether a certain link can be found on the Wikipedia page of a movie, so we need to feed it labeled examples of matches and non‐matches ($y \in \{-1, 1\}$). We take both the link_id and the movie_id as a number and feed those into their respective embedding layers. Each embedding layer allocates a vector of size embedding_dim for each possible input. We use a cosine embedding loss (`nn.CosineEmbeddingLoss`), which is based on the cosine similarity, i.e. the normed dot product, of the two embedding vectors. The model will learn weights such that this normed dot product is close to the label: high for matches and low for non‐matches. This forces the network to project movies into a space such that similar movies end up in a similar location. We can use this space to find similar movies.

In [11]:
class MovieModel(nn.Module):
    '''
    Two embedding tables, one for movies and one for links.

    Parameters
    ----------
    embedding_dim : int
        Length of every embedding vector.
    num_movies : int or None
        Number of rows of the movie table; defaults to len(movie_to_idx).
    num_links : int or None
        Number of rows of the link table; defaults to len(top_links).
    '''
    def __init__(self, embedding_dim = 50, num_movies = None, num_links = None):
        super().__init__()
        # fall back to the vocabularies built earlier in the notebook
        if num_movies is None:
            num_movies = len(movie_to_idx)
        if num_links is None:
            num_links = len(top_links)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.link_embedding = nn.Embedding(num_links, embedding_dim)

    def forward(self, links, movies):
        '''
        Look up the embeddings of the given link and movie indices.

        Note the order: the arguments are (links, movies), but the result is
        (embedded_movies, embedded_links).
        '''
        embedded_movies = self.movie_embedding(movies)
        embedded_links = self.link_embedding(links)
        return embedded_movies, embedded_links

# train on the first GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model = MovieModel().to(device)
# labels are 1 for matches and -1 for non-matches
loss_fun = nn.CosineEmbeddingLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3, weight_decay = 5e-3)

print(model)

MovieModel(
  (movie_embedding): Embedding(4443, 50)
  (link_embedding): Embedding(41105, 50)
)


In [13]:
patience_counter = EARLY_PATIENCE
best_val_loss = float('inf') # any real loss is an improvement over this

# NOTE(review): `verbose` is deprecated in recent PyTorch releases -- confirm
# against the installed version.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode = 'min',
    patience = PLATEAU_PATIENCE,
    factor = 0.2,
    verbose = True
)

# torch.save does not create missing directories
os.makedirs('./models', exist_ok = True)

def run_epoch(data_gen, train):
    '''
    Run the model once over all batches of data_gen and return the list of batch losses.
    With train = True the parameters are updated after every batch, with
    train = False the model is only evaluated (no gradients are computed).
    '''
    model.train(train) # train mode or evaluation mode
    losses = []
    with torch.set_grad_enabled(train):
        for ((link_ids, movie_ids), targets) in tqdm(data_gen):
            link_ids = link_ids.to(device, dtype = torch.int64)
            movie_ids = movie_ids.to(device, dtype = torch.int64)
            targets = targets.to(device, dtype = torch.float32)
            if train:
                optimizer.zero_grad() # clear gradients
            # the model returns the movie embeddings first, then the link embeddings
            embedded_movies, embedded_links = model(link_ids, movie_ids)
            loss = loss_fun(embedded_movies, embedded_links, targets)
            losses.append(loss.item())
            if train:
                loss.backward() # compute gradient
                optimizer.step() # update parameters
    return losses

for i_epoch in range(NUM_EPOCHS):
    # ------------- Optimization on training data -------------
    train_losses = run_epoch(train_gen, train = True)

    # ------------- Evaluation on validation data -------------
    val_losses = run_epoch(val_gen, train = False)

    # ------------- Display progress -------------
    curr_val_loss = np.mean(val_losses)
    print(f'{i_epoch+1} | Train loss: {np.mean(train_losses):.4f} | Val loss: {curr_val_loss:.4f}')

    # ------------- Check learning plateau criterion -------------
    scheduler.step(curr_val_loss)

    # ------------- Check early stopping criterion -------------
    if curr_val_loss < best_val_loss:
        best_val_loss = curr_val_loss
        patience_counter = EARLY_PATIENCE # reset patience counter
        # checkpoint of the best model so far (the whole module is pickled)
        torch.save(model, './models/model.pth')
    else:
        patience_counter -= 1
        if patience_counter == 0:
            print('Early stopping')
            break

100%|██████████| 27230/27230 [01:45<00:00, 258.51it/s]
100%|██████████| 6808/6808 [00:10<00:00, 666.17it/s]
  0%|          | 1/27230 [00:00<48:08,  9.43it/s]

1 | Train loss: 0.2124 | Val loss: 0.2106


100%|██████████| 27230/27230 [01:43<00:00, 262.68it/s]
100%|██████████| 6808/6808 [00:11<00:00, 582.16it/s]
  0%|          | 1/27230 [00:00<59:39,  7.61it/s]

2 | Train loss: 0.1933 | Val loss: 0.2057


100%|██████████| 27230/27230 [01:46<00:00, 256.55it/s]
100%|██████████| 6808/6808 [00:11<00:00, 573.64it/s]
  0%|          | 1/27230 [00:00<55:32,  8.17it/s]

3 | Train loss: 0.1738 | Val loss: 0.1955


100%|██████████| 27230/27230 [01:47<00:00, 253.39it/s]
100%|██████████| 6808/6808 [00:11<00:00, 596.79it/s]
  0%|          | 1/27230 [00:00<46:33,  9.75it/s]

4 | Train loss: 0.1538 | Val loss: 0.1806


100%|██████████| 27230/27230 [01:44<00:00, 260.86it/s]
100%|██████████| 6808/6808 [00:11<00:00, 616.94it/s]
  0%|          | 1/27230 [00:00<51:08,  8.87it/s]

5 | Train loss: 0.1376 | Val loss: 0.1686


100%|██████████| 27230/27230 [01:44<00:00, 260.80it/s]
100%|██████████| 6808/6808 [00:11<00:00, 584.47it/s]
  0%|          | 1/27230 [00:00<49:38,  9.14it/s]

6 | Train loss: 0.1277 | Val loss: 0.1619


100%|██████████| 27230/27230 [01:48<00:00, 250.94it/s]
100%|██████████| 6808/6808 [00:11<00:00, 584.92it/s]
  0%|          | 1/27230 [00:00<54:19,  8.35it/s]

7 | Train loss: 0.1216 | Val loss: 0.1580


100%|██████████| 27230/27230 [01:45<00:00, 258.16it/s]
100%|██████████| 6808/6808 [00:12<00:00, 566.85it/s]
  0%|          | 1/27230 [00:00<46:46,  9.70it/s]

8 | Train loss: 0.1171 | Val loss: 0.1551


100%|██████████| 27230/27230 [01:49<00:00, 249.57it/s]
100%|██████████| 6808/6808 [00:12<00:00, 561.41it/s]
  0%|          | 1/27230 [00:00<54:19,  8.35it/s]

9 | Train loss: 0.1136 | Val loss: 0.1527


100%|██████████| 27230/27230 [01:49<00:00, 249.57it/s]
100%|██████████| 6808/6808 [00:13<00:00, 520.11it/s]
  0%|          | 1/27230 [00:00<1:07:16,  6.75it/s]

10 | Train loss: 0.1106 | Val loss: 0.1505


100%|██████████| 27230/27230 [01:51<00:00, 244.51it/s]
100%|██████████| 6808/6808 [00:12<00:00, 564.79it/s]
  0%|          | 1/27230 [00:00<52:07,  8.71it/s]

11 | Train loss: 0.1081 | Val loss: 0.1484


100%|██████████| 27230/27230 [01:49<00:00, 249.38it/s]
100%|██████████| 6808/6808 [00:11<00:00, 578.78it/s]
  0%|          | 1/27230 [00:00<53:36,  8.46it/s]

12 | Train loss: 0.1061 | Val loss: 0.1467


100%|██████████| 27230/27230 [01:43<00:00, 263.43it/s]
100%|██████████| 6808/6808 [00:11<00:00, 579.02it/s]
  0%|          | 1/27230 [00:00<49:27,  9.18it/s]

13 | Train loss: 0.1044 | Val loss: 0.1452


100%|██████████| 27230/27230 [01:45<00:00, 258.76it/s]
100%|██████████| 6808/6808 [00:11<00:00, 599.59it/s]
  0%|          | 1/27230 [00:00<50:24,  9.00it/s]

14 | Train loss: 0.1031 | Val loss: 0.1437


100%|██████████| 27230/27230 [01:48<00:00, 250.96it/s]
100%|██████████| 6808/6808 [00:12<00:00, 559.16it/s]

15 | Train loss: 0.1021 | Val loss: 0.1426





## Evaluate embeddings

In [14]:
# Scale every embedding vector to unit L2 length, so that a plain dot product
# between two rows equals their cosine similarity.

# movie embeddings: one row per movie
movie_weights = model.movie_embedding.weight.detach().cpu()
movie_norm = movie_weights.norm(p = 2, dim = 1, keepdim = True) # column of row lengths
normalized_movies = (movie_weights / movie_norm).numpy() # E / E.norm, broadcast over the columns

# link embeddings: one row per link
link_weights = model.link_embedding.weight.detach().cpu()
link_norm = link_weights.norm(p = 2, dim = 1, keepdim = True)
normalized_links = (link_weights / link_norm).numpy()

In [15]:
def similar_movies(movie, top_n = 10):
    '''
    Print the top_n movies whose embeddings are most similar to the given movie.

    One line is printed per movie (index, title, cosine similarity), ordered from
    most to least similar. The queried movie itself is part of the list.

    Parameters
    ----------
    movie : str
        Title of the movie; raises KeyError if it is not in movie_to_idx.
    top_n : int
        Number of movies to print.
    '''
    # cosine similarity of the input to all movies (the rows have unit length)
    cosine_sims = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    # indices ordered by decreasing similarity, cut to the top_n best
    # (slicing after the reversal also gives an empty result for top_n = 0)
    closest = np.argsort(cosine_sims)[::-1][:top_n]
    for i in closest:
        print(i, movies[i][0], cosine_sims[i])

def similar_links(link, top_n = 10):
    '''
    Print the top_n links whose embeddings are most similar to the given link.

    One line is printed per link (index, link title, cosine similarity), ordered
    from most to least similar. The queried link itself is part of the list.

    Parameters
    ----------
    link : str
        Title of the link; raises KeyError if it is not in link_to_idx.
    top_n : int
        Number of links to print.
    '''
    # cosine similarity of the input to all links (the rows have unit length)
    cosine_sims = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    # indices ordered by decreasing similarity, cut to the top_n best
    # (slicing after the reversal also gives an empty result for top_n = 0)
    closest = np.argsort(cosine_sims)[::-1][:top_n]
    for i in closest:
        print(i, top_links[i], cosine_sims[i])

In [16]:
similar_movies('Batman Forever')

11 Batman Forever 1.0
8 Batman (1989 film) 0.9592697
10 Batman Returns 0.9484402
2052 Batman Begins 0.87854695
1177 Ghostbusters 0.865731
2421 Ghostbusters II 0.86100507
1840 Contact (1997 American film) 0.8562398
624 Die Hard 0.8517697
1798 Hulk (film) 0.8444223
196 Sleepy Hollow (film) 0.8411691


In [17]:
similar_links('George Lucas')

4222 George Lucas 1.0
5848 John Williams 0.9548348
2368 Saturn Award for Best Music 0.9440633
567 Saturn Award for Best Science Fiction Film 0.9398098
5593 Saturn Award for Best Writing 0.9393413
3784 Saturn Award for Best Actor 0.9390022
466 Raiders of the Lost Ark 0.9385539
437 Hugo Award for Best Dramatic Presentation 0.93779016
566 Saturn Award for Best Director 0.9321122
1637 Jurassic Park (film) 0.9320031
