In [2]:
import os
import queue
import warnings

import gensim
import numpy as np
import pywikibot
warnings.filterwarnings('ignore')  # Silences ALL Python warnings (added to hide gensim's); errors still raise

### Things I've installed:
1. wikipedia (queries Wikipedia) - not used
2. pywikibot (more advanced Wikipedia queries)
3. wikiutils (reads sql.gz files) - not yet used; possibly in the future
4. gensim (for NLP, specifically for using Google's word2vec)

### Load word2vec embeddings

In [3]:
# Location of the pretrained GoogleNews word2vec binary.
# Set the WORD2VEC_MODEL_PATH environment variable to point at your local copy;
# when it is unset, the hard-coded default below is used.
#model_addr = '/Users/benjaminrafetto/Code/cs182/project/data/GoogleNews-vectors-negative300.bin'

# claire's address (default)
model_addr = os.environ.get(
    'WORD2VEC_MODEL_PATH',
    '/Users/clairestolz/CS182/CS182-Final-Project/data/GoogleNews-vectors-negative300.bin',
)

In [4]:
# Load the embeddings stored at model_addr (binary word2vec format) into `model`,
# the module-level object the distance and search functions below read from.
model = gensim.models.KeyedVectors.load_word2vec_format(model_addr, binary=True)  

### Crude definition of distance

In [5]:
# Currently uses average word distances from word2vec embeddings
def get_distance(topic, model, goal):
    """Score how far a page title is from the goal word using word2vec.

    If the whole title is in the vocabulary, its distance to ``goal`` is
    returned directly. Otherwise the title is split on single spaces and the
    distances of the words that are in the vocabulary are averaged.

    Args:
        topic: Page title to score.
        model: Object supporting ``word in model`` and
            ``model.distance(w1, w2)``.
        goal: Target word. It is passed straight to ``model.distance``, so
            callers are expected to make sure it is in the vocabulary.

    Returns:
        The (average) distance, or ``inf`` when neither the title nor any of
        its words is in the vocabulary.
    """
    if topic in model:
        return model.distance(goal, topic)

    distances = [model.distance(goal, w) for w in topic.split(' ') if w in model]
    # np.Infinity was removed in NumPy 2.0; np.inf works on every NumPy version.
    return np.average(distances) if distances else np.inf

### Connect to Wikipedia

In [6]:
# Module-level pywikibot site object ("en" / "wikipedia"); the search functions
# below pass it to pywikibot.Page to look up the start page.
site = pywikibot.Site("en", "wikipedia")

In [8]:
def greedy_word2vec_path(start, goal, maxIters=30, verbose=False):
    """Greedy best-first search from ``start`` to ``goal`` over page links.

    Pages are expanded in order of increasing ``get_distance`` between their
    title and ``goal``. Relies on the module-level ``model`` and ``site``.

    Args:
        start: Title of the start page; must be in the word2vec vocabulary.
        goal: Title of the goal page; must be in the word2vec vocabulary.
            Matched against page titles case-insensitively.
        maxIters: Maximum number of pages to expand before giving up.
        verbose: If True, print each page as it is expanded.

    Returns:
        List of page titles in the order they were expanded, ending with the
        goal page. NOTE: this is the exploration order, so consecutive
        entries are not guaranteed to be directly linked.

    Raises:
        AssertionError: If ``start`` or ``goal`` is not in the vocabulary.
        Exception: If the goal was not reached within ``maxIters`` expansions
            or the fringe ran empty.
    """
    assert start in model and goal in model, "Start and end nodes {} and {} must be in word2vec vocabulary.".format(start,goal)
    start_page = pywikibot.Page(site, start)
    goal_key = goal.lower()
    path = []
    # A set gives O(1) membership tests. Seeding it with the start title keeps
    # a link back to the start page from re-queueing it.
    visited = {start_page.title()}
    fringe = queue.PriorityQueue()
    # np.Inf was removed in NumPy 2.0; np.inf works on every NumPy version.
    fringe.put((np.inf, start_page))

    i = 0
    while i < maxIters and not fringe.empty():
        i += 1
        priority, page = fringe.get()
        title = page.title()
        path.append(title)
        if verbose:
            print("Exploring node {} with distance {}".format(title, priority))
        if goal_key == title.lower():
            return path

        for p in page.linkedPages():
            linked_title = p.title()
            if linked_title in visited:
                continue
            visited.add(linked_title)
            if linked_title.lower() == goal_key:
                # The goal test above is case-insensitive, but word2vec lookups
                # are not ("DNA" vs "dna"). Give the goal page the best
                # priority so it is expanded next instead of being outranked.
                distance = 0.0
            else:
                distance = get_distance(linked_title, model, goal)
            fringe.put((distance, p))

    raise Exception("Unable to find goal node.")

### Some example paths. Currently only supports start and goal nodes that are themselves entries in the word2vec vocabulary,
i.e. sentences don't work.

In [9]:
# (start, goal) title pairs to try with the search functions.
# NOTE(review): the commented-out pairs are presumably disabled because their
# titles are not in the word2vec vocabulary (see the assert in the search
# functions) — confirm before re-enabling.
examples = [("speech", "lacrosse"),
            ("mantra", "dna"),
            ("Parthenon", "Environment"),  #"Natural Environment"
            ("Feces", "Poet")
#             ("penguin", "sans-serif"),
#             ("angelina jolie", "nitrogen"),
#             ("Carrie Fisher", "Death of Adolf Hitler"),
           ]

In [12]:
# Draw one (start, goal) pair at random from `examples` (unseeded, so the
# pair changes between runs) and announce it.
pair_index = np.random.choice(len(examples))
chosen_pair = examples[pair_index]
start, goal = chosen_pair
print("Searching for shortest path from {} to {}".format(start, goal))

Searching for shortest path from speech to lacrosse


In [13]:
# Run the greedy search on the sampled pair, expanding at most 50 pages;
# the returned list of titles is displayed as the cell output.
greedy_word2vec_path(start, goal, maxIters=50)

['Speech',
 'Grammar',
 'Grammar school',
 'Yeshiva',
 'One-room school',
 'State-integrated school',
 'Ungraded school',
 'University-preparatory school',
 'College-preparatory school',
 'College Prep',
 'Talk:Ungraded school',
 'College',
 'College football',
 'College lacrosse',
 'Lacrosse']

In [None]:
# BFS search to get a baseline for performance 
def bfs_wiki(start, goal, maxDepth=30, verbose=False):
    """Breadth-first search from ``start`` to ``goal`` over page links.

    Uninformed baseline for the greedy word2vec search: pages are expanded in
    FIFO order, level by level, without using word2vec distances. Relies on
    the module-level ``model`` and ``site``.

    Args:
        start: Title of the start page.
        goal: Title of the goal page, matched against page titles
            case-insensitively.
        maxDepth: Maximum number of links to follow away from the start page.
        verbose: If True, print each page as it is expanded.

    Returns:
        List of page titles from the start page to the goal page, where each
        title was found among the links of the one before it. Because pages
        are expanded level by level, no path with fewer links exists among
        the pages explored.

    Raises:
        AssertionError: If ``start`` or ``goal`` is not in the vocabulary.
        Exception: If the goal is not found within ``maxDepth`` links.
    """
    # BFS itself does not need word2vec; the check is kept so the baseline
    # accepts exactly the same inputs as the greedy search it is compared to.
    assert start in model and goal in model, "Start and end nodes {} and {} must be in word2vec vocabulary.".format(start,goal)
    start_page = pywikibot.Page(site, start)
    start_title = start_page.title()
    goal_key = goal.lower()
    if start_title.lower() == goal_key:
        return [start_title]

    # Maps each discovered title to the title it was reached from. It doubles
    # as the visited set and lets the path be rebuilt once the goal is found.
    parents = {start_title: None}
    fringe = queue.Queue()  # FIFO order is what makes this breadth-first
    fringe.put((0, start_page))

    while not fringe.empty():
        depth, page = fringe.get()
        if depth >= maxDepth:
            # Depths leave the FIFO queue in non-decreasing order, so every
            # remaining entry is at least this deep as well.
            break
        title = page.title()
        if verbose:
            print("Exploring node {} at depth {}".format(title, depth))

        for p in page.linkedPages():
            linked_title = p.title()
            if linked_title in parents:
                continue
            parents[linked_title] = title
            if linked_title.lower() == goal_key:
                # Testing at discovery time avoids expanding a whole extra
                # level. Walk the parent links back to the start page.
                path = [linked_title]
                while parents[path[-1]] is not None:
                    path.append(parents[path[-1]])
                path.reverse()
                return path
            fringe.put((depth + 1, p))

    raise Exception("Unable to find goal node.")