In [1]:
import os
import queue
import warnings

import gensim
import numpy as np
import pywikibot
warnings.filterwarnings('ignore')  # Silences ALL Python warnings (added to quiet gensim's), not errors

### Things I've installed:
1. wikipedia (query wikipedia) - not used
2. pywikibot (more advanced queries)
3. wikiutils (read sql.gz files) - not yet used. possibly in the future
4. gensim (for NLP and specifically using Google's word2vec)

### Load word2vec embeddings

In [2]:
# Location of the word2vec binary. Set the WORD2VEC_MODEL_PATH environment
# variable to point at your local copy; when it is unset, the original
# author's path is used as the default.
model_addr = os.environ.get(
    'WORD2VEC_MODEL_PATH',
    '/Users/benjaminrafetto/Code/cs182/project/data/GoogleNews-vectors-negative300.bin',
)

In [3]:
# Read the embeddings stored at model_addr (binary word2vec format).
model = gensim.models.KeyedVectors.load_word2vec_format(
    model_addr,
    binary=True,
)

### Crude definition of distance

In [4]:
# Currently uses average word distances from word2vec embeddings
def get_distance(topic, model, goal):
    """Return the embedding distance between a title and the goal word.

    If ``topic`` as a whole is in ``model``, its distance to ``goal`` is
    returned directly. Otherwise the title is split on single spaces and the
    distances of the words that are in ``model`` are averaged.

    Parameters
    ----------
    topic : str
        Title to score.
    model
        Object supporting ``word in model`` and ``model.distance(w1, w2)``.
    goal : str
        Target word. It is passed to ``model.distance`` without a membership
        check, so the caller is responsible for validating it.

    Returns
    -------
    float
        The (average) distance, or ``np.inf`` when neither the title nor any
        of its words is in ``model``.
    """
    if topic in model:
        return model.distance(goal, topic)

    distances = [model.distance(goal, word) for word in topic.split(' ') if word in model]
    # np.inf instead of the np.Infinity alias, which NumPy 2.0 removed.
    return np.average(distances) if distances else np.inf

### Connect to Wikipedia

In [5]:
# Site handle for English Wikipedia; pages are constructed from it below.
site = pywikibot.Site('en', 'wikipedia')

In [6]:
def greedy_word2vec_path(start, goal, maxIters=30, verbose=False):
    """Greedy best-first search over Wikipedia links, guided by word2vec distance.

    Starting from the page titled ``start``, repeatedly expands the queued
    page with the smallest ``get_distance`` to ``goal`` and queues every page
    it links to that has not been queued before. The search stops when a page
    whose title equals ``goal`` (case-insensitively) is expanded.

    Uses the notebook-level ``model`` and ``site`` objects.

    Parameters
    ----------
    start : str
        Title of the starting page; must be in ``model``.
    goal : str
        Title of the target page; must be in ``model``.
    maxIters : int, optional
        Maximum number of pages to expand before giving up.
    verbose : bool, optional
        If True, print each expanded page together with its priority.

    Returns
    -------
    list of str
        Titles of the pages in the order they were expanded, ending with the
        goal page. This is the exploration order, not necessarily a chain of
        directly linked pages.

    Raises
    ------
    AssertionError
        If ``start`` or ``goal`` is not in ``model``.
    RuntimeError
        If the goal was not expanded within ``maxIters`` iterations, or the
        fringe became empty first.
    """
    assert start in model and goal in model, f"Start and end nodes {start} and {goal} must be in word2vec vocabulary."
    start_page = pywikibot.Page(site, start)
    goal_key = goal.lower()
    path = []
    # Titles that have already been queued. A set gives O(1) membership tests,
    # and seeding it with the start title keeps the start page from being
    # queued a second time when another page links back to it.
    visited = {start_page.title()}
    fringe = queue.PriorityQueue()
    # np.inf instead of the np.Inf alias, which NumPy 2.0 removed.
    fringe.put((np.inf, start_page))

    i = 0
    while i < maxIters and not fringe.empty():
        i += 1
        priority, page = fringe.get()
        title = page.title()
        path.append(title)
        if verbose:
            print("Exploring node {} with distance {}".format(title, priority))
        if goal_key == title.lower():
            return path

        for linked in page.linkedPages():
            linked_title = linked.title()
            if linked_title not in visited:
                visited.add(linked_title)
                fringe.put((get_distance(linked_title, model, goal), linked))

    # RuntimeError is still caught by callers using `except Exception`.
    raise RuntimeError("Unable to find goal node.")

### Some example paths. Currently only supports start and goal nodes that are themselves in the word2vec vocabulary,
i.e. sentences don't work.

In [7]:
# (start, goal) title pairs to choose from.
examples = [
    ("speech", "lacrosse"),
    ("mantra", "dna"),
    ("Parthenon", "Environment"),  # "Natural Environment"
    ("Feces", "Poet"),
    # Disabled pairs:
    # ("penguin", "sans-serif"),
    # ("angelina jolie", "nitrogen"),
    # ("Carrie Fisher", "Death of Adolf Hitler"),
]

In [8]:
# Pick one (start, goal) pair at random; no seed is set, so the pick varies between runs.
pair_index = np.random.choice(len(examples))
start, goal = examples[pair_index]
print(f"Searching for shortest path from {start} to {goal}")

Searching for shortest path from mantra to dna


In [9]:
# Run the search on the selected pair; the returned list of titles is the cell output.
greedy_word2vec_path(start=start, goal=goal, maxIters=50)

['Mantra',
 'Rigveda',
 'Brahmanda Purana',
 'Brihaddharma Purana',
 'Devi-Bhagavata Purana',
 'Brahmavaivarta Purana',
 'Mudgala Purana',
 'Naradiya Purana',
 'Purana',
 'Category:Redirects to plurals',
 'Shivarahasya Purana',
 'Vishnudharmottara Purana',
 'PubMed Identifier',
 'PubMed',
 'Entrez',
 'PubChem',
 'GenBank',
 'RefSeq',
 'Ensembl',
 'Geneious',
 'Sequence database',
 'Biological database',
 'Biobank',
 'Caenorhabditis briggsae',
 'Caenorhabditis',
 'Caenorhabditis afra',
 'Caenorhabditis angaria',
 'Caenorhabditis brenneri',
 'Caenorhabditis castelli',
 'Caenorhabditis doughertyi',
 'Caenorhabditis drosophilae',
 'Caenorhabditis guadeloupensis',
 'Caenorhabditis imperialis',
 'Caenorhabditis inopinata',
 'Caenorhabditis latens',
 'Caenorhabditis macrosperma',
 'Caenorhabditis monodelphis',
 'Caenorhabditis nigoni',
 'Caenorhabditis nouraguensis',
 'Caenorhabditis plicata',
 'Caenorhabditis portoensis',
 'Caenorhabditis remanei',
 'Caenorhabditis sinica',
 'Caenorhabditis 