In [1]:
import pywikibot
import queue
import gensim
import numpy as np
import warnings
warnings.filterwarnings('ignore')  # Suppresses gensim errors

### Things I've installed:
1. wikipedia (query wikipedia) - not used
2. pywikibot (more advanced queries)
3. wikiutils (read sql.gz files) - not yet used. possibly in the future
4. gensim (for NLP and specifically using Google's word2vec)

### Load word2vec embeddings

In [2]:
# Obviously override this to your local location
model_addr = '/Users/benjaminrafetto/Code/cs182/project/data/GoogleNews-vectors-negative300.bin'

# claire's address 
# model_addr = '/Users/clairestolz/CS182/CS182-Final-Project/data/GoogleNews-vectors-negative300.bin'

In [3]:
model = gensim.models.KeyedVectors.load_word2vec_format(model_addr, binary=True)  

### Crude definition of distance

In [15]:
# Currently uses average word distances from word2vec embeddings
def get_distance_new(topic, model, goal):
    assert type(topic) is list and type(goal) is list
    try:
        distances = [model.distance(x, y) for x in topic for y in goal]
        return (np.average(distances) + np.min(distances) / 2.0)  # Combination of average and minimum
    except:
        return np.Infinity


### Connect to Wikipedia

In [6]:
site = pywikibot.Site("en", "wikipedia")

In [27]:
def greedy_word2vec_path(start, goal, maxIters=30, verbose=False):
    start_list = process_word(start)
    goal_list = process_word(goal)
    assert get_distance_new(start_list, model, goal_list) is not np.inf, "Start and end nodes {} and {} must be in word2vec vocabulary.".format(start, goal)
    
    start_page = pywikibot.Page(site, start)
    path = []
    visited = []
    fringe = queue.PriorityQueue()
    fringe.put((np.Inf, start_page))
    
    i = 0
    while i < maxIters and not fringe.empty():
        i += 1
        priority, page = fringe.get()
        path.append(page.title())
        if verbose:
            print("Exploring node {} with distance {}".format(page.title(), priority))
        if goal.lower() == page.title().lower():
            return path

        for p in page.linkedPages():
            if p.title() not in visited:
                visited.append(p.title())
                processed = process_word(p.title())
#                 print(p.title(), processed)
                distance = get_distance_new(processed, model, goal_list)
                fringe.put((distance, p))

    raise Exception("Unable to find goal node.")

# Some example paths. Currently only supports start and goal nodes that are specifically in word2vec.
i.e. Sentences don't work

In [95]:
t1 = "Natural Environment"
t2 = "Angelina Jolie"
t3 = "Carrie Fisher"
t4 = "Death of Adolf Hitler"
t5 = t4.split(' ')
test = [t5[:1], t5[1:]]

In [102]:
examples = [("speech", "lacrosse"),
            ("mantra", "dna"),
            ("Parthenon", "Natural Environment"),
            ("Feces", "Poet"),
#             ("penguin", "sans-serif"),  #sans-serif is not in the dictionary
            ("angelina jolie", "nitrogen"),
            ("Carrie Fisher", "Death of Adolf Hitler"),
            ("Lacrosse", "Comedian"),
            ("Dictionary", "Atmosphere of Earth"),
            ("Broadway theatre", "Wall Street"),
            ("Life expectancy", "Graphical User Interface"),
            ("Diazepam", "Death"),
            ("Moors", "Aryan"),
            ("Michelangelo", "Horror Fiction"),
           ]

In [37]:
start, goal = examples[np.random.choice(len(examples))]
print("Searching for shortest path from {} to {}".format(start,goal))

Searching for shortest path from Parthenon to Natural Environment


In [105]:
for start, goal in examples[-5:]:
    print(greedy_word2vec_path(start, goal, maxIters=50, verbose=False))

Splitting at 1 ['Atmosphere', 'of Earth']
Processed [['Atmosphere'], ['of', 'Earth']]
Splitting at 2 ['Atmosphere of', 'Earth']
Processed [['Atmosphere', 'of'], ['Earth']]
Combinations [[['Atmosphere'], ['of', 'Earth']], [['Atmosphere', 'of'], ['Earth']]]
Best [['Atmosphere'], ['of', 'Earth']]


AssertionError: Start and end nodes Dictionary and Atmosphere of Earth must be in word2vec vocabulary.

In [104]:
# Hideously ugly code.

def process_word(topic, model=model):
    return unembed(process_word_rec(topic, model))

def unembed(list_of_lists):
    results = []
    for l in list_of_lists:
        if type(l) is list:
            results += [l2 for l2 in unembed(l)]
        else:
            results.append(l)
    return results

def process_word_rec(topic, model):
    if topic in model:
        return [topic]
    if topic.replace(' ', '_') in model:
        return [topic.replace(' ', '_')]
    # Recursively split phrases into longest subphrases that are in our model
    words = topic.split(' ')
    if len(words) == 1:
        return []
    if len(words) == 2:
        return words
    
    combinations = []
    # If too long for the exponential approach just split in half:
    split_range = [len(words) // 2] if len(words) > 6 else range(1, len(words))
    for i in split_range:
        two = [' '.join(w) for w in [words[:i], words[i:]]]
        print("Splitting at {}".format(i), two)
        two_processed = [process_word_rec(t, model) for t in two]
        combinations.append(two_processed)
        print("Processed", two_processed)
    
    best = min(combinations, key=lambda x: len(x))
    print("Combinations", combinations)
    print("Best", best)
    return min(combinations, key=lambda x: len(x))

In [240]:
def process_word_rec(topic, model=model):
#     print("Topic:", topic)
    if not topic:  #No topic. Stop.
        return []
    
    if topic in model:
        return [topic]

    output = []
    words = topic.split(' ')

    for j in range(len(words), 1, -1):
        test = '_'.join(words[:j])
        if test in model:
            return [test] + process_word_rec(' '.join(words[j:]))

    if words[0] in model:
        output.append(words[0])
    return output + process_word_rec(' '.join(words[1:]))