The data file `terms.tsv` holds 10K rows taken from a much larger file,
representing the keyphrases of 843 unique "documents".
Realistically, you want many more "documents" in a Word2Vec model
before its results begin to make sense.

This is enough to show how to call the functions from `gensim`.

In [None]:
import csv
import gensim
import logging
import sys

# path the trained Word2Vec model is saved to
model_file = "model.dat"
# tab-separated input file; each row is read as (term, chap, rank)
term_path = "terms.tsv"

In [None]:
# Build one "sentence" (a list of terms) per run of consecutive rows that
# share the same document id (`chap`), in file order.
sentences = []
sent = []
last_chap = None

# the csv module asks for newline="" so that newlines embedded in quoted
# fields are handled by the reader itself
with open(term_path, newline="") as f:
    # each row is (term, chap, rank); rank is not used for grouping
    for term, chap, _rank in csv.reader(f, delimiter="\t"):
        if chap != last_chap:
            # flush on the accumulated terms rather than on the truthiness
            # of the previous id, so an empty-string id still closes its
            # sentence instead of leaking terms into the next document
            if sent:
                sentences.append(sent)
                sent = []

            last_chap = chap

        sent.append(term)

# handle the dangling last document; skipped when the file had no rows,
# so an empty input no longer produces a bogus empty sentence
if sent:
    sentences.append(sent)

print(len(sentences))

In [None]:
# set up logging, train word2vec on the sentences
# NOTE: every %(field) needs its conversion type; a trailing "%(message)"
# without the "s" makes the formatter fail with "incomplete format" as
# soon as a record is actually emitted
FORMAT = "%(asctime)s : %(levelname)s : %(message)s"
logging.basicConfig(format=FORMAT, level=logging.ERROR)

# train a model, then save it
# min_count=1 keeps even terms that occur only once in the corpus
model = gensim.models.Word2Vec(sentences, min_count=1)
model.save(model_file)

In [None]:
def get_synset(model, query, topn=10):
    """Return the terms most similar to *query*, best match first.

    Args:
        model: a trained gensim ``Word2Vec`` model, or any object that
            exposes ``most_similar(positive=..., topn=...)`` either
            directly or through a ``wv`` attribute.
        query: the term to look up.
        topn: maximum number of neighbours to return.

    Returns:
        A list of ``(term, similarity)`` pairs sorted by descending
        similarity, or an empty list when *query* is not in the
        vocabulary (the lookup raised ``KeyError``).
    """
    # gensim 4 removed Word2Vec.most_similar; the lookup lives on the
    # model's keyed vectors (`model.wv`). Fall back to the object itself
    # so a bare KeyedVectors instance can be passed as well.
    vectors = getattr(model, "wv", model)

    try:
        neighbours = vectors.most_similar(positive=[query], topn=topn)
    except KeyError:
        return []

    return sorted(neighbours, key=lambda pair: pair[1], reverse=True)

In [None]:
# if you need to load a trained model:
# model = gensim.models.Word2Vec.load(model_file)

# query the model through a mini REPL;
# a blank line, end-of-input, or an interrupt ends the session
while True:
    try:
        query = input("\nquery? ").strip()
    except (EOFError, KeyboardInterrupt):
        break

    if not query:
        break

    synset = get_synset(model, query, topn=10)

    # get_synset() turns an unknown term into an empty list instead of
    # raising KeyError, so test the result rather than catching KeyError
    if synset:
        print("most similar to", query, ":", synset)
    else:
        print("not found")