# Preliminary Scripts for Doc2Vec Algorithm
Author: Brandon Fan

In [1]:
# import packages
import os
import json

## Load Data

In [2]:
bible_data = json.load(open('../bible-files/english-web-bible.json', encoding='utf-8-sig'))

In [3]:
verse_data = []
for book in bible_data:
    for chapter in book['data']:
        for verses in chapter['verses']:
           verse_data.append(verses)

In [4]:
for val in verse_data:
    if val['text'] is None:
        print(val)
        

## Preprocess Text

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [6]:
stopwords = set(stopwords.words('english'))
exclude = set(string.punctuation)
def tokenize_data(verse_data):
    for verse in verse_data:
        text = ''.join(ch for ch in verse['text'] if ch not in exclude)
        tokenized_text = word_tokenize(text)
        final_text = []
        for val in tokenized_text:
            if val not in stopwords:
                if 'Yahweh' in val:
                    val = val.replace('Yahweh', 'God')
                final_text.append(val)
        verse['tokenized_text'] = final_text
tokenize_data(verse_data)

## Doc2Vec

In [7]:
import gensim

Using TensorFlow backend.


In [8]:
def create_corpus(verse_tokenized_data, use_nltk=False):
    for verse in verse_data:
        if use_nltk:
            words = verse['tokenized_text']
            yield gensim.models.doc2vec.TaggedDocument(words, [verse['verse']])
        else:
            words = verse['text']
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(words), [verse['verse']])

In [9]:
tagged_docs = list(create_corpus(verse_data, use_nltk=False))

In [10]:
model = gensim.models.Doc2Vec(size=20, iter=100, workers=11, min_count=4, window=4, alpha=0.0001, min_alpha=1e-6)

In [11]:
model.build_vocab(tagged_docs)

In [12]:
%time model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.iter)
model.save('doc2vec.model')

Wall time: 1min 47s


In [13]:
similar = model.docvecs.most_similar([model.infer_vector(verse_data[151]['text'])], topn=10)
similar

[('1 John 2:13', 0.7930722832679749),
 ('John 19:13', 0.7754273414611816),
 ('1 Timothy 4:14', 0.7729899883270264),
 ('Hebrews 5:12', 0.7543457746505737),
 ('1 Corinthians 15:24', 0.754139244556427),
 ('Psalms 119:86', 0.7347621321678162),
 ('Luke 3:10', 0.7343525886535645),
 ('Ezekiel 20:3', 0.7325412034988403),
 ('Jeremiah 49:17', 0.7265909910202026),
 ('Nehemiah 12:19', 0.7236027121543884)]

In [14]:
similar_tags = [val[0] for val in similar]
print('Actual: ' + verse_data[151]['text'] + '\n')
for tag in similar_tags:
    for verse in verse_data:
        if verse['verse'] == tag:
            print(tag + ':', verse['text'] + '\n\n')

Actual: Make a ship of gopher wood. You shall make rooms in the ship, and shall seal it inside and outside with pitch.

1 John 2:13: I write to you, fathers, because you know him who is from the beginning.
I write to you, young men, because you have overcome the evil one.
I write to you, little children, because you know the Father.


John 19:13: When Pilate therefore heard these words, he brought Jesus out, and sat down on the judgment seat at a place called “The Pavement”, but in Hebrew, “Gabbatha.”


1 Timothy 4:14: Don’t neglect the gift that is in you, which was given to you by prophecy, with the laying on of the hands of the elders.


Hebrews 5:12: For although by this time you should be teachers, you again need to have someone teach you the rudiments of the first principles of the revelations of God. You have come to need milk, and not solid food.


1 Corinthians 15:24: Then the end comes, when he will deliver up the Kingdom to God, even the Father; when he will have abolished a

## TF-IDF and Cosine Similarity

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [16]:
vectorizer = TfidfVectorizer(stop_words='english')

In [17]:
term_idf = vectorizer.fit_transform([verse['text'] for verse in verse_data])

In [18]:
cosine_similarity_matrix = cosine_similarity(term_idf)

In [19]:
x = list(reversed(np.argsort(cosine_similarity_matrix[0])))
print('Actual: ' + verse_data[0]['text'], '\n\nClosest: ' + verse_data[x[1]]['text'])
score = list(reversed(sorted(cosine_similarity_matrix[0])))[1]
print('Score: ' + str(score))

Actual: In the beginning, God created the heavens and the earth. 

Closest: This is the history of the generations of the heavens and of the earth when they were created, in the day that Yahweh God made the earth and the heavens.
Score: 0.678020989269


## Combined

In [20]:
def get_similar_values(tokenized_text, total_values=10):
    inferred_vector = model.infer_vector(tokenized_text)
    most_similar = model.docvecs.most_similar([inferred_vector], topn=total_values + 10)
    text = [''.join(tokenized_text)]
    for verse_name, _ in most_similar:
        for verse in verse_data:
            if verse['verse'] == verse_name:
                text.append(verse['text'])
    vectorizer = TfidfVectorizer(stop_words='english')
    tf_idf_transform = vectorizer.fit_transform(text)
    cos_sim_matrix = cosine_similarity(tf_idf_transform)
    sim_text = cos_sim_matrix[0][1:]
    final_text = []
    final_indices = list(reversed(np.argsort(sim_text)))[:10]
    for index in final_indices:
        final_text.append(text[index])
    return final_text

In [21]:
results = get_similar_values(verse_data[151]['text'])
print('Actual: ' + verse_data[151]['text'])
for val in results:
    print(val + '\n\n')

Actual: Make a ship of gopher wood. You shall make rooms in the ship, and shall seal it inside and outside with pitch.
for if God didn’t spare the natural branches, neither will he spare you.


He was despised,
and rejected by men;
a man of suffering,
and acquainted with disease.
He was despised as one from whom men hide their face;
and we didn’t respect him.


“Son of man, speak to the elders of Israel, and tell them, ‘Thus says the Lord Yahweh: “Is it to inquire of me that you have come? As I live,” says the Lord Yahweh, “I will not be inquired of by you.”’


I write to you, fathers, because you know him who is from the beginning.
I write to you, young men, because you have overcome the evil one.
I write to you, little children, because you know the Father.


When Pilate therefore heard these words, he brought Jesus out, and sat down on the judgment seat at a place called “The Pavement”, but in Hebrew, “Gabbatha.”


Don’t neglect the gift that is in you, which was given to you by pro