# Preliminary Scripts for Doc2Vec Algorithm
Author: Brandon Fan

In [1]:
# import packages
import os
import json

## Load Data

In [2]:
bible_data = json.load(open('../bible-files/english-web-bible.json', encoding='utf-8-sig'))

In [3]:
verse_data = []
for book in bible_data:
    for chapter in book['data']:
        for verses in chapter['verses']:
           verse_data.append(verses)

In [4]:
for val in verse_data:
    if val['text'] is None:
        print(val)
        

## Preprocess Text

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [6]:
stopwords = set(stopwords.words('english'))
exclude = set(string.punctuation)
def tokenize_data(verse_data):
    for verse in verse_data:
        text = ''.join(ch for ch in verse['text'] if ch not in exclude)
        tokenized_text = word_tokenize(text)
        final_text = []
        for val in tokenized_text:
            if val not in stopwords:
                if 'Yahweh' in val:
                    val = val.replace('Yahweh', 'God')
                final_text.append(val)
        verse['tokenized_text'] = final_text
tokenize_data(verse_data)

## Doc2Vec

In [7]:
import gensim

Using TensorFlow backend.


In [8]:
def create_corpus(verse_tokenized_data, use_nltk=False):
    for verse in verse_data:
        if use_nltk:
            words = verse['tokenized_text']
            yield gensim.models.doc2vec.TaggedDocument(words, [verse['verse']])
        else:
            words = verse['text']
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(words), [verse['verse']])

In [9]:
tagged_docs = list(create_corpus(verse_data, use_nltk=False))

In [10]:
model = gensim.models.Doc2Vec(size=20, iter=100, workers=11, min_count=4, window=4, alpha=0.0001, min_alpha=1e-6)

In [11]:
model.build_vocab(tagged_docs)

In [12]:
# %time model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.iter)
# model.save('doc2vec.model')

In [14]:
model = gensim.models.Doc2Vec.load('doc2vec.model')

In [22]:
similar = model.docvecs.most_similar([model.infer_vector(verse_data[151]['text'])], topn=10)
similar

[('Exodus 22:8', 0.8269375562667847),
 ('Micah 5:3', 0.8139984011650085),
 ('Nehemiah 9:14', 0.7705897092819214),
 ('Isaiah 28:14', 0.7628965973854065),
 ('John 13:17', 0.7604496479034424),
 ('Psalms 31:7', 0.7244011163711548),
 ('Acts 1:1', 0.7243518233299255),
 ('Deuteronomy 10:10', 0.7184383273124695),
 ('Proverbs 8:26', 0.7064456939697266),
 ('Judges 21:18', 0.6994155645370483)]

In [23]:
similar_tags = [val[0] for val in similar]
print('Actual: ' + verse_data[151]['text'] + '\n')
for tag in similar_tags:
    for verse in verse_data:
        if verse['verse'] == tag:
            print(tag + ':', verse['text'] + '\n\n')

Actual: Make a ship of gopher wood. You shall make rooms in the ship, and shall seal it inside and outside with pitch.

Exodus 22:8: If the thief isn’t found, then the master of the house shall come near to God, to find out if he hasn’t put his hand to his neighbor’s goods.


Micah 5:3: Therefore he will abandon them until the time that she who is in labor gives birth.
Then the rest of his brothers will return to the children of Israel.


Nehemiah 9:14: and made known to them your holy Sabbath, and commanded them commandments, statutes, and a law, by Moses your servant,


Isaiah 28:14: Therefore hear Yahweh’s word, you scoffers, that rule this people in Jerusalem:


John 13:17: If you know these things, blessed are you if you do them.


Psalms 31:7: I will be glad and rejoice in your loving kindness,
for you have seen my affliction.
You have known my soul in adversities.


Acts 1:1: The first book I wrote, Theophilus, concerned all that Jesus began both to do and to teach,


Deuteronom

## TF-IDF and Cosine Similarity

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [25]:
vectorizer = TfidfVectorizer(stop_words='english')

In [26]:
term_idf = vectorizer.fit_transform([verse['text'] for verse in verse_data])

In [27]:
cosine_similarity_matrix = cosine_similarity(term_idf)

In [28]:
x = list(reversed(np.argsort(cosine_similarity_matrix[0])))
print('Actual: ' + verse_data[0]['text'], '\n\nClosest: ' + verse_data[x[1]]['text'])
score = list(reversed(sorted(cosine_similarity_matrix[0])))[1]
print('Score: ' + str(score))

Actual: In the beginning, God created the heavens and the earth. 

Closest: This is the history of the generations of the heavens and of the earth when they were created, in the day that Yahweh God made the earth and the heavens.
Score: 0.678020989269


## Combined

In [33]:
def get_similar_values(tokenized_text, total_values=10):
    inferred_vector = model.infer_vector(tokenized_text)
    most_similar = model.docvecs.most_similar([inferred_vector], topn=total_values + 10)
    text = [''.join(tokenized_text)]
    for verse_name, _ in most_similar:
        for verse in verse_data:
            if verse['verse'] == verse_name:
                text.append(verse['text'])
    vectorizer = TfidfVectorizer(stop_words='english')
    tf_idf_transform = vectorizer.fit_transform(text)
    cos_sim_matrix = cosine_similarity(tf_idf_transform)
    sim_text = cos_sim_matrix[0][1:]
    final_text = []
    final_indices = list(reversed(np.argsort(sim_text)))[:10]
    for index in final_indices:
        final_text.append(text[index])
    return final_text

In [34]:
results = get_similar_values(verse_data[200]['text'])
print('Actual: ' + verse_data[200]['text'] + '\n')
for val in results:
    print(val + '\n\n')

<class 'str'>
Actual: Bring out with you every living thing that is with you of all flesh, including birds, livestock, and every creeping thing that creeps on the earth, that they may breed abundantly in the earth, and be fruitful, and multiply on the earth.”

Is any among you suffering? Let him pray. Is any cheerful? Let him sing praises.


Then you will begin to say, ‘We ate and drank in your presence, and you taught in our streets.’


The kingdom and the dominion, and the greatness of the kingdoms under the whole sky, will be given to the people of the saints of the Most High. His kingdom is an everlasting kingdom, and all dominions will serve and obey him.’


Also Bakbukiah and Unno, their brothers, were close to them according to their offices.


For you have delivered my soul from death,
and prevented my feet from falling,
that I may walk before God in the light of the living.


(and he commanded them to teach the children of Judah the song of the bow; behold, it is written in th