# Preliminary Scripts for Doc2Vec Algorithm
Author: Brandon Fan

In [1]:
# import packages
import os
import json

## Load Data

In [2]:
bible_data = json.load(open('../bible-files/english-web-bible.json', encoding='utf-8-sig'))

In [3]:
verse_data = []
for book in bible_data:
    for chapter in book['data']:
        for verses in chapter['verses']:
           verse_data.append(verses)

In [4]:
for val in verse_data:
    if val['text'] is None:
        print(val)
        

## Preprocess Text

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [6]:
stopwords = set(stopwords.words('english'))
exclude = set(string.punctuation)
def tokenize_data(verse_data):
    for verse in verse_data:
        text = ''.join(ch for ch in verse['text'] if ch not in exclude)
        tokenized_text = word_tokenize(text)
        final_text = []
        for val in tokenized_text:
            if val not in stopwords:
                if 'Yahweh' in val:
                    val = val.replace('Yahweh', 'God')
                final_text.append(val)
        verse['tokenized_text'] = final_text
tokenize_data(verse_data)

## Doc2Vec

In [7]:
import gensim

Using TensorFlow backend.


In [53]:
def create_corpus(verse_tokenized_data, use_nltk=False):
    for verse in verse_data:
        if use_nltk:
            words = verse['tokenized_text']
            yield gensim.models.doc2vec.TaggedDocument(words, [verse['verse']])
        else:
            words = verse['text']
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(words), [verse['verse']])

In [76]:
tagged_docs = list(create_corpus(verse_data, use_nltk=False))

In [82]:
model = gensim.models.Doc2Vec(size=20, iter=100, workers=11, min_count=4, window=4, alpha=0.0001, min_alpha=1e-6)

In [83]:
model.build_vocab(tagged_docs)

In [84]:
%time model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.iter)

Wall time: 1min 52s


51933964

In [87]:
similar = model.docvecs.most_similar([model.infer_vector(verse_data[151]['text'])], topn=10)
similar

[('Psalms 52:8', 0.7630136013031006),
 ('Exodus 6:24', 0.7618410587310791),
 ('Esther 1:5', 0.7202386856079102),
 ('1 Kings 8:41', 0.7128696441650391),
 ('Philippians 2:3', 0.7122591733932495),
 ('Numbers 20:29', 0.7117135524749756),
 ('Galatians 3:9', 0.7078370451927185),
 ('1 John 5:3', 0.7066912651062012),
 ('Esther 9:30', 0.7063077092170715),
 ('Joshua 19:47', 0.7043663263320923)]

In [90]:
similar_tags = [val[0] for val in similar]
print('Actual: ' + verse_data[151]['text'] + '\n')
for tag in similar_tags:
    for verse in verse_data:
        if verse['verse'] == tag:
            print(tag + ':', verse['text'] + '\n\n')

Actual: Make a ship of gopher wood. You shall make rooms in the ship, and shall seal it inside and outside with pitch.

Psalms 52:8: But as for me, I am like a green olive tree in God’s house.
I trust in God’s loving kindness forever and ever.


Exodus 6:24: The sons of Korah: Assir, and Elkanah, and Abiasaph; these are the families of the Korahites.


Esther 1:5: When these days were fulfilled, the king made a seven day feast for all the people who were present in Shushan the palace, both great and small, in the court of the garden of the king’s palace.


1 Kings 8:41: “Moreover concerning the foreigner, who is not of your people Israel, when he comes out of a far country for your name’s sake


Philippians 2:3: doing nothing through rivalry or through conceit, but in humility, each counting others better than himself;


Numbers 20:29: When all the congregation saw that Aaron was dead, they wept for Aaron thirty days, even all the house of Israel.


Galatians 3:9: So then, those who ar

## TF-IDF and Cosine Similarity

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [106]:
vectorizer = TfidfVectorizer(stop_words='english')

In [107]:
term_idf = vectorizer.fit_transform([verse['text'] for verse in verse_data])

In [108]:
cosine_similarity_matrix = cosine_similarity(term_idf)

In [109]:
x = list(reversed(np.argsort(cosine_similarity_matrix[0])))
print('Actual: ' + verse_data[0]['text'], '\n\nClosest: ' + verse_data[x[1]]['text'])
score = list(reversed(sorted(cosine_similarity_matrix[0])))[1]
print('Score: ' + str(score))

Actual: In the beginning, God created the heavens and the earth. 

Closest: This is the history of the generations of the heavens and of the earth when they were created, in the day that Yahweh God made the earth and the heavens.
Score: 0.678020989269


## Combined

In [132]:
def get_similar_values(tokenized_text, total_values=10):
    inferred_vector = model.infer_vector(tokenized_text)
    most_similar = model.docvecs.most_similar([inferred_vector], topn=total_values + 10)
    text = [''.join(tokenized_text)]
    for verse_name, _ in most_similar:
        for verse in verse_data:
            if verse['verse'] == verse_name:
                text.append(verse['text'])
    vectorizer = TfidfVectorizer(stop_words='english')
    tf_idf_transform = vectorizer.fit_transform(text)
    cos_sim_matrix = cosine_similarity(tf_idf_transform)
    sim_text = cos_sim_matrix[0][1:]
    final_text = []
    final_indices = list(reversed(np.argsort(sim_text)))[:10]
    for index in final_indices:
        final_text.append(text[index])
    return final_text

In [134]:
results = get_similar_values(verse_data[151]['text'])
print('Actual: ' + verse_data[151]['text'])
for val in results:
    print(val + '\n\n')

Actual: Make a ship of gopher wood. You shall make rooms in the ship, and shall seal it inside and outside with pitch.
Jesus said to them,
“If you were blind, you would have no sin; but now you say, ‘We see.’ Therefore your sin remains.


Let the lying lips be mute,
which speak against the righteous insolently, with pride and contempt.


But as for me, I am like a green olive tree in God’s house.
I trust in God’s loving kindness forever and ever.


The sons of Korah: Assir, and Elkanah, and Abiasaph; these are the families of the Korahites.


When these days were fulfilled, the king made a seven day feast for all the people who were present in Shushan the palace, both great and small, in the court of the garden of the king’s palace.


“Moreover concerning the foreigner, who is not of your people Israel, when he comes out of a far country for your name’s sake


doing nothing through rivalry or through conceit, but in humility, each counting others better than himself;


When all the con