# Preliminary Scripts for GloVe Embeddings

Author: Brandon Fan

In [3]:
import os
import json

## Load Data

In [4]:
bible_data = json.load(open('../bible-files/english-web-bible.json', encoding='utf-8-sig'))

In [5]:
verse_data = []
for book in bible_data:
    for chapter in book['data']:
        for verses in chapter['verses']:
           verse_data.append(verses)

## Preprocess Text

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [7]:
stopwords = set(stopwords.words('english'))
exclude = set(string.punctuation)
def tokenize_data(verse_data):
    for verse in verse_data:
        text = ''.join(ch for ch in verse['text'] if ch not in exclude)
        tokenized_text = word_tokenize(text)
        final_text = []
        for val in tokenized_text:
            if val not in stopwords:
                if 'Yahweh' in val:
                    val = val.replace('Yahweh', 'God')
                final_text.append(val)
        verse['tokenized_text'] = final_text
tokenize_data(verse_data)

## GloVe Import

In [8]:
import pandas as pd
import csv
import numpy as np

In [9]:
glove_data_file = '../server/files/glove.6B.200d.txt'

In [10]:
%%time
glove_data = {}
with open(glove_data_file) as f:
    for line in list(f.readlines()):
        split = line.split()
        word = split[0]
        data = np.array([float(i) for i in split[1:]])
        glove_data[word] = data

CPU times: user 26.4 s, sys: 813 ms, total: 27.2 s
Wall time: 27.2 s


In [11]:
def vec(w):
  return glove_data[w]

In [12]:
vec('okay').shape

(200,)

In [13]:
import sys
print('Size of Dictionary: ' + str(sys.getsizeof(glove_data) / 1000000) + ' mb')

Size of Dictionary: 20.971616 mb


## Convert Words to Vectors

In [14]:
import numpy as np

In [15]:
MAX_LEN = 24

In [16]:
verse_data_with_glove = []
for verse in verse_data:
    vector = np.array([])
    for word in verse['tokenized_text'][:MAX_LEN]:
        try:
            vector = np.append(vector, vec(word))
        except Exception:
            vector = np.append(vector, np.zeros(200))
    if vector.shape[0] < MAX_LEN * 200:
        vector = np.append(vector, np.zeros([MAX_LEN * 200 - vector.shape[0]]))
    verse['vector'] = vector
    verse_data_with_glove.append(verse)
len(verse_data_with_glove)

31103

In [17]:
verse_data_with_glove[0]['vector'].shape[0]

4800

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import sklearn.preprocessing as pp
from scipy import sparse
verse_sparse = sparse.coo_matrix([verse['vector'] for verse in verse_data_with_glove])

In [None]:
%%time
cos_sim_matrix = cosine_similarity(verse_sparse, dense_output=False)

In [62]:
verse_data[np.argmax(cos_sim_matrix[0][1:])]

{'text': 'For I know that Yahweh is great,\nthat our Lord is above all gods.',
 'tokenized_text': ['For', 'I', 'know', 'God', 'great', 'Lord', 'gods'],
 'vector': array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 'verse': 'Psalms 135:5',
 'verse_number': '5'}

In [63]:
sys.getsizeof(cos_sim_matrix) / 1000000

7739.172984

# Get Similar Values

In [37]:
def get_similar_values(verse, total_values=10):
    proper_index = None
    for index, verse_text in enumerate(verse_data):
        if verse_text['verse'] == verse:
            proper_index = index
            break
    sim_text = cos_sim_matrix[proper_index][1:]
    final_text = []
    final_indices = list(reversed(np.argsort(sim_text)))[:total_values]
    for index in final_indices:
        final_text.append(verse_data[index])
    return final_text

In [38]:
similar_values = get_similar_values('Genesis 1:1', total_values=10)
for value in similar_values:
    print(value['verse'] + ': ' + value['text'] + '\n\n')

Psalms 135:5: For I know that Yahweh is great,
that our Lord is above all gods.


Psalms 33:13: Yahweh looks from heaven.
He sees all the sons of men.


Psalms 105:6: you offspring of Abraham, his servant,
you children of Jacob, his chosen ones.


1 Chronicles 16:13: you offspring of Israel his servant,
you children of Jacob, his chosen ones.


Job 8:8: “Please inquire of past generations.
Find out about the learning of their fathers.


John 1:1: In the beginning was the Word, and the Word was with God, and the Word was God.


Luke 24:53: and were continually in the temple, praising and blessing God. Amen.


Matthew 28:20: teaching them to observe all things that I commanded you. Behold, I am with you always, even to the end of the age.” Amen.


Psalms 34:15: Yahweh’s eyes are toward the righteous.
His ears listen to their cry.


Genesis 9:18: The sons of Noah who went out from the ship were Shem, Ham, and Japheth. Ham is the father of Canaan.




In [42]:
from sklearn.externals import joblib
joblib.dump(cos_sim_matrix, 'sim_lookup.pkl')

['sim_lookup.pkl']

## Finetuning GloVe for Bible Text

In [6]:
import gensim
from nltk.tokenize import RegexpTokenizer

Using TensorFlow backend.


In [103]:
f = open(glove_data_file, encoding='utf8')
lines = f.readlines()
f.close()

In [115]:
f_write = open('glove_gensim.model', 'w', encoding='utf8')

In [116]:
dimension_string = str(len(lines)) + ' ' + glove_data_file.split('.')[-2].split('d')[0]

In [117]:
f_write.write(dimension_string)
f_write.write('\n')
for i in lines:
     f_write.write(i)
f_write.close()

In [118]:
Word2vec_keys = []
with open( 'glove_gensim.model', 'r', encoding='utf8') as f:
    for line in f:
        line = line.split()
        # word and vec
        Word2vec_keys.append( line[0] )

tokenizer = RegexpTokenizer( r"\w+|[^\w\s]" ) # include punctunation

In [124]:
model = gensim.models.Word2Vec( [ Word2vec_keys ], size=200, alpha=0.0001, workers=10, 
                               min_alpha=1e-6, iter=55, min_count=0 )

In [125]:
corpora = [verse['tokenized_text'] for verse in verse_data]

In [126]:
model.build_vocab(corpora, update=True)

In [127]:
model.train( corpora, total_examples = model.corpus_count, epochs = model.iter )

18653597

In [129]:
with open( 'glove_gensim_finetuned.txt', 'w+', encoding='utf8' ) as F:
    for key in model.wv.vocab.keys():
        vec = model[ key ]
        vec = [ str( float( item ) ) for item in list( vec ) ]
        line = key + ' ' + ' '.join( list( vec ) ) + '\n'
        F.write( line )
    F.close()

In [7]:
finetuned_words = pd.read_table('glove_gensim_finetuned.txt', sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [8]:
def vec_finetuned(w):
  return finetuned_words.loc[w].as_matrix()

In [16]:
verse_data_with_glove_finetuned = []
for verse in verse_data:
    vector = np.array([])
    for word in verse['tokenized_text'][:MAX_LEN]:
        try:
            vector = np.append(vector, vec(word))
        except Exception:
            vector = np.append(vector, np.zeros(200))
    if vector.shape[0] < MAX_LEN * 200:
        vector = np.append(vector, np.zeros([MAX_LEN * 200 - vector.shape[0]]))
    verse['vector'] = vector
    verse_data_with_glove_finetuned.append(verse)
len(verse_data_with_glove_finetuned)

31103

In [17]:
cos_sim_matrix_finetuned = cosine_similarity([verse['vector'] for verse in verse_data_with_glove_finetuned])

In [18]:
from sklearn.externals import joblib
joblib.dump(cos_sim_matrix_finetuned, 'sim_lookup.pkl')

['sim_lookup.pkl']

In [21]:
def get_similar_values_finetuned(verse, total_values=10):
    proper_index = None
    for index, verse_text in enumerate(verse_data):
        if verse_text['verse'] == verse:
            proper_index = index
            break
    sim_text = cos_sim_matrix_finetuned[proper_index][1:]
    final_text = []
    final_indices = list(reversed(np.argsort(sim_text)))[:total_values]
    for index in final_indices:
        final_text.append(verse_data[index])
    return final_text

In [40]:
similar_values = get_similar_values_finetuned('Genesis 1:1', total_values=10)
for value in similar_values:
    print(value['verse'] + ': ' + value['text'] + '\n\n')

Revelation 22:20: He who testifies these things says,
“Yes, I come quickly.”

Amen! Yes, come, Lord Jesus.


1 Chronicles 4:1: The sons of Judah: Perez, Hezron, Carmi, Hur, and Shobal.


1 Chronicles 2:52: Shobal the father of Kiriath Jearim had sons: Haroeh, half of the Menuhoth.


1 Chronicles 2:53: The families of Kiriath Jearim: the Ithrites, the Puthites, the Shumathites, and the Mishraites; from them came the Zorathites and the Eshtaolites.


1 Chronicles 2:54: The sons of Salma: Bethlehem, the Netophathites, Atroth Beth Joab, and half of the Manahathites, the Zorites.


1 Chronicles 2:55: The families of scribes who lived at Jabez: the Tirathites, the Shimeathites, and the Sucathites. These are the Kenites who came from Hammath, the father of the house of Rechab.


1 Chronicles 3:1: Now these were the sons of David, who were born to him in Hebron: the firstborn, Amnon, of Ahinoam the Jezreelitess; the second, Daniel, of Abigail the Carmelitess;


1 Chronicles 3:2: the third, Abs