# Preliminary Scripts for GloVe Embeddings

Author: Brandon Fan

In [1]:
import os
import json

## Load Data

In [2]:
bible_data = json.load(open('../bible-files/english-web-bible.json', encoding='utf-8-sig'))

In [3]:
verse_data = []
for book in bible_data:
    for chapter in book['data']:
        for verses in chapter['verses']:
           verse_data.append(verses)

## Preprocess Text

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [5]:
stopwords = set(stopwords.words('english'))
exclude = set(string.punctuation)
def tokenize_data(verse_data):
    for verse in verse_data:
        text = ''.join(ch for ch in verse['text'] if ch not in exclude)
        tokenized_text = word_tokenize(text)
        final_text = []
        for val in tokenized_text:
            if val not in stopwords:
                if 'Yahweh' in val:
                    val = val.replace('Yahweh', 'God')
                final_text.append(val)
        verse['tokenized_text'] = final_text
tokenize_data(verse_data)

## GloVe Import

In [6]:
import pandas as pd
import csv

In [7]:
glove_data_file = './glove.6B/glove.6B.200d.txt'

In [8]:
words = pd.read_table(glove_data_file, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [9]:
def vec(w):
  return words.loc[w].as_matrix()

In [16]:
vec('okay').shape

(200,)

## Convert Words to Vectors

In [17]:
import numpy as np

In [20]:
MAX_LEN = 24

In [30]:
verse_data_with_glove = []
for verse in verse_data:
    vector = np.array([])
    for word in verse['tokenized_text'][:MAX_LEN]:
        try:
            vector = np.append(vector, vec(word))
        except Exception:
            vector = np.append(vector, np.zeros(200))
    if vector.shape[0] < MAX_LEN * 200:
        vector = np.append(vector, np.zeros([MAX_LEN * 200 - vector.shape[0]]))
    verse['vector'] = vector
    verse_data_with_glove.append(verse)
len(verse_data_with_glove)

31103

In [31]:
verse_data_with_glove[0]['vector'].shape[0]

4800

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
cos_sim_matrix = cosine_similarity([verse['vector'] for verse in verse_data_with_glove])

In [37]:
verse_data[np.argmax(cos_sim_matrix[0][1:])]

{'text': 'For I know that Yahweh is great,\nthat our Lord is above all gods.',
 'tokenized_text': ['For', 'I', 'know', 'God', 'great', 'Lord', 'gods'],
 'vector': array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 'verse': 'Psalms 135:5',
 'verse_number': '5'}

# Get Similar Values

In [65]:
def get_similar_values(verse, total_values=10):
    proper_index = None
    for index, verse_text in enumerate(verse_data):
        if verse_text['verse'] == verse:
            proper_index = index
            break
    sim_text = cos_sim_matrix[proper_index][1:]
    final_text = []
    final_indices = list(reversed(np.argsort(sim_text)))[:total_values]
    for index in final_indices:
        final_text.append(verse_data[index])
    return final_text

In [69]:
similar_values = get_similar_values('Genesis 1:1', total_values=10)
for value in similar_values:
    print(value['verse'] + ': ' + value['text'] + '\n\n')

Psalms 135:5: For I know that Yahweh is great,
that our Lord is above all gods.


Psalms 33:13: Yahweh looks from heaven.
He sees all the sons of men.


Psalms 105:6: you offspring of Abraham, his servant,
you children of Jacob, his chosen ones.


1 Chronicles 16:13: you offspring of Israel his servant,
you children of Jacob, his chosen ones.


Job 8:8: “Please inquire of past generations.
Find out about the learning of their fathers.


John 1:1: In the beginning was the Word, and the Word was with God, and the Word was God.


Luke 24:53: and were continually in the temple, praising and blessing God. Amen.


Matthew 28:20: teaching them to observe all things that I commanded you. Behold, I am with you always, even to the end of the age.” Amen.


Psalms 34:15: Yahweh’s eyes are toward the righteous.
His ears listen to their cry.


Genesis 9:18: The sons of Noah who went out from the ship were Shem, Ham, and Japheth. Ham is the father of Canaan.




## Finetuning GloVe for Bible Text

In [113]:
import gensim
from nltk.tokenize import RegexpTokenizer

In [103]:
f = open(glove_data_file, encoding='utf8')
lines = f.readlines()
f.close()

In [115]:
f_write = open('glove_gensim.model', 'w', encoding='utf8')

In [116]:
dimension_string = str(len(lines)) + ' ' + glove_data_file.split('.')[-2].split('d')[0]

In [117]:
f_write.write(dimension_string)
f_write.write('\n')
for i in lines:
     f_write.write(i)
f_write.close()

In [118]:
Word2vec_keys = []
with open( 'glove_gensim.model', 'r', encoding='utf8') as f:
    for line in f:
        line = line.split()
        # word and vec
        Word2vec_keys.append( line[0] )

tokenizer = RegexpTokenizer( r"\w+|[^\w\s]" ) # include punctunation

In [124]:
model = gensim.models.Word2Vec( [ Word2vec_keys ], size=200, alpha=0.0001, workers=10, 
                               min_alpha=1e-6, iter=55, min_count=0 )

In [125]:
corpora = [verse['tokenized_text'] for verse in verse_data]

In [126]:
model.build_vocab(corpora, update=True)

In [127]:
model.train( corpora, total_examples = model.corpus_count, epochs = model.iter )

18653597

In [129]:
with open( 'glove_gensim_finetuned.txt', 'w+', encoding='utf8' ) as F:
    for key in model.wv.vocab.keys():
        vec = model[ key ]
        vec = [ str( float( item ) ) for item in list( vec ) ]
        line = key + ' ' + ' '.join( list( vec ) ) + '\n'
        F.write( line )
    F.close()

In [130]:
finetuned_words = pd.read_table('glove_gensim_finetuned.txt', sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [131]:
def vec_finetuned(w):
  return finetuned_words.loc[w].as_matrix()

In [132]:
verse_data_with_glove_finetuned = []
for verse in verse_data:
    vector = np.array([])
    for word in verse['tokenized_text'][:MAX_LEN]:
        try:
            vector = np.append(vector, vec(word))
        except Exception:
            vector = np.append(vector, np.zeros(200))
    if vector.shape[0] < MAX_LEN * 200:
        vector = np.append(vector, np.zeros([MAX_LEN * 200 - vector.shape[0]]))
    verse['vector'] = vector
    verse_data_with_glove_finetuned.append(verse)
len(verse_data_with_glove_finetuned)

31103

In [133]:
cos_sim_matrix_finetuned = cosine_similarity([verse['vector'] for verse in verse_data_with_glove_finetuned])

In [134]:
def get_similar_values_finetuned(verse, total_values=10):
    proper_index = None
    for index, verse_text in enumerate(verse_data):
        if verse_text['verse'] == verse:
            proper_index = index
            break
    sim_text = cos_sim_matrix_finetuned[proper_index][1:]
    final_text = []
    final_indices = list(reversed(np.argsort(sim_text)))[:total_values]
    for index in final_indices:
        final_text.append(verse_data[index])
    return final_text

In [135]:
similar_values = get_similar_values('Genesis 1:1', total_values=10)
for value in similar_values:
    print(value['verse'] + ': ' + value['text'] + '\n\n')

Psalms 135:5: For I know that Yahweh is great,
that our Lord is above all gods.


Psalms 33:13: Yahweh looks from heaven.
He sees all the sons of men.


Psalms 105:6: you offspring of Abraham, his servant,
you children of Jacob, his chosen ones.


1 Chronicles 16:13: you offspring of Israel his servant,
you children of Jacob, his chosen ones.


Job 8:8: “Please inquire of past generations.
Find out about the learning of their fathers.


John 1:1: In the beginning was the Word, and the Word was with God, and the Word was God.


Luke 24:53: and were continually in the temple, praising and blessing God. Amen.


Matthew 28:20: teaching them to observe all things that I commanded you. Behold, I am with you always, even to the end of the age.” Amen.


Psalms 34:15: Yahweh’s eyes are toward the righteous.
His ears listen to their cry.


Genesis 9:18: The sons of Noah who went out from the ship were Shem, Ham, and Japheth. Ham is the father of Canaan.


