In [1]:
import json 

# JSON text is UTF-8 by specification; name the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open("kjv.json", encoding="utf-8") as f:
    kjv = json.load(f)

# Keys found under 'KingJamesVersion', in the order they appear in the file.
chapters = list(kjv['KingJamesVersion'])

In [2]:
def make_corpus(book, chapters):
    """Build one newline-joined string per requested key of ``book``.

    For every key in ``chapters`` this takes element 1 of ``book[key]``,
    discards that element's first item, and joins the remaining items with
    newline characters.

    Args:
        book: mapping from key to an indexable whose element 1 is a sequence
            of strings.
        chapters: iterable of keys to look up in ``book``.

    Returns:
        A list of strings, one per key, in the order the keys were given.
    """
    return ['\n'.join(book[name][1][1:]) for name in chapters]

In [3]:
# Preview the text built for the first key in `chapters`.
print(make_corpus(kjv['KingJamesVersion'], chapters[:1])[0])

Adam, Sheth, Enosh,
Kenan, Mahalaleel, Jered,
Henoch, Methuselah, Lamech,
Noah, Shem, Ham, and Japheth.
The sons of Japheth; Gomer, and Magog, and Madai, and Javan, and Tubal, and Meshech, and Tiras.
And the sons of Gomer; Ashchenaz, and Riphath, and Togarmah.
And the sons of Javan; Elishah, and Tarshish, Kittim, and Dodanim.
The sons of Ham; Cush, and Mizraim, Put, and Canaan.
And the sons of Cush; Seba, and Havilah, and Sabta, and Raamah, and Sabtecha. And the sons of Raamah; Sheba, and Dedan.
And Cush begat Nimrod: he began to be mighty upon the earth.
And Mizraim begat Ludim, and Anamim, and Lehabim, and Naphtuhim,
And Pathrusim, and Casluhim, (of whom came the Philistines,) and Caphthorim.
And Canaan begat Zidon his firstborn, and Heth,
The Jebusite also, and the Amorite, and the Girgashite,
And the Hivite, and the Arkite, and the Sinite,
And the Arvadite, and the Zemarite, and the Hamathite.
The sons of Shem; Elam, and Asshur, and Arphaxad, and Lud, and Aram, and Uz, and Hul, and

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# NOTE(review): no visible code reads this module-level instance (it is only
# mentioned in commented-out lines); InformationRetrieval builds its own
# TfidfVectorizer. Possibly a leftover, confirm before removing.
vectorizer = TfidfVectorizer()

class Documents:
    """Thin container for the collection of texts that retrieval runs over.

    Attributes:
        corpus: the object passed to the constructor, stored as-is (no copy).
            The visible caller passes the list returned by ``make_corpus``.
    """

    def __init__(self, corpus):
        self.corpus = corpus

In [5]:
# Build the full corpus: one string per key in `chapters`.
corpus = make_corpus(kjv['KingJamesVersion'], chapters)
# The commented-out lines below are stale: Documents.__init__ accepts only
# `corpus`, and its `vectorize` method is itself commented out.
#doc = Documents(vectorizer, corpus)
#X = doc.vectorize()
#df = pd.DataFrame(X, index = doc.vectorizer.get_feature_names())

In [6]:
class Query:
    """Wrapper around the plain-text question posed to the retrieval system.

    Attributes:
        text: the query string exactly as supplied to the constructor.
    """

    def __init__(self, text):
        self.text = text

In [23]:
class InformationRetrieval:
    """Rank the texts of a ``Documents`` collection against a free-text query.

    The corpus is embedded with a ``TfidfVectorizer`` fitted at construction
    time; a query is projected into the same vocabulary and compared with
    every document by cosine similarity.

    Attributes:
        documents: object exposing ``corpus``, the list of document strings.
        vectorizer: the ``TfidfVectorizer`` fitted on ``documents.corpus``.
        inverted_doc: dense term-by-document ``DataFrame`` (one row per
            vocabulary term, one column per document position).
        vocab_len: number of rows of ``inverted_doc``.
        doc_len: number of documents in the corpus.
    """

    def __init__(self, docs):
        self.documents = docs
        self.vectorizer = TfidfVectorizer()
        self.inverted_doc = self.corpus_vectorizer()
        self.vocab_len = self.inverted_doc.shape[0]
        self.doc_len = len(self.documents.corpus)

    def corpus_vectorizer(self):
        """Fit the vectorizer on the corpus and return the term-by-document table.

        Returns:
            A ``DataFrame`` indexed by vocabulary term whose columns are the
            document positions ``0, 1, ...`` in corpus order.
        """
        X = self.vectorizer.fit_transform(self.documents.corpus)
        # Transpose so that each column is one document.
        X = X.T.toarray()
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        if hasattr(self.vectorizer, "get_feature_names_out"):
            terms = self.vectorizer.get_feature_names_out()
        else:
            terms = self.vectorizer.get_feature_names()
        return pd.DataFrame(X, index=terms)

    def query_vectorizer(self, query):
        """Project ``query.text`` into the fitted vocabulary.

        Returns:
            A 1-D array of length ``vocab_len``.
        """
        q_vec = self.vectorizer.transform([query.text]).toarray().reshape(self.vocab_len,)
        return q_vec

    def retrieve(self, query, top_k=7):
        """Return the corpus texts most similar to ``query``.

        Prints the vocabulary size, then the similarity and document position
        of every returned text.

        Args:
            query: object exposing ``text``, the query string.
            top_k: how many of the highest-ranked documents to inspect. The
                default of 7 matches the previous hard-coded cut-off; ``None``
                inspects every document.

        Returns:
            A list of document strings ordered by decreasing cosine
            similarity. Documents whose similarity is exactly zero are left
            out, so the list may be shorter than ``top_k`` or empty.
        """
        print(f"Vocab len {self.vocab_len}")
        q_vec = self.query_vectorizer(query)

        doc_matrix = self.inverted_doc.to_numpy(dtype=float)
        dots = q_vec @ doc_matrix

        # Cosine similarity divides by the PRODUCT of both norms. The earlier
        # `dot / norm(d) * norm(q)` evaluated as `(dot / norm(d)) * norm(q)`.
        # NOTE(review): TfidfVectorizer L2-normalises by default, which would
        # make both norms 1 and hide the slip in typical runs.
        denom = np.linalg.norm(doc_matrix, axis=0) * np.linalg.norm(q_vec)

        # An all-zero document or query has no direction: score it 0 instead
        # of dividing by zero and letting NaN leak into the ranking.
        sims = np.zeros(dots.shape[0], dtype=float)
        usable = denom > 0
        sims[usable] = dots[usable] / denom[usable]

        # A stable sort keeps tied documents in corpus order, matching what
        # sorted(..., reverse=True) produced before.
        ranked = np.argsort(-sims, kind="stable")[:top_k]

        candidates = []
        for doc_idx in ranked:
            k = int(doc_idx)
            v = float(sims[k])
            if v != 0.0:
                print(f'Similarity: {v}')
                print(f'With Document: {k}')
                candidates.append(self.documents.corpus[k])
        return candidates

In [14]:
# Wire the pieces together: wrap the corpus, pose a question, rank the texts.
docs = Documents(corpus)
query = Query("Who are the sons of Abraham")
ir = InformationRetrieval(docs)
candidates = ir.retrieve(query)
# Show the top-ranked text; this raises IndexError when retrieve() found no match.
candidates[0]

Vocab len 3467
Similarity: 0.3386962470885654
With Document: 0
Similarity: 0.20532031598657408
With Document: 38
Similarity: 0.19277463071004655
With Document: 33
Similarity: 0.1908007279529765
With Document: 23
Similarity: 0.18919385320046783
With Document: 1
Similarity: 0.1881514437567516
With Document: 4
Similarity: 0.18561207127666626
With Document: 50


"Adam, Sheth, Enosh,\nKenan, Mahalaleel, Jered,\nHenoch, Methuselah, Lamech,\nNoah, Shem, Ham, and Japheth.\nThe sons of Japheth; Gomer, and Magog, and Madai, and Javan, and Tubal, and Meshech, and Tiras.\nAnd the sons of Gomer; Ashchenaz, and Riphath, and Togarmah.\nAnd the sons of Javan; Elishah, and Tarshish, Kittim, and Dodanim.\nThe sons of Ham; Cush, and Mizraim, Put, and Canaan.\nAnd the sons of Cush; Seba, and Havilah, and Sabta, and Raamah, and Sabtecha. And the sons of Raamah; Sheba, and Dedan.\nAnd Cush begat Nimrod: he began to be mighty upon the earth.\nAnd Mizraim begat Ludim, and Anamim, and Lehabim, and Naphtuhim,\nAnd Pathrusim, and Casluhim, (of whom came the Philistines,) and Caphthorim.\nAnd Canaan begat Zidon his firstborn, and Heth,\nThe Jebusite also, and the Amorite, and the Girgashite,\nAnd the Hivite, and the Arkite, and the Sinite,\nAnd the Arvadite, and the Zemarite, and the Hamathite.\nThe sons of Shem; Elam, and Asshur, and Arphaxad, and Lud, and Aram, and

In [52]:
from transformers import T5Tokenizer, MT5Model

In [54]:
# NOTE(review): the recorded output for this cell is NoneType, i.e.
# from_pretrained did not hand back a tokenizer object in that run.
# Presumably an environment problem (e.g. the optional `sentencepiece`
# package missing); confirm before relying on `tokenizer`.
tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
type(tokenizer)

NoneType

In [46]:
# No visible cell imports AlbertTokenizer, so this line only worked thanks to
# leftover kernel state. Import it here so the cell runs on a fresh kernel.
from transformers import AlbertTokenizer

type(AlbertTokenizer.from_pretrained('albert-xlarge-v2'))

NoneType

In [32]:
# NOTE(review): this cell ran as In [32], before the cell that now assigns
# `tokenizer` (In [54]); the recorded output may describe an earlier value.
type(tokenizer)

NoneType