In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
import nltk


In [3]:
# List the file ids of every text available in the NLTK Gutenberg corpus.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [4]:
# Load the raw King James Bible text from the NLTK Gutenberg corpus,
# then preview its first 200 characters.
bible = gutenberg.raw('bible-kjv.txt')
print('\nRaw:\n', bible[:200])


Raw:
 [The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without 


In [5]:
# Strip every bracketed span (e.g. the "[The King James Bible]" title) from the text.
# A raw string is used so the backslashes reach the regex engine untouched; in a
# normal string literal "\[" and "\]" are invalid escape sequences and trigger a
# DeprecationWarning/SyntaxWarning. The match is non-greedy and "." does not cross
# newlines, so only brackets opened and closed on the same line are removed.
pattern = r"\[.*?\]"
kj_bible = re.sub(pattern, "", bible)

In [6]:
# Preview the first 200 characters after the bracketed text has been removed.
print(kj_bible[:200])



The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and da


In [7]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space
# and drop leading/trailing whitespace, then preview the first 500 characters.
kj_bible = ' '.join(kj_bible.split())
print(kj_bible[:500])

The Old Testament of the King James Bible The First Book of Moses: Called Genesis 1:1 In the beginning God created the heaven and the earth. 1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters. 1:3 And God said, Let there be light: and there was light. 1:4 And God saw the light, that it was good: and God divided the light from the darkness. 1:5 And God called the light Day, and the darkness he called Ni


In [8]:
# Load spaCy's small English pipeline by its full package name. The 'en' shortcut
# link only exists in spaCy 2.x installations and was removed in spaCy 3, whereas
# the full package name works in both.
nlp = spacy.load('en_core_web_sm')
# Only the first 200000 characters are parsed -- presumably to keep parsing time
# and memory manageable; every count below therefore covers just this prefix.
kj_bible_doc = nlp(kj_bible[:200000])

print('Length of kj_bible_doc: {}'.format(len(kj_bible_doc)))


Length of kj_bible_doc: 45340


In [9]:
from collections import Counter
# create a function that counts the most common words 
def word_counter(text, include_stop=False):
    """Count how often each token's surface form occurs in ``text``.

    Args:
        text: Iterable of token-like objects exposing ``text``, ``is_punct``
            and ``is_stop`` (for example a parsed spaCy document).
        include_stop: When true, stop words are counted as well; by default
            they are skipped.

    Returns:
        collections.Counter mapping token text to its number of occurrences.
        Punctuation tokens are never counted.
    """
    def is_counted(tok):
        # Punctuation is always dropped; stop words only when not requested.
        if tok.is_punct:
            return False
        return include_stop or not tok.is_stop

    return Counter(tok.text for tok in text if is_counted(tok))

In [10]:
# Twenty most frequent tokens in the parsed text, ignoring punctuation and stop words.
kj_bible_words = word_counter(kj_bible_doc, include_stop=False).most_common(n=20)
print('Bible word count:', kj_bible_words)

Bible word count: [('unto', 573), ('said', 470), ('thou', 271), ('thy', 261), ('thee', 261), ('shall', 245), ('God', 226), ('father', 184), ('land', 179), ('Jacob', 178), ('came', 175), ('LORD', 170), ('son', 159), ('sons', 143), ('Joseph', 137), ('Abraham', 130), ('earth', 120), ('man', 118), ('years', 111), ('went', 105)]


In [37]:
#count the lemmas within the text 
def lemma_count(text, include_stop=True):
    """Count how often each lemma occurs in ``text``.

    Args:
        text: Iterable of token-like objects exposing ``lemma_``, ``is_punct``
            and ``is_stop`` (for example a parsed spaCy document).
        include_stop: When true (the default), stop words are counted too;
            pass ``False`` to skip them.

    Returns:
        collections.Counter mapping each lemma to its number of occurrences.
        Punctuation tokens are never counted.
    """
    counts = Counter()
    for token in text:
        if token.is_punct:
            continue
        if token.is_stop and not include_stop:
            continue
        counts[token.lemma_] += 1
    return counts

# Twenty most frequent lemmas in the parsed text, with stop words excluded.
bible_lemma_count = lemma_count(kj_bible_doc, include_stop=False).most_common(n=20)
print('Bible Lemma Count:', bible_lemma_count)

Bible Lemma Count: [('unto', 580), ('say', 543), ('son', 302), ('thou', 271), ('thy', 262), ('thee', 261), ('come', 252), ('shall', 245), ('God', 226), ('father', 193), ('land', 185), ('man', 184), ('Jacob', 178), ('LORD', 170), ('brother', 166), ('Joseph', 137), ('day', 132), ('Abraham', 130), ('wife', 125), ('earth', 120)]


In [34]:
# Keep only tokens whose text is purely alphabetic (drops punctuation and any
# token containing digits or symbols).
# NOTE(review): this rebinds kj_bible_doc from the parsed document to a plain
# list of tokens, so later cells no longer see the original parsed object.
kj_bible_doc = list(filter(lambda token: token.text.isalpha(), kj_bible_doc))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
vectorize = CountVectorizer()

# fit_transform expects an iterable of raw text strings. kj_bible_doc holds spaCy
# tokens, which are not strings and make the vectorizer fail during preprocessing,
# so each token's text is passed instead.
# NOTE(review): every token becomes its own one-word "document", so each row of
# the matrix has at most one non-zero entry -- confirm this is the intended unit
# (sentences or verses may be more useful documents).
bag_of_words = vectorize.fit_transform([token.text for token in kj_bible_doc])
bag_of_words