In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re
import string
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
nlp.max_length = 5000000

## Functions

In [4]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation, 
    remove words containing numbers and remove consecutive multiple white spaces.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(' +', ' ', text)
    return text

def remove_stop(doc_tokens):
    """removes stop words and returns a list of all nonstop words"""
    return [token.text for token in doc_tokens if not token.is_stop]

## Clean, Tokenize, remove stop words

In [5]:
clean_bible = clean_text((open('Bibletxt.txt', 'r').read().replace('\n', '')))

In [6]:
clean_koran = clean_text((open('English-Yusuf-Ali-59.txt', 'r').read().replace('\n', '')))

In [7]:
clean_tanakh = clean_text((open('Tanakh1917.txt', 'r', errors = 'ignore').read().replace('\n', '')))

In [8]:
bible_tokens = nlp(clean_bible)

In [9]:
koran_tokens = nlp(clean_koran)

In [10]:
tanakh_tokens = nlp(clean_tanakh)

In [11]:
no_stop_bible = remove_stop(bible_tokens)

In [12]:
no_stop_koran = remove_stop(koran_tokens)

In [13]:
no_stop_tanakh = remove_stop(tanakh_tokens)

In [17]:
no_stop_koran

['allah',
 'gracious',
 'mercifulpraise',
 'allah',
 'cherisher',
 'sustainer',
 'worldsmost',
 'gracious',
 'mercifulmaster',
 'day',
 'judgmentthee',
 'worship',
 'thine',
 'aid',
 'seekshow',
 'straight',
 'waythe',
 'way',
 'thou',
 'hast',
 'bestowed',
 'thy',
 'grace',
 'portion',
 'wrath',
 'astrayalmthis',
 'book',
 'guidance',
 'sure',
 'doubt',
 'fear',
 'allahwho',
 'believe',
 'unseen',
 'steadfast',
 'prayer',
 'spend',
 'provided',
 'themand',
 'believe',
 'revelation',
 'sent',
 'thee',
 'sent',
 'thy',
 'time',
 'hearts',
 'assurance',
 'hereafterthey',
 'true',
 'guidance',
 'lord',
 'prosperas',
 'reject',
 'faith',
 'thou',
 'warn',
 'warn',
 'believeallah',
 'hath',
 'set',
 'seal',
 'hearts',
 'hearing',
 'eyes',
 'veil',
 'great',
 'penalty',
 'incurof',
 'people',
 'believe',
 'allah',
 'day',
 'believefain',
 'deceive',
 'allah',
 'believe',
 'deceive',
 'realise',
 'notin',
 'hearts',
 'disease',
 'allah',
 'increased',
 'disease',
 'grievous',
 'penalty',
 'in

In [14]:
len(no_stop_bible)

340751

In [15]:
len(no_stop_koran)

64620

In [16]:
len(no_stop_tanakh)

259241

In [153]:
# displacy.render(no_stop_koran, style="dep")

ValueError: [E096] Invalid object passed to displaCy: Can only visualize Doc or Span objects, or dicts if set to manual=True.