In [None]:
import collections
import operator
import os

from lxml import etree
import textblob

In [None]:
# Notebook configuration. Every path is relative to the current working directory.

# Book to analyse; "" or "All" (case-insensitive) selects every book in the file.
BOOK_OF_BIBLE = "All"
# Source Bible XML file.
XML_FILE = "data/NASB.xmm"
# True: write each per-book report as Markdown; False: print it instead.
OUTPUT_TO_FILE = True
OUTPUT_FILE_PATH = "output/"
# Translation label: the file name up to its first dot ("data/NASB.xmm" -> "NASB").
BIBLE_TRANSLATION = XML_FILE.rpartition("/")[2].partition(".")[0]

# Book name -> concatenated verse text, kept in insertion order.
book_strings = collections.OrderedDict()
# README content; full_report() appends one link line per report it writes.
readme_string = "# " + BIBLE_TRANSLATION + " NLP Reports\n\n"

In [None]:
# Parse the Bible XML once; later cells query it with XPath.
# Note: etree.parse returns the ElementTree object, not the root element,
# despite the variable name.
root = etree.parse(XML_FILE)

In [None]:
# Select the <b> (book) elements to analyse: every book when BOOK_OF_BIBLE is
# "" or "all" (case-insensitive), otherwise only the book whose "n" attribute
# equals BOOK_OF_BIBLE exactly.
if BOOK_OF_BIBLE.lower() in ("", "all"):
    book_xml = root.xpath("//b")
else:
    # Pass the name as an XPath variable instead of formatting it into the
    # expression, so a name containing a quote character cannot break the query.
    book_xml = root.xpath("//b[@n=$book_name]", book_name=BOOK_OF_BIBLE)

# Fail here, with a clear message, rather than silently producing no reports
# (e.g. when the book name is misspelled or differs in case from the XML).
if not book_xml:
    raise ValueError(
        "No <b> elements matched BOOK_OF_BIBLE={!r} in {}".format(BOOK_OF_BIBLE, XML_FILE)
    )

In [None]:
# Concatenate every verse of each selected book into one string per book.
for book in book_xml:
    # Prefer the "n" attribute (the one the book filter matches on); fall back
    # to the first attribute value, which is what this cell keyed on before.
    book_name = book.get("n") or book.values()[0]
    # An element without a text node has text None; skip it so the literal
    # word "None" is not injected into the text that gets analysed.
    # Each verse keeps its leading space, so the resulting string has the same
    # shape as before. Joining once avoids rebuilding the string per verse.
    book_strings[book_name] = "".join(
        " {}".format(verse.text)
        for chapter in book
        for verse in chapter
        if verse.text is not None
    )

In [None]:
def _get_noun_phrases(noun_phrases, n=10):
    """Get the most common, n noun phrases."""
    noun_phrase_count = dict()

    for noun_phrase in noun_phrases:
        if noun_phrase_count.get(noun_phrase):
            noun_phrase_count[noun_phrase] += 1
        else:
            noun_phrase_count[noun_phrase] = 1

    # sort the noun_phrases by occurrence (in descending order)
    sorted_noun_phrase_count = sorted(noun_phrase_count.items(), key=operator.itemgetter(1), reverse=True)

    return sorted_noun_phrase_count[:n]


def full_report(bible_blob, book_of_bible):
    """Build the NLP report for one book, then write it to disk or print it.

    The report is Markdown: sentiment polarity and subjectivity (rounded to 5
    decimal places) followed by the most common noun phrases.

    Args:
        bible_blob: object exposing ``noun_phrases`` and
            ``sentiment.polarity`` / ``sentiment.subjectivity`` (a
            ``textblob.TextBlob`` in this notebook).
        book_of_bible: name of the book; used in headings, in the output file
            name and in the README link.

    Side effects:
        When OUTPUT_TO_FILE is true, writes
        ``<cwd>/<OUTPUT_FILE_PATH>/<BIBLE_TRANSLATION>/<book>_<BIBLE_TRANSLATION>.md``
        (creating the directory if needed), prints a confirmation and appends
        a link line to the global ``readme_string``. Otherwise prints the
        report.
    """
    global readme_string

    # str.capitalize() lowercases everything after the first character
    # ("Song of Solomon" -> "Song of solomon"), so only the first letter is
    # upper-cased here and the rest of the name is left untouched.
    display_name = book_of_bible[:1].upper() + book_of_bible[1:]

    sorted_noun_phrases = _get_noun_phrases(bible_blob.noun_phrases)
    sentiment = bible_blob.sentiment

    parts = [
        "# NLP Analysis for {} (using the {} translation)\n\n".format(display_name, BIBLE_TRANSLATION),
        "## Sentiment\n\n",
        "Polarity†: {}\n\n".format(round(sentiment.polarity, 5)),
        "Subjectivity‡: {}".format(round(sentiment.subjectivity, 5)),
        "\n\n",
        "## Most Common Noun Phrases in {}:\n\n".format(display_name),
    ]
    # Each entry is a (phrase, count) pair; the count is printed first.
    parts.extend(" * {}\t-  {}\n".format(count, phrase) for phrase, count in sorted_noun_phrases)
    parts.extend([
        "\n\n",
        "† Polarity is measured on a scale of [-1.0, 1.0] and measures whether that language used by the author is negative, neutral, or positive.\n\n",
        "‡ Subjectivity is measured on a scale of [0.0, 1.0] and measures how subjective the text is (0.0 being very objective; 1.0 being very subjective).",
    ])
    output_string = "".join(parts)

    if OUTPUT_TO_FILE:
        report_file_name = "{}_{}.md".format(book_of_bible, BIBLE_TRANSLATION)
        report_dir = os.path.join(os.getcwd(), OUTPUT_FILE_PATH, BIBLE_TRANSLATION)
        # A fresh checkout has no output directory yet; create it on demand.
        os.makedirs(report_dir, exist_ok=True)
        # The report contains non-ASCII characters (the dagger footnote marks),
        # so the encoding is fixed instead of left to the platform default.
        with open(os.path.join(report_dir, report_file_name), "w", encoding="utf-8") as report_file:
            report_file.write(output_string)
        print("Wrote {}".format(book_of_bible))
        # A Markdown link destination may not contain raw spaces, so names such
        # as "1 Samuel" are percent-encoded in the link target only.
        readme_string += "- [{}]({})\n".format(book_of_bible, report_file_name.replace(" ", "%20"))
    else:
        print(output_string)

In [None]:
# Start the README index from its header again so that re-running this cell
# does not append duplicate links after the ones from a previous run.
readme_string = "# {} NLP Reports\n\n".format(BIBLE_TRANSLATION)

# Analyse each collected book and emit its report.
for book_name, book_text in book_strings.items():
    full_report(textblob.TextBlob(book_text), book_name)

In [None]:
# The README only indexes report files written to disk, so it is skipped when
# reports were printed instead (OUTPUT_TO_FILE is False).
if OUTPUT_TO_FILE:
    readme_dir = os.path.join(os.getcwd(), OUTPUT_FILE_PATH, BIBLE_TRANSLATION)
    # Create the directory on demand so this cell does not depend on it
    # already existing.
    os.makedirs(readme_dir, exist_ok=True)
    # The with-block closes the file; the encoding is fixed so the output does
    # not depend on the platform default.
    with open(os.path.join(readme_dir, "README.md"), "w", encoding="utf-8") as readme_file:
        readme_file.write(readme_string)