In [1]:
import glob
import re
from collections import Counter
from pathlib import Path

import nltk
import pandas as pd
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

def setup_script():
    """
    Download the NLTK packages the rest of the script uses.

    Calls nltk.download() once per package name, in the order
    listed below. Returns nothing.
    """

    required_packages = (
        'punkt',
        'stopwords',
        'wordnet',
        'averaged_perceptron_tagger',
    )
    for package_name in required_packages:
        nltk.download(package_name)


def extract_documents(file_paths):
    """
    Takes a list of file paths as an arg.
    Returns a pandas dataframe with one row per file and
    two columns:
        doc_name - the file name without its directory and
                   without its final extension
        raw_doc  - the complete text of the file

    When opening the file below we must specify an
    encoding: if it is left out, open() uses
    locale.getpreferredencoding(), which might have
    unexpected results. On the original author's Windows
    machine that was 'cp1252', which errors because
    0x9d (part of a 'RIGHT DOUBLE QUOTATION MARK') isn't
    defined in cp1252.
    """

    documents_dict = {
        "doc_name": [],
        "raw_doc": []
    }
    for file_path in file_paths:
        # The context manager closes the file even when read() raises
        with open(file_path, "r", encoding="utf8") as f:
            document = f.read()

        # Path.stem handles the path separators of the running platform
        # and any directory depth, and only strips the last extension
        doc_name = Path(file_path).stem

        documents_dict['raw_doc'].append(document)
        documents_dict['doc_name'].append(doc_name)

    return pd.DataFrame(documents_dict)


def tokenize_document_words(docs_pdf):
    """
    Tokenize the 'raw_doc' text of every row with a
    RegexpTokenizer built from the pattern \\w+ and store
    the resulting word lists in a new 'doc_words' column.
    The dataframe is modified in place; nothing is returned.
    """

    word_tokenizer = RegexpTokenizer(r'\w+')
    docs_pdf['doc_words'] = list(
        map(word_tokenizer.tokenize, docs_pdf.raw_doc)
    )


def split_docs_to_sentences(docs_pdf):
    """
    Takes a documents dataframe as an argument.
    Breaks each document in 'raw_doc' down into sentences
    and stores the list of sentences in a new 'sentences'
    column. The dataframe is modified in place; nothing is
    returned.

    Sentences are found by splitting on full stops. Inside
    each sentence every run of whitespace (including line
    breaks) is collapsed to a single space, so words on
    neighbouring lines are not glued together, and
    fragments that end up empty are dropped.
    """

    def get_doc_sentences(document):
        # str.split() with no argument splits on any whitespace run,
        # which also trims the leading/trailing blanks of the fragment
        fragments = (
            " ".join(fragment.split()) for fragment in document.split(".")
        )
        return [sentence for sentence in fragments if sentence]

    docs_pdf['sentences'] = docs_pdf.raw_doc.apply(get_doc_sentences)


def clean_and_preprocess_docs(docs_pdf):
    """
    Takes a document dataframe.
    Preprocesses the tokenized 'doc_words' column of the
    document pdf in place; nothing is returned.
    Pre-processing steps are decapitalizing every word and
    then removing the words found in NLTK's english
    stopword list.
    """

    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would be scanned once per token
    stop_words = set(stopwords.words('english'))

    def decap_and_remove_stop_words(document):
        lowered_words = (word.lower() for word in document)
        return [word for word in lowered_words if word not in stop_words]

    docs_pdf.doc_words = docs_pdf.doc_words.apply(decap_and_remove_stop_words)
    

def advanced_preprocessing(docs_pdf):
    """
    Reduce every word in 'doc_words' to a base form, in place.

    Each word is first lemmatized with WordNetLemmatizer and
    the lemma is then stemmed with PorterStemmer. Collapsing
    the variations of a word onto one base gives a better
    chance of spotting patterns than treating every variation
    as a different word.
    """

    to_lemma = WordNetLemmatizer().lemmatize
    to_stem = PorterStemmer().stem

    def reduce_to_base(document):
        return [to_stem(to_lemma(word)) for word in document]

    docs_pdf.doc_words = docs_pdf.doc_words.apply(reduce_to_base)


def generate_word_summary_data(docs_pdf):
    """
    Add word and n-gram statistics to the document dataframe.

    Working from the 'doc_words' column, the following columns
    are added in place (nothing is returned):
        word_frequencies - nltk.FreqDist of the document's words
        bigrams          - list of all 2-grams of the words
        trigrams         - list of all 3-grams of the words
        bigrams_freq     - Counter of the bigrams
        trigrams_freq    - Counter of the trigrams
    """

    # Frequency of each word, per document
    docs_pdf['word_frequencies'] = docs_pdf.doc_words.apply(nltk.FreqDist)

    # Every bigram and trigram, per document
    for column, size in (('bigrams', 2), ('trigrams', 3)):
        docs_pdf[column] = docs_pdf.doc_words.apply(
            lambda doc_words, size=size: list(ngrams(doc_words, size))
        )

    # How often each bigram / trigram occurs, per document
    for column in ('bigrams', 'trigrams'):
        docs_pdf[column + '_freq'] = docs_pdf[column].apply(Counter)

    
def get_summary_for_iterable(word, docs_pdf, iterable_col):
    """
    Takes the word, the document dataframe and the name of
    the column holding the iterables as arguments.
    Returns an occurrence summary for the word in this
    iterable column. The iterable columns we can include
    here are the sentences, bigrams and trigrams.

    How a match is decided depends on the items in the column:
      * string items (sentences) are lower-cased and split
        into \\w+ tokens, and the lower-cased word must equal
        one of those tokens. A plain substring test would
        miss capitalised words and would count 'cat' inside
        'concatenate'.
      * any other item (bigram / trigram tuples) matches
        when the word is one of its elements.
    NOTE(review): a stemmed word that is not itself a token of
    the raw sentence will not match that sentence.

    The returned dict has three keys, built from iterable_col:
      "num_occurances_in_<col>_by_doc" - {doc_name: count},
          with 0 for documents without a match
      "total_<col minus last char>_occurances" - sum of the counts
      "<col>_occurances" - list of (doc_name, item) pairs
    """

    lowered_word = word.lower()

    def contains_word(obj):
        if isinstance(obj, str):
            return lowered_word in re.findall(r'\w+', obj.lower())
        return word in obj

    freq_dict = {
        doc: 0 for doc in docs_pdf.doc_name
    }

    # Gather every item the word occurs in and count it for its doc
    occurances = []
    for doc_name, objects in zip(docs_pdf.doc_name, docs_pdf[iterable_col]):
        for obj in objects:
            if contains_word(obj):
                occurances.append((doc_name, obj))
                freq_dict[doc_name] += 1

    # Total number of items the word occurs in across all docs
    total_frequency = sum(freq_dict.values())

    return {
        "num_occurances_in_{}_by_doc".format(iterable_col): freq_dict,
        "total_{}_occurances".format(iterable_col[:-1]): total_frequency,
        iterable_col + "_occurances": occurances
    }


def get_full_word_summary(docs_pdf, word):
    """
    Build the corpus-wide summary for a single word.

    Takes the doc_pdf and the word as arguments and returns
    a dict holding the word itself, its frequency in every
    document whose 'word_frequencies' contains it, the total
    of those frequencies, and the entries returned by
    get_summary_for_iterable() for the 'sentences',
    'bigrams' and 'trigrams' columns.
    """

    word_freq_dict = {}
    for doc_name, frequencies in zip(
        docs_pdf['doc_name'], docs_pdf['word_frequencies']
    ):
        if word in frequencies:
            word_freq_dict[doc_name] = frequencies[word]

    summary = {
        "word": word,
        "num_occurances_by_doc": word_freq_dict,
        "total_occurances": sum(word_freq_dict.values()),
    }

    # Append the sentence, bigram and trigram summaries in that order
    for iterable_col in ('sentences', 'bigrams', 'trigrams'):
        summary.update(
            get_summary_for_iterable(word, docs_pdf, iterable_col)
        )

    return summary


def get_tf_idf_matrix(docs_pdf):
    """
    Takes the doc_pdf as an argument and
    returns the TF-IDF matrix for
    the corpus of documents.

    Each document's 'doc_words' are joined with spaces and
    fed to a default TfidfVectorizer. The result is a
    dataframe with one row per document (in docs_pdf order,
    default integer index) and one column per feature name
    reported by the vectorizer.
    """

    vectorizer = TfidfVectorizer()
    response = vectorizer.fit_transform(
        [
            ' '.join(words) for words in docs_pdf.doc_words
        ]
    )

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; fall back to it only on versions without the replacement
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    tf_idf = pd.DataFrame(response.toarray(), columns=feature_names)
    return tf_idf


def is_word_of_interest(word, pos_type="noun"):
    """
    Takes word and pos_type as arguments.
    Returns True if the part-of-speech tag nltk.pos_tag
    assigns to the word is one of the tags accepted for
    pos_type, otherwise False.

    pos_type must be 'noun', 'adjective' or 'verb'.
    'adjective' and 'verb' share one tag list that covers
    the adjective (JJ*), adverb (RB*) and verb (VB*) tags.
    The word is tagged on its own, without any surrounding
    sentence.

    Raises ValueError for any other pos_type.
    """
    if pos_type == 'noun':
        post_list = ["NN", "NNP", "NNPS", "NST", "NNS"]
    elif pos_type in ["adjective", "verb"]:
        post_list = [
            "JJ", "JJR", "JJS", "RB", "RBR", "RBS",
            "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"
        ]
    else:
        # Without this branch post_list would be unbound below
        raise ValueError(
            "pos_type must be 'noun', 'adjective' or 'verb', got {!r}".format(
                pos_type
            )
        )

    return (
        nltk.pos_tag([word])[0][1] in post_list
    )


def get_summaries_for_unique_words(docs_pdf):
    """
    Generates a summary for all
    unique words in the corpus.

    Collects every distinct word found in the 'doc_words'
    column and returns a dataframe with one row per word,
    each row being the dict built by get_full_word_summary().
    Rows are ordered alphabetically by word.
    """

    unique_words = {word for words in docs_pdf.doc_words for word in words}

    # Iteration order of a set of strings can differ between interpreter
    # runs; sorting keeps the row order of the result reproducible
    word_summaries = [
        get_full_word_summary(docs_pdf=docs_pdf, word=word)
        for word in sorted(unique_words)
    ]
    return pd.DataFrame(word_summaries)


def get_docs_sorted_tf_idf_lookup_dicts(tf_idf, docs_pdf):
    """
    Takes the TF-IDF dataframe and the doc_pdf as arguments.
    Returns a dict keyed by document name whose values are
    dicts of {column name: TF-IDF value} for that document,
    ordered from the highest value to the lowest. Columns
    with equal values keep their order from tf_idf.

    A row of tf_idf is matched to its document name through
    the index label: the label of the row is looked up in
    the index of docs_pdf.doc_name, so both frames must
    share index labels (KeyError otherwise).
    """

    # Built once up front; it does not change while iterating the rows
    doc_name_by_index = dict(docs_pdf.doc_name)

    tf_idf_lookup_dict = {}
    for index, row in tf_idf.iterrows():
        tf_idf_lookup_dict[doc_name_by_index[index]] = dict(
            sorted(dict(row).items(), key=lambda item: item[1], reverse=True)
        )
    return tf_idf_lookup_dict


def generate_top30_interesting_words(doc_name, tf_idf_lookup_dict, pos_type,
                                     docs_pdf=None, top_n=30):
    """
    Generate the top most interesting words of one
    part-of-speech type for a document and write their
    summaries to a csv.

    Our interest metric comes from the TF-IDF matrix values
    and our parts of speech from the NLTK library.

    Arguments:
        doc_name: key of the document in tf_idf_lookup_dict.
        tf_idf_lookup_dict: {doc_name: {word: tf_idf_value}};
            words are visited in the dict's own order, so the
            inner dicts are expected to be sorted from highest
            to lowest value.
        pos_type: passed on to is_word_of_interest().
        docs_pdf: document dataframe handed to
            get_full_word_summary(). When left as None the
            module-level variable named docs_pdf is used, which
            is what this function always read before the
            parameter existed.
        top_n: maximum number of words to keep (default 30).

    Words whose TF-IDF value is not above zero are skipped, as
    a zero weight means the word does not occur in this
    document.

    Writes
    files/outputs/document_specific/<doc_name>_top<top_n>_interesting_<pos_type>s.csv
    and returns whatever DataFrame.to_csv returns for that call.

    Raises ValueError when docs_pdf is None and no module-level
    docs_pdf exists.
    """

    if docs_pdf is None:
        try:
            docs_pdf = globals()["docs_pdf"]
        except KeyError:
            raise ValueError(
                "docs_pdf was not passed and no module-level docs_pdf exists"
            ) from None

    doc_scores = tf_idf_lookup_dict[doc_name]

    interesting_words = []
    for word, score in doc_scores.items():
        if len(interesting_words) >= top_n:
            break
        if score <= 0:
            # Absent from this document; never a candidate
            continue
        if is_word_of_interest(word, pos_type=pos_type):
            interesting_words.append(word)

    return pd.DataFrame(
        [
            {
                "tf_idf_value": doc_scores[word],
                **get_full_word_summary(docs_pdf, word=word)
            } for word in interesting_words
        ]
    ).to_csv(
        "files/outputs/document_specific/{}_top{}_interesting_{}s.csv".format(
            doc_name, top_n, pos_type
        )
    )

In [2]:
setup_script()
# glob returns matches in a file-system dependent order; sorting keeps the
# document order (and so the row order of every output) the same on each run
file_paths = sorted(glob.glob("./files/inputs/*.txt"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Initial cleaning and preprocessing of documents
docs_pdf = extract_documents(file_paths)

split_docs_to_sentences(docs_pdf=docs_pdf)
tokenize_document_words(docs_pdf=docs_pdf)
clean_and_preprocess_docs(docs_pdf=docs_pdf)

# Written before the frequency and n-gram columns are added below
docs_pdf.to_csv("files/outputs/overall_document_summary.csv")

generate_word_summary_data(docs_pdf=docs_pdf)
word_summary_pdf = get_summaries_for_unique_words(docs_pdf=docs_pdf)

word_summary_pdf.to_csv("files/outputs/all_words_summary.csv")

tf_idf = get_tf_idf_matrix(docs_pdf=docs_pdf)

# Save the TF-IDF matrix itself; the word summary already has its own file
tf_idf.to_csv("files/outputs/tf_idf_matrix.csv")

In [4]:
# Per-document lookup of TF-IDF values, built from the matrix and the doc names
tf_idf_lookup_dict = get_docs_sorted_tf_idf_lookup_dicts(
    tf_idf=tf_idf, docs_pdf=docs_pdf
)

In [5]:
# Export the interesting words of every document: one pass over all
# documents for "noun", followed by one pass for "adjective"
for pos_type in ("noun", "adjective"):
    for doc_name in tf_idf_lookup_dict:
        generate_top30_interesting_words(
            doc_name, tf_idf_lookup_dict, pos_type=pos_type
        )