# Language Modeling
* Machine Translation
* Document Summarization
* Topic Modeling
* N-Gram Analysis
    * Document Similarity
* Text Generation

In [None]:
# Document similarity

from collections import OrderedDict
#import textacy
#import spacy

def ngrams(sentence,n):
    sentence = sentence.replace("\n"," ").replace("\r"," ")
    input_list = [elem for elem in sentence.split(" ") if elem != '']
    return list(zip(*[input_list[i:] for i in range(n)]))

def document_similarity(document_a,document_b,max_ngram_size=10):
    """
    Here we compute document similarity by looking at i-grams of up to length max_ngram_size.
    We can use this to inspect data across some further semantic divide.  For example,
    assume we have split ads by phone number and then joined all ads with the same phone number.
    This function could be used to examine the phrase similarity between two given phone numbers
    across many ads.  Or if you like, it could be used to consider all ads pairwise, based on split by
    phone number.
    @document_a - each document is defined above, across some semantic split.  It is of type string
    @document_b - the same as document_a except with some categorical value that is different
    @max_ngram_size - the default size is 10.  This is an integer and states how many grams to split into
    a 2-gram is words of 2 groups, a 3-gram is words of group length 3.
    To see this in action, I recommend running the ngrams function (defined above) with a string, number of grams as input.
    """
    document_a,document_b  = document_a.lower(),document_b.lower()
    ngrams_doc_a = [ngrams(document_a,i) for i in range(1,max_ngram_size)]
    ngrams_doc_b = [ngrams(document_b,i) for i in range(1,max_ngram_size)]
    similarity_scores = []
    for i_gram in range(0,len(ngrams_doc_a)):
        similarity_count = 0
        for elem_a in ngrams_doc_a[i_gram]:
            if elem_a in ngrams_doc_b[i_gram]:
                similarity_count += 1
        similarity_scores.append(similarity_count/float(len(ngrams_doc_a[i_gram])))
    similarity_score_with_n_grams = {}
    for index in range(0,len(ngrams_doc_a)):
        similarity_score_with_n_grams[index+1] = similarity_scores[index]
    return similarity_score_with_n_grams

def phrase_frequency(documents,max_ngram_size=10):
    """
    Here we compute phrase frequency over a set of documents.  This gives us the absolute frequency of how often a phrase was used,
    as well as the relative frequency that the term was used compared to all other terms of the same gram size.
    
    """
    documents = "\n".join([document.lower() for document in documents])
    ngrams_doc = [ngrams(documents,i) for i in range(1,max_ngram_size)]
    ngrams_doc = [elem for elem in ngrams_doc if elem != []] 
    similarity_scores = {}
    [similarity_scores.update({}.fromkeys(phrases,{"absolute frequency":1,"relative frequency":1})) for phrases in ngrams_doc] 
    for i_gram in range(0,len(ngrams_doc)):
        for index,elem in enumerate(ngrams_doc[i_gram]):
            if elem in ngrams_doc[i_gram][:index] or elem in ngrams_doc[i_gram][index+1:]:
                similarity_scores[elem]["absolute frequency"] += 1            
        similarity_scores[elem]["relative frequency"] = similarity_scores[elem]["absolute frequency"]/float(len(ngrams_doc[i_gram]))
    similarity_scores = OrderedDict(similarity_scores)
    return sorted(similarity_scores.items(), key=lambda x: x[1]["relative frequency"], reverse=True)[:10]
 


In [1]:
# Topic Modeling

# https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-topics-extraction-with-nmf-lda-py

# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 1.877s.
Extracting tf-idf features for NMF...
done in 0.445s.
Extracting tf features for LDA...
done in 0.482s.
Fitting the NMF model with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.286s.

Topics in NMF model:
Topic #0:
just people don think like know time good make way really say right ve want did ll new use years
Topic #1:
windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2:
god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3:
thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4:
car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5:
edu soon com send university internet mit ftp mail cc pub article information hope

In [4]:
# Document Summarization

# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


LANGUAGE = "english"
SENTENCES_COUNT = 3


if __name__ == "__main__":
    
    # or for plain text files
    parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

"There are quite a lot of experiments showing that the human sense of smell is pretty similar to what you can find with a rat or a mouse or a dog."
"He was interested in free will and he had this idea that smell was this very animalistic sense and that it compelled animals to have sex and feed," McGann says.
"But, in fact, 400 is an awful lot," argues McGann, "and, quite honestly, there are very few odors that are volatile enough to get into the air that humans can't smell."


In [5]:
# Text Generation

# With RNN: http://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/
# With Markov Chain: https://github.com/codebox/markov-text - basic example
# https://github.com/jsvine/markovify
import markovify

# Get raw text as string.
with open("document.txt") as f:
    text = f.read()

# Build the model.
text_model = markovify.Text(text)

# Print five randomly-generated sentences
for i in range(5):
    print(text_model.make_sentence())

# Print three randomly-generated sentences of no more than 140 characters
for i in range(3):
    print(text_model.make_short_sentence(140))


As scientists in the journal Science.
None
He published a paper about his findings Thursday in the lab, but there's no rule that says humans always have to come out ahead.
But, Horowitz says, if the argument is that humans have about 400 distinct smell receptors in rats.
As scientists in the lab, but there's no rule that says humans always have to come out ahead.
But one scientist argues the idea back to the mid-1800s, and the work of a caress.
He published a paper about his findings Thursday in the lab, but there's no consistent winner.
He published a paper about his findings Thursday in the lab, but there's no rule that says humans always have to come out ahead.


In [None]:
#Machine Translation
#Reference https://github.com/EricSchles/RNN-data-gen/blob/master/fully_connected_dnn_training.py
# The unreasonable effectiveness of RNNs: http://karpathy.github.io/2015/05/21/rnn-effectiveness/


# The general setup for machine translation

Cleaning Data

tokenization -> lemattization -> stemming.

Convert to word vectors

1) create a vocabulary of words

2) unique mapping from each word to an index

3) use the word mapping to convert the input text to word vectors

Defining a model

1) use a word embedding to embed the input word into a 2D array
2) use a decoder (with an encoder) composing of RNNs or LSTMs to make the predictions.

3) apply some fully connected layers on top of the decoder ouput to make the actual prediction

