#Getting meaning from a text

###Using TextBlob, NLTK and Gensim

In [2]:
import string

from nltk.stem.wordnet import WordNetLemmatizer

from gensim import models
from gensim.corpora import Dictionary

In [3]:
## Example text:
zen = ["Beautiful is better than ugly. Explicit is better than implicit.", 
        "Simple is better than complex. Complex is better than complicated.",
        "Flat is better than nested. Sparse is better than dense.",
        "Readability counts. Special cases aren't special enough to break the rules.",
        "Although practicality beats purity. Errors should never pass silently.",
        "Unless explicitly silenced. In the face of ambiguity, refuse the temptation to guess." ,
        "There should be one-- and preferably only one --obvious way to do it.",
        "Although that way may not be obvious at first unless you're Dutch.",
        "Now is better than never. Although never is often better than *right* now.",
        "If the implementation is hard to explain, it's a bad idea.",
        "If the implementation is easy to explain, it may be a good idea."
        "Namespaces are one honking great idea -- let's do more of those!"]

In [4]:
## TOKENISATION: Breaking down a text in meaningful elements
texts = [text.lower().replace('\n', ' ').split(' ') for text in zen]

stop_words = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for'
             'from', 'has', 'he', 'if', 'in', 'is', 'it', 'its', 'it\'s', 'of', 'on', 
             'than', 'that', 'the', 'to', 'was', 'were', 'will', 'with']

docs = [[filter(lambda x:x not in string.punctuation, i) for i in txt if i != '' and i not in stop_words] 
        for txt in texts]
print docs

[['beautiful', 'better', 'ugly', 'explicit', 'better', 'implicit'], ['simple', 'better', 'complex', 'complex', 'better', 'complicated'], ['flat', 'better', 'nested', 'sparse', 'better', 'dense'], ['readability', 'counts', 'special', 'cases', 'arent', 'special', 'enough', 'break', 'rules'], ['although', 'practicality', 'beats', 'purity', 'errors', 'should', 'never', 'pass', 'silently'], ['unless', 'explicitly', 'silenced', 'face', 'ambiguity', 'refuse', 'temptation', 'guess'], ['there', 'should', 'one', 'preferably', 'only', 'one', 'obvious', 'way', 'do', 'it'], ['although', 'way', 'may', 'not', 'obvious', 'first', 'unless', 'youre', 'dutch'], ['now', 'better', 'never', 'although', 'never', 'often', 'better', 'right', 'now'], ['implementation', 'hard', 'explain', 'bad', 'idea'], ['implementation', 'easy', 'explain', 'may', 'good', 'ideanamespaces', 'one', 'honking', 'great', 'idea', '', 'lets', 'do', 'more', 'those']]


In [5]:
## LEMMATISATION:
## Grouping together the different forms of a word
lmtzr = WordNetLemmatizer()
lemm = [[lmtzr.lemmatize(word) for word in data] for data in docs]
print lemm

[['beautiful', 'better', 'ugly', 'explicit', 'better', 'implicit'], ['simple', 'better', 'complex', 'complex', 'better', 'complicated'], ['flat', 'better', 'nested', 'sparse', 'better', 'dense'], ['readability', u'count', 'special', u'case', 'arent', 'special', 'enough', 'break', u'rule'], ['although', 'practicality', u'beat', 'purity', u'error', 'should', 'never', u'pas', 'silently'], ['unless', 'explicitly', 'silenced', 'face', 'ambiguity', 'refuse', 'temptation', 'guess'], ['there', 'should', 'one', 'preferably', 'only', 'one', 'obvious', 'way', 'do', 'it'], ['although', 'way', 'may', 'not', 'obvious', 'first', 'unless', 'youre', 'dutch'], ['now', 'better', 'never', 'although', 'never', 'often', 'better', 'right', 'now'], ['implementation', 'hard', 'explain', 'bad', 'idea'], ['implementation', 'easy', 'explain', 'may', 'good', 'ideanamespaces', 'one', 'honking', 'great', 'idea', '', u'let', 'do', 'more', 'those']]


In [10]:
## Create bag of words from dictionnary:
####note: compare doc2bow and word2vec
dictionary = Dictionary(lemm)
dictionary.save('text.dict')

## Term frequency–inverse document frequency (TF-IDF)
## Method to reflect how important a word is to a document in a collection.
## The inverse document frequency measures whether the term is common or rare across all documents.

bow = [dictionary.doc2bow(l) for l in lemm] # Calculates inverse document counts for all terms
print "BAG OF WORDS: Assign a frequency to a word index \n", bow

# Transform the count representation into the Tfidf space
tfidf = models.TfidfModel(bow)              
corpus_tfidf = tfidf[bow]
print "\nTF-IDF: value associated with the importance of word in a document\n"
for doc in corpus_tfidf:
    print doc

BAG OF WORDS: Assign a frequency to a word index 
[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1)], [(1, 2), (5, 2), (6, 1), (7, 1)], [(1, 2), (8, 1), (9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2)], [(20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)], [(29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1)], [(27, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1)], [(20, 1), (36, 1), (39, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1)], [(1, 2), (20, 1), (23, 2), (50, 2), (51, 1), (52, 1)], [(53, 1), (54, 1), (55, 1), (56, 1), (57, 1)], [(37, 1), (40, 1), (47, 1), (54, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)]]

TF-IDF: value associated with the importance of word in a document

[(0, 0.46068284809775906), (1, 0.38869686630348355), (2, 0.46068284809775906), (3, 0.46068284809775906), (4, 0.4606828480977

In [14]:
## Build the LSI (latent semantic indexing) model
## Method to uses a mathematical technique called singular value decomposition (SVD) 
## to identify patterns in the relationships between the terms and concepts contained 
## in an unstructured collection of text.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=3)
corpus_lsi = lsi[corpus_tfidf]

for doc in corpus_lsi:
    print(doc)

[(0, 0.54498357184461965), (1, -0.2126075637752336), (2, 0.20331906793399038)]
[(0, 0.49872076945560406), (1, -0.19698495321056153), (2, 0.200134661406946)]
[(0, 0.54498357184462021), (1, -0.21260756377523396), (2, 0.20331906793399096)]
[]
[(0, 0.29788909810577829), (1, 0.11063707308686081), (2, -0.49983116799619542)]
[(0, 0.037992769637636571), (1, 0.088009881979444121), (2, -0.21744299165958814)]
[(0, 0.19434230433693411), (1, 0.52719369649662684), (2, -0.29033187162354179)]
[(0, 0.19783074635880496), (1, 0.38294184000348508), (2, -0.44863074249669871)]
[(0, 0.57585999369002672), (1, -0.09944815362681389), (2, -0.23138923147744478)]
[(0, 0.10520154691688784), (1, 0.50564529531779667), (2, 0.52535252894323625)]
[(0, 0.17251625291638517), (1, 0.69288699106334739), (2, 0.34135753883295711)]


In [15]:
for i in range(lsi.num_topics):
    print lsi.show_topic(i)

[(0.5409971672782129, u'better'), (0.27853037146820514, u'complex'), (0.27731795093733874, u'now'), (0.2538440596703842, u'never'), (0.18163236900752422, u'beautiful'), (0.18163236900752411, u'dense'), (0.18163236900752402, u'flat'), (0.18163236900752402, u'nested'), (0.18163236900752394, u'ugly'), (0.18163236900752391, u'sparse')]
[(0.30794775806820196, u'one'), (0.25292223047307438, u'explain'), (0.25292223047307438, u'implementation'), (0.25292223047307438, u'idea'), (0.20778805050884408, u'do'), (0.20437009294542946, u'bad'), (0.2043700929454294, u'hard'), (-0.19505419477988611, u'better'), (0.18973512044056642, u'may'), (0.18226648505043833, u'way')]
[(0.24330841921959795, u'bad'), (0.24330841921959795, u'hard'), (0.23373524267885862, u'implementation'), (0.23373524267885862, u'explain'), (0.23373524267885862, u'idea'), (-0.20928347389567462, u'never'), (-0.20727316973039475, u'although'), (-0.17738986606596757, u'should'), (-0.17342762797286926, u'way'), (-0.17342762797286926, u'

In [18]:
list_topics = [] 
for i in range(lsi.num_topics):
    list_topics.extend(lsi.show_topic(i))

list_topics.sort(key=lambda tup: tup[0], reverse=True)

topics = [i[1] for i in list_topics]
print topics

[u'better', u'one', u'complex', u'now', u'never', u'explain', u'implementation', u'idea', u'bad', u'hard', u'implementation', u'explain', u'idea', u'do', u'bad', u'hard', u'may', u'way', u'beautiful', u'dense', u'flat', u'nested', u'ugly', u'sparse', u'way', u'obvious', u'should', u'better', u'although', u'never']
