# Text Analytics - Discovery (EDA)

In [1]:
import nltk

#These only need to be done once...
#nltk.download('punkt')                      
#nltk.download('averaged_perceptron_tagger') 


In [2]:
# Record the NLTK version this notebook was run with.
print(nltk.__version__)

3.4.4


## Tokenize into sentences and words

In [3]:
# The whole corpus for this notebook is one multi-line string.

text = u"""
On this day, we gather because we have chosen hope over fear, unity of purpose over conflict and discord. 
On this day, we come to proclaim an end to the petty grievances and false promises, 
the recriminations and worn-out dogmas that for far too long have strangled our politics. We remain a young 
nation. But in the words of Scripture, the time has come to set aside childish things. The time has 
come to reaffirm our enduring spirit; to choose our better history; to carry forward that precious gift, 
that noble idea passed on from generation to generation: the God-given promise that all are equal, all 
are free, and all deserve a chance to pursue their full measure of happiness.
"""

# Step 1: split the raw text into sentences.
sentnc = nltk.sent_tokenize(text)

# Step 2: split each sentence into tokens, giving one token list per sentence.
wrds = list(map(nltk.word_tokenize, sentnc))

# Summarise: number of sentences and total number of tokens across all of them.
sentence_total = len(sentnc)
word_total = sum(map(len, wrds))
print("Sentences: {:>5} \n    Words: {:>5}".format(sentence_total, word_total))

Sentences:     5 
    Words:   136


In [4]:
# TODO: don't forget to ask... should we clean these before we use them?
# (Inspect the third sentence as produced by sent_tokenize, before any cleaning.)
sentnc[2]

'We remain a young \nnation.'

In [18]:
# Token list for the same sentence (index 2), for comparison with sentnc[2].
wrds[2]

['We', 'remain', 'a', 'young', 'nation', '.']

## Part of Speech (POS) Tagging

In [5]:
# Tag every tokenised sentence; each sentence becomes a list of (token, POS tag) pairs.

tagged_wt = list(map(nltk.pos_tag, wrds))     # input is the per-sentence word lists

## e.g. [[('On', 'IN'), ('this', 'DT'), ('day', 'NN'), (',', ','), ('we', 'PRP') ...

# Drop the tokens and keep only the tags, one list of tags per sentence.
patternPOS = [[pos for _, pos in sentence] for sentence in tagged_wt]


print(patternPOS)


[['IN', 'DT', 'NN', ',', 'PRP', 'VBP', 'IN', 'PRP', 'VBP', 'VBN', 'NN', 'IN', 'NN', ',', 'NN', 'IN', 'NN', 'IN', 'NN', 'CC', 'NN', '.'], ['IN', 'DT', 'NN', ',', 'PRP', 'VBP', 'TO', 'VB', 'DT', 'NN', 'TO', 'DT', 'JJ', 'NNS', 'CC', 'JJ', 'NNS', ',', 'DT', 'NNS', 'CC', 'NN', 'NN', 'IN', 'IN', 'RB', 'RB', 'RB', 'VBP', 'VBN', 'PRP$', 'NNS', '.'], ['PRP', 'VBP', 'DT', 'JJ', 'NN', '.'], ['CC', 'IN', 'DT', 'NNS', 'IN', 'NNP', ',', 'DT', 'NN', 'VBZ', 'VBN', 'TO', 'VB', 'RB', 'JJ', 'NNS', '.'], ['DT', 'NN', 'VBZ', 'VBN', 'TO', 'VB', 'PRP$', 'VBG', 'NN', ':', 'TO', 'VB', 'PRP$', 'JJR', 'NN', ':', 'TO', 'VB', 'RB', 'IN', 'JJ', 'NN', ',', 'IN', 'JJ', 'NN', 'VBN', 'IN', 'IN', 'NN', 'TO', 'NN', ':', 'DT', 'NNP', 'NN', 'IN', 'DT', 'VBP', 'JJ', ',', 'DT', 'VBP', 'JJ', ',', 'CC', 'DT', 'VBP', 'DT', 'NN', 'TO', 'VB', 'PRP$', 'JJ', 'NN', 'IN', 'NN', '.']]


## Extracting Nouns or noun like words

In [6]:
# For each tagged sentence, keep the tokens whose tag is one of the four noun tags
# (NN, NNS, NNP, NNPS). The result has one list of nouns per sentence.
nouns = [
    [word for word, pos in sentence if pos in ('NN', 'NNS', 'NNP', 'NNPS')]
    for sentence in tagged_wt
]

# Nouns found in the first sentence.
nouns[0]

['day', 'hope', 'fear', 'unity', 'purpose', 'conflict', 'discord']

In [7]:
# For each tagged sentence, keep the tokens carrying any of the six verb tags
# (VB, VBD, VBG, VBN, VBP, VBZ). The result has one list of verbs per sentence.
verbs = [
    [word for word, pos in sentence if pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
    for sentence in tagged_wt
]

# Verbs found in the first two sentences.
verbs[0:2]

[['gather', 'have', 'chosen'], ['come', 'proclaim', 'have', 'strangled']]

In [8]:
# Flatten the per-sentence lists into overall totals before reporting.
noun_total = sum(map(len, nouns))
verb_total = sum(map(len, verbs))
print("There are {} nouns and {} verbs in the text.".format(noun_total, verb_total))

There are 32 nouns and 22 verbs in the text.


# spaCy

##### More feature extraction and explorations

spacy.__en_core_web_lg__

English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities. (https://spacy.io/models)



In [9]:
#These need to be downloaded before we can use them. 
#!python -m spacy download en_pytt_xlnetbasecased_lg  
#!python -m spacy download en_core_web_lg

In [10]:
# One-time shell setup to get the English language models:
#   python -m spacy download en
import spacy

# Record the spaCy version this notebook was run with.
print(spacy.__version__)

2.2.1


In [11]:
import spacy

# `nlp` is the loaded model; the "_lg" (large) package is the one with word embeddings.
nlp = spacy.load('en_core_web_lg')

# Run the model over the speech excerpt and echo the text it was given.
doc = nlp(text)
print(doc.text)

# One line per token: its text, part-of-speech tag, and dependency label.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)


On this day, we gather because we have chosen hope over fear, unity of purpose over conflict and discord. 
On this day, we come to proclaim an end to the petty grievances and false promises, 
the recriminations and worn-out dogmas that for far too long have strangled our politics. We remain a young 
nation. But in the words of Scripture, the time has come to set aside childish things. The time has 
come to reaffirm our enduring spirit; to choose our better history; to carry forward that precious gift, 
that noble idea passed on from generation to generation: the God-given promise that all are equal, all 
are free, and all deserve a chance to pursue their full measure of happiness.


 SPACE 
On ADP prep
this DET det
day NOUN pobj
, PUNCT punct
we PRON nsubj
gather VERB ROOT
because SCONJ mark
we PRON nsubj
have AUX aux
chosen VERB advcl
hope NOUN dobj
over ADP prep
fear NOUN pobj
, PUNCT punct
unity NOUN conj
of ADP prep
purpose NOUN pobj
over ADP prep
conflict NOUN pobj
and CCONJ cc
d

In [12]:
# Ask spaCy for the plain-English meaning of the PROPN tag.
spacy.explain("PROPN")

'proper noun'

In [13]:
# ADP is the tag printed for "On", "over" and "of" in the token dump; look up what it means.
spacy.explain("ADP")

'adposition'

In [14]:
##  https://glossary.sil.org/term/adposition  

In [15]:
from collections import Counter
from tabulate import tabulate

# Count nouns by lemma: token.lemma_ is the base form of the token,
# with no inflectional suffixes.
noun_lemmas = (tok.lemma_ for tok in doc if tok.pos_ == 'NOUN')
noun_counter = Counter(noun_lemmas)

# Show the five most frequent noun lemmas as a small table.
top_nouns = noun_counter.most_common(5)
print(tabulate(top_nouns, headers=['Noun', 'Count']))

Noun          Count
----------  -------
day               2
promise           2
time              2
generation        2
hope              1


In [16]:
# Full list of (lemma, count) pairs, not just the top five.
print(noun_counter.most_common())

[('day', 2), ('promise', 2), ('time', 2), ('generation', 2), ('hope', 1), ('fear', 1), ('unity', 1), ('purpose', 1), ('conflict', 1), ('discord', 1), ('end', 1), ('grievance', 1), ('recrimination', 1), ('dogma', 1), ('politic', 1), ('nation', 1), ('word', 1), ('scripture', 1), ('thing', 1), ('spirit', 1), ('history', 1), ('gift', 1), ('idea', 1), ('chance', 1), ('measure', 1), ('happiness', 1)]


## Named Entity Recognition (NER)

In [17]:
# Parse a short example sentence for the NER demo.
# A dedicated name is used on purpose: rebinding `doc` here would replace the
# parsed speech that the noun-count cells read, so re-running them after this
# cell would give different results.
ner_doc = nlp(u"My name is Srinivas and I live in India.")

# Collect (entity text, entity label) pairs as a list rather than a generator,
# so the result can be inspected or tabulated again after the first use.
entity_types = [(ent.text, ent.label_) for ent in ner_doc.ents]

print(tabulate(entity_types, headers=['Entity', 'Entity Type']))

Entity    Entity Type
--------  -------------
Srinivas  PERSON
India     GPE


NLTK POS Tagger meanings 
- __CC__ coordinating conjunction

- __CD__ cardinal number

- __DT__ determiner

- __EX__ existential there (like: “there is” … think of it like “there exists”)

- __FW__ foreign word

- __IN__ preposition/subordinating conjunction

- __JJ__ adjective ‘big’

- __JJR__ adjective, comparative ‘bigger’

- __JJS__ adjective, superlative ‘biggest’

- __LS__ list marker 1)

- __MD__ modal could, will

- __NN__ noun, singular ‘desk’

- __NNS__ noun plural ‘desks’

- __NNP__ proper noun, singular ‘Harrison’

- __NNPS__ proper noun, plural ‘Americans’

- __PDT__ predeterminer ‘all the kids’

- __POS__ possessive ending parent‘s

- __PRP__ personal pronoun I, he, she

- __PRP$__ possessive pronoun my, his, hers

- __RB__ adverb very, silently,

- __RBR__ adverb, comparative better

- __RBS__ adverb, superlative best

- __RP__ particle give up

- __TO__ to go ‘to‘ the store.

- __UH__ interjection errrrrrrrm

- __VB__ verb, base form take

- __VBD__ verb, past tense took

- __VBG__ verb, gerund/present participle taking

- __VBN__ verb, past participle taken

- __VBP__ verb, non-3rd person singular present take

- __VBZ__ verb, 3rd person sing. present takes

- __WDT__ wh-determiner which

- __WP__ wh-pronoun who, what

- __WP$__ possessive wh-pronoun whose

- __WRB__ wh-adverb where, when

__References:__

https://www.nltk.org/book/

http://www.nltk.org/howto/

https://www.geeksforgeeks.org/part-speech-tagging-stop-words-using-nltk-python/  

https://spacy.io/api

Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.

Tukey, John (1977), Exploratory Data Analysis, Addison-Wesley.

Brown, D. W. (2016) Corpus of Presidential Speeches. Retrieved from http://www.thegrammarlab.com
