In [14]:
import nltk

### Tokenizing

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize


In [16]:
sent = "This is a sentence. This is another sentence. Help!"

In [17]:
toks = sent_tokenize(sent)

In [18]:
toks

['This is a sentence.', 'This is another sentence.', 'Help!']

In [19]:
wtoks = word_tokenize(sent)

In [20]:
wtoks

['This',
 'is',
 'a',
 'sentence',
 '.',
 'This',
 'is',
 'another',
 'sentence',
 '.',
 'Help',
 '!']

### Stripping

#### remove stopwords from a text

In [35]:
text = "This is a very normal sentence but it has a lot of stopwords in it."

In [37]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
### keep the stopwords in a set: the filter below tests membership once per token,
### and a set lookup avoids scanning the whole word list each time
stopwords = set(nltk.corpus.stopwords.words('english'))

In [42]:
toks = word_tokenize(text)

In [45]:
[w for w in toks if w.lower() not in stopwords]

['normal', 'sentence', 'lot', 'stopwords', '.']

### Stemming

In [124]:
stemmer = nltk.PorterStemmer()

In [125]:
text = ["apples", "animals", "animation", "several"]

In [126]:
[stemmer.stem(w) for w in text]

['appl', 'anim', 'anim', 'sever']

### Lemmatization

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
lem = nltk.WordNetLemmatizer()

In [33]:
[lem.lemmatize(w) for w in text]

['apple', 'animal', 'animation', 'several']

### POS Tagging

In [11]:
from nltk.tag import pos_tag

In [12]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
nltk.pos_tag(wtoks)

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('another', 'DT'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('Help', 'NN'),
 ('!', '.')]

### A reusable pre-processing function 

In [40]:
def preprocess(document):
    """Prepare a plain-text document for NLP.

    The document is split into sentences, each sentence is split into
    word tokens, and each token list is POS-tagged.

    Returns a list with one entry per sentence; each entry is the
    pos_tag result (a list of POS-tagged tuples) for that sentence.
    """
    tagged_sentences = []
    for raw_sentence in sent_tokenize(document):
        tokens = word_tokenize(raw_sentence)
        tagged_sentences.append(pos_tag(tokens))
    return tagged_sentences

### Noun-phrase chunking

In [113]:
text = "Software Development is such an important industry it is hard to imagine a world without it. Coal mining is also important, but it is a more dangerous job than software engineering."

In [114]:
post_processed = preprocess(text)

In [115]:
### user-defined regular expression to chunk noun-phrases

grammar = (
    "NP: {"        ### chunk label: every noun phrase we detect is labelled "NP"
    + "<DT>?"      ### an optional determiner (zero or one),
    + "<JJ>*"      ### followed by any number of adjectives (zero or more),
    + "<NN.*>+"    ### followed by one or more nouns (any tag starting with NN)
    + "}"
)
                                        
                                        

#### helpful app to develop grammars

In [116]:
nltk.app.chunkparser()

#### NP chunker

In [117]:
def np_chunker(text, grammar):
    """Chunk every sentence of a pre-processed text with a regexp grammar.

    text    -- iterable of POS-tagged sentences (e.g. the output of preprocess())
    grammar -- chunk grammar string handed to nltk.RegexpParser

    Returns a list with one parse result per sentence, in input order.
    """
    parser = nltk.RegexpParser(grammar)
    # Parse the sentences passed in as `text`. The previous version looped over
    # the notebook-level `post_processed` variable and ignored this argument,
    # so it only worked when that global happened to exist.
    return [parser.parse(sent) for sent in text]

In [118]:
chunked = np_chunker(post_processed, grammar)

In [119]:
chunked

[Tree('S', [Tree('NP', [('Software', 'NNP'), ('Development', 'NNP')]), ('is', 'VBZ'), ('such', 'JJ'), Tree('NP', [('an', 'DT'), ('important', 'JJ'), ('industry', 'NN')]), ('it', 'PRP'), ('is', 'VBZ'), ('hard', 'JJ'), ('to', 'TO'), ('imagine', 'VB'), Tree('NP', [('a', 'DT'), ('world', 'NN')]), ('without', 'IN'), ('it', 'PRP'), ('.', '.')]),
 Tree('S', [Tree('NP', [('Coal', 'NN'), ('mining', 'NN')]), ('is', 'VBZ'), ('also', 'RB'), ('important', 'JJ'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('more', 'RBR'), Tree('NP', [('dangerous', 'JJ'), ('job', 'NN')]), ('than', 'IN'), Tree('NP', [('software', 'NN'), ('engineering', 'NN')]), ('.', '.')])]

In [120]:
### now let's go through this result and print out only the noun phrase parts

for parsed_sentence in chunked:
    # keep only the subtrees labelled 'NP'; plain (word, tag) tuples are skipped
    noun_phrases = [
        node for node in parsed_sentence
        if isinstance(node, nltk.tree.Tree) and node.label() == 'NP'
    ]
    for phrase in noun_phrases:
        print(phrase)
        

(NP Software/NNP Development/NNP)
(NP an/DT important/JJ industry/NN)
(NP a/DT world/NN)
(NP Coal/NN mining/NN)
(NP dangerous/JJ job/NN)
(NP software/NN engineering/NN)


### Named entity recognition

In [165]:
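### In fact, all the hard work has already been done for us by the standard ne_chunk function built into nltk.
### NOTE: ne_chunk likely needs extra resources that are not downloaded earlier in this notebook
### (e.g. nltk.download('maxent_ne_chunker') and nltk.download('words')) -- confirm for your nltk version.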

In [166]:
text = "The Washington Monument is the most prominent structure in Washington, D.C. and one of the city's early attractions. It was built in honor of George Washington, who led the country to independence and then became its first President."

In [167]:
pp_text = preprocess(text)

In [168]:
pp_text

[[('The', 'DT'),
  ('Washington', 'NNP'),
  ('Monument', 'NNP'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('most', 'RBS'),
  ('prominent', 'JJ'),
  ('structure', 'NN'),
  ('in', 'IN'),
  ('Washington', 'NNP'),
  (',', ','),
  ('D.C.', 'NNP'),
  ('and', 'CC'),
  ('one', 'CD'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('city', 'NN'),
  ("'s", 'POS'),
  ('early', 'JJ'),
  ('attractions', 'NNS'),
  ('.', '.')],
 [('It', 'PRP'),
  ('was', 'VBD'),
  ('built', 'VBN'),
  ('in', 'IN'),
  ('honor', 'NN'),
  ('of', 'IN'),
  ('George', 'NNP'),
  ('Washington', 'NNP'),
  (',', ','),
  ('who', 'WP'),
  ('led', 'VBD'),
  ('the', 'DT'),
  ('country', 'NN'),
  ('to', 'TO'),
  ('independence', 'VB'),
  ('and', 'CC'),
  ('then', 'RB'),
  ('became', 'VBD'),
  ('its', 'PRP$'),
  ('first', 'JJ'),
  ('President', 'NNP'),
  ('.', '.')]]

In [172]:
entities = nltk.ne_chunk(pp_text[1])

In [177]:
print (entities)

(S
  It/PRP
  was/VBD
  built/VBN
  in/IN
  honor/NN
  of/IN
  (PERSON George/NNP Washington/NNP)
  ,/,
  who/WP
  led/VBD
  the/DT
  country/NN
  to/TO
  independence/VB
  and/CC
  then/RB
  became/VBD
  its/PRP$
  first/JJ
  President/NNP
  ./.)


In [170]:
print(repr(entities))    ### This is a workaround if you do not have ghostscript installed and gswin64c.exe added to the PATH

Tree('S', [('The', 'DT'), Tree('ORGANIZATION', [('Washington', 'NNP'), ('Monument', 'NNP')]), ('is', 'VBZ'), ('the', 'DT'), ('most', 'RBS'), ('prominent', 'JJ'), ('structure', 'NN'), ('in', 'IN'), Tree('GPE', [('Washington', 'NNP')]), (',', ','), Tree('GPE', [('D.C.', 'NNP')]), ('and', 'CC'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('city', 'NN'), ("'s", 'POS'), ('early', 'JJ'), ('attractions', 'NNS'), ('.', '.')])
