In [14]:
import nltk

### Tokenizing

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize


In [16]:
sent = "This is a sentence. This is another sentence. Help!"

In [17]:
toks = sent_tokenize(sent)

In [18]:
toks

['This is a sentence.', 'This is another sentence.', 'Help!']

In [19]:
wtoks = word_tokenize(sent)

In [20]:
wtoks

['This',
 'is',
 'a',
 'sentence',
 '.',
 'This',
 'is',
 'another',
 'sentence',
 '.',
 'Help',
 '!']

### Stripping

#### Remove stopwords from a text

In [35]:
text = "This is a very normal sentence but it has a lot of stopwords in it."

In [37]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
### Keep the English stopwords in a set: the filtering step tests membership
### once per token, which is a constant-time lookup on a set but a scan on a list.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [42]:
toks = word_tokenize(text)

In [45]:
[w for w in toks if w.lower() not in stopwords]

['normal', 'sentence', 'lot', 'stopwords', '.']

### Stemming

In [22]:
stemmer = nltk.PorterStemmer()

In [23]:
text = ["apples", "animals", "animation", "several"]

In [26]:
[stemmer.stem(w) for w in text]

['appl', 'anim', 'anim', 'sever']

### Lemmatization

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
lem = nltk.WordNetLemmatizer()

In [33]:
[lem.lemmatize(w) for w in text]

['apple', 'animal', 'animation', 'several']

### POS Tagging

In [11]:
from nltk.tag import pos_tag

In [12]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
nltk.pos_tag(wtoks)

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('another', 'DT'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('Help', 'NN'),
 ('!', '.')]

### A reusable pre-processing function 

In [40]:
def preprocess(document):
    """Turn a plain-text document into POS-tagged sentences.

    Args:
        document: plain text, split into sentences with sent_tokenize.

    Returns:
        A list with one entry per sentence; each entry is the result of
        pos_tag applied to that sentence's word tokens, i.e. a list of
        (word, tag) tuples.
    """
    return [
        pos_tag(word_tokenize(sentence))
        for sentence in sent_tokenize(document)
    ]

### Noun-phrase chunking

In [64]:
text = "Software Development is such an important industry it is hard to imagine a world without it."

In [65]:
post_processed = preprocess(text)

In [97]:
### Chunk grammar for noun phrases (NP), written as a sequence of tag patterns.
### The pieces are joined into a single rule string: "NP: {...}".
grammar = "NP: {" + "".join([
    "<DT>?",      ### an optional determiner,
    "<JJ>*",      ### then zero or more adjectives,
    "<NN.*>+",    ### then one or more nouns of any NN* tag
]) + "}"
                                        
                                        

#### Helpful app for developing grammars

In [102]:
### Start NLTK's chunk-parser app, used here to experiment with chunk grammars.
### NOTE(review): presumably this opens an interactive window and blocks the cell
### until it is closed — confirm, and skip this cell for non-interactive "Run All".
nltk.app.chunkparser()

#### NP chunker

In [105]:
def np_chunker(text, grammar):
    """Chunk every sentence of a pre-processed text using a regexp grammar.

    Args:
        text: iterable of POS-tagged sentences, each a list of (word, tag)
            tuples, e.g. the value returned by preprocess().
        grammar: chunk grammar string passed to nltk.RegexpParser.

    Returns:
        A list holding the parse result for each sentence, in input order.
    """
    parser = nltk.RegexpParser(grammar)
    # Iterate over the `text` argument; the previous version looped over the
    # notebook-level `post_processed` variable and silently ignored its input.
    return [parser.parse(tagged_sentence) for tagged_sentence in text]

In [106]:
np_chunker(post_processed, grammar)

[Tree('S', [Tree('NP', [('Software', 'NNP'), ('Development', 'NNP')]), ('is', 'VBZ'), ('such', 'JJ'), Tree('NP', [('an', 'DT'), ('important', 'JJ'), ('industry', 'NN')]), ('it', 'PRP'), ('is', 'VBZ'), ('hard', 'JJ'), ('to', 'TO'), ('imagine', 'VB'), Tree('NP', [('a', 'DT'), ('world', 'NN')]), ('without', 'IN'), ('it', 'PRP'), ('.', '.')])]