In [10]:
import spacy
from nltk.tokenize import TweetTokenizer

In [5]:
# Load the English pipeline by its full package name: the bare 'en'
# shortcut link was removed in spaCy v3 and fails to load there.
nlp = spacy.load('en_core_web_sm')
text = 'Mary, don\'t slap the green witch.'
# Lower-case the sentence, tokenize it, and render each token as a plain string.
print([str(token) for token in nlp(text.lower())])

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '.']


In [6]:
# Tokenize a lower-cased tweet with NLTK's TweetTokenizer and print the tokens.
tweet = u'Snow White and the Seven Degrees#MakeAMovieCold@midnight:-)'
tokenizer = TweetTokenizer()
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


# N-gram

In [9]:
def n_grams(text, n):
    '''
    Build every contiguous n-gram of a sequence.

    Args:
        text: a sliceable sequence -- a list of tokens or a plain string.
        n: window size; must be a positive integer.

    Returns:
        A list of the length-n slices of text, ordered by starting
        position. Empty when text holds fewer than n items.

    Raises:
        ValueError: if n is smaller than 1.
    '''
    if n < 1:
        # A non-positive window would silently yield empty or malformed
        # slices instead of n-grams, so reject it up front.
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# Slide a 3-token window over a hand-written token list and print the result.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]


# Lemmas and Stems (Lemmatization)

In [13]:
# Load the English pipeline by its full package name: the bare 'en'
# shortcut link was removed in spaCy v3 and fails to load there.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'he was running late')
# Print each token next to its lemma_ attribute.
for token in doc:
    print('{} --> {}'.format(token, token.lemma_))

he --> -PRON-
was --> be
running --> run
late --> late


# Part-of-Speech Tagging

In [16]:
# Load the English pipeline by its full package name: the bare 'en'
# shortcut link was removed in spaCy v3 and fails to load there.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Mary slapped the green witch.')
# Print each token next to its pos_ tag.
for token in doc:
    print('{} - {}'.format(token, token.pos_))

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


# Noun Phrase (NP) Chunking

In [17]:
# Load the English pipeline by its full package name: the bare 'en'
# shortcut link was removed in spaCy v3 and fails to load there.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Mary slapped the green witch.')
# Print each chunk from doc.noun_chunks next to its label_.
for chunk in doc.noun_chunks:
    print('{} - {}'.format(chunk, chunk.label_))

Mary - NP
the green witch - NP
