Aditya Prakash V - Exploring spaCy as an Alternative to NLTK

In [2]:
from spacy.lang.en import English
#from spacy.en import English

In [3]:
# Build the English pipeline object used by the cells below.
# NOTE(review): the original comment said this loads the tagger, parser, NER and word
# vectors; per the spaCy docs a bare English() provides only the tokenizer and language
# data (e.g. stop words) — confirm against the installed spaCy version.
nlp = English()
# spaCy ships its English language data with the library, whereas NLTK offers many separate corpora to choose from.

In [4]:
# Sample passage for the tokenisation and stop-word demo.
# NOTE(review): it contains misspellings ("monastry", "relinguish", "cuting", "rihgts",
# "becuase"); the recorded outputs include them, so they are left as-is — confirm they are intentional.
text = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""

In [5]:
# Calling the "nlp" object on a string returns a document; the next cell iterates over it to read its tokens.
my_doc = nlp(text)

In [6]:
# Collect the text of every token in the document, in order
token_list = [tok.text for tok in my_doc]

In [7]:
from spacy.lang.en.stop_words import STOP_WORDS
# NOTE(review): STOP_WORDS is not referenced by any cell visible here; the filtering
# cell below checks lexeme.is_stop instead.

# Accumulator for the tokens that are not stop words (filled by the next cell)
filtered_sentence =[] 

In [8]:
# Keep only the tokens whose vocabulary entry is not flagged as a stop word.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell does not add every word a second time.
filtered_sentence = [word for word in token_list if not nlp.vocab[word].is_stop]

In [9]:
# Show every token, punctuation and newline tokens included
print(token_list) 

['He', 'determined', 'to', 'drop', 'his', 'litigation', 'with', 'the', 'monastry', ',', 'and', 'relinguish', 'his', 'claims', 'to', 'the', 'wood', '-', 'cuting', 'and', '\n', 'fishery', 'rihgts', 'at', 'once', '.', 'He', 'was', 'the', 'more', 'ready', 'to', 'do', 'this', 'becuase', 'the', 'rights', 'had', 'become', 'much', 'less', 'valuable', ',', 'and', 'he', 'had', '\n', 'indeed', 'the', 'vaguest', 'idea', 'where', 'the', 'wood', 'and', 'river', 'in', 'question', 'were', '.']


In [10]:
# Show the tokens that remain after stop-word removal (punctuation and newlines are kept)
print(filtered_sentence)

['determined', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claims', 'wood', '-', 'cuting', '\n', 'fishery', 'rihgts', '.', 'ready', 'becuase', 'rights', 'valuable', ',', '\n', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


Text Normalization using spaCy

In [15]:
import en_core_web_sm

# Switch to the en_core_web_sm model package; the lemmas below come from this pipeline
nlp = en_core_web_sm.load()

doc = nlp(u"""He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were.""")

# Lemma of every token, in document order; pronouns appear as the placeholder '-PRON-'
lemma_word1 = [tok.lemma_ for tok in doc]
lemma_word1

['-PRON-',
 'determine',
 'to',
 'drop',
 '-PRON-',
 'litigation',
 'with',
 'the',
 'monastry',
 ',',
 'and',
 'relinguish',
 '-PRON-',
 'claim',
 'to',
 'the',
 'wood',
 '-',
 'cut',
 'and',
 '\n',
 'fishery',
 'rihgts',
 'at',
 'once',
 '.',
 '-PRON-',
 'be',
 'the',
 'more',
 'ready',
 'to',
 'do',
 'this',
 'becuase',
 'the',
 'right',
 'have',
 'become',
 'much',
 'less',
 'valuable',
 ',',
 'and',
 '-PRON-',
 'have',
 '\n',
 'indeed',
 'the',
 'vague',
 'idea',
 'where',
 'the',
 'wood',
 'and',
 'river',
 'in',
 'question',
 'be',
 '.']

Parts of Sentences Analysis

In [14]:

import en_core_web_sm

# Load the en_core_web_sm pipeline for this section
nlp = en_core_web_sm.load()

# Passage to analyse, run through the pipeline as one document
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Syntax: the text of each noun chunk, and the lemma of each token tagged as a verb
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [tok.lemma_ for tok in doc if tok.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

# Named entities: one line per entity with its text and label
for ent in doc.ents:
    print(ent.text, ent.label_)
# The output below lists the noun phrases, verb lemmas and named entities of the passage.
# The author's view is that this is worlds apart from what NLTK offers — NOTE(review): comparison not verified here.

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun ORG
earlier this week DATE


POS tagging

In [13]:
import spacy

# Load the model by package name through the spacy module imported above, so this cell
# does not depend on `en_core_web_sm` having been imported by an earlier cell.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per token: text, lemma, coarse POS, fine-grained tag, dependency label,
# word shape, is-alphabetic flag, is-stop-word flag.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
# The output below breaks the sentence into its grammatical parts, word by word.
# The author contrasts this with NLTK — NOTE(review): comparison not verified here.

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
