# 📚 Import Libraries

In [1]:
import spacy

# 🌱 NLP spaCy Instance

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
type(nlp)

spacy.lang.en.English

In [4]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


# ⚙️ Processing a Text

In [10]:
text = (
    """
    When Sebastian Thrun started working on self-driving cars at
    Google in 2007, few people outside of the company took him
    seriously. “I can tell you very senior CEOs of major American
    car companies would shake my hand and turn away because I wasn't
    worth talking to,” said Thrun, in an interview with Recode earlier
    this week.
    """
)

In [11]:
doc = nlp(text)

In [14]:
type(doc)

spacy.tokens.doc.Doc

In [30]:
len(doc)

73

In [13]:
len(doc.vocab)

817

In [31]:
doc[2:5]

Sebastian Thrun started

# 🎫 Tokens

## Text

In [32]:
print("Text:", [token.text for token in doc])

Text: ['\n    ', 'When', 'Sebastian', 'Thrun', 'started', 'working', 'on', 'self', '-', 'driving', 'cars', 'at', '\n    ', 'Google', 'in', '2007', ',', 'few', 'people', 'outside', 'of', 'the', 'company', 'took', 'him', '\n    ', 'seriously', '.', '“', 'I', 'can', 'tell', 'you', 'very', 'senior', 'CEOs', 'of', 'major', 'American', '\n    ', 'car', 'companies', 'would', 'shake', 'my', 'hand', 'and', 'turn', 'away', 'because', 'I', 'was', "n't", '\n    ', 'worth', 'talking', 'to', ',', '”', 'said', 'Thrun', ',', 'in', 'an', 'interview', 'with', 'Recode', 'earlier', '\n    ', 'this', 'week', '.', '\n    ']


## Stop Words


In [52]:
print("Stop Words:", [token.text for token in doc if token.is_stop])

Stop Words: ['When', 'on', 'at', 'in', 'few', 'of', 'the', 'him', 'I', 'can', 'you', 'very', 'of', 'would', 'my', 'and', 'because', 'I', 'was', "n't", 'to', 'in', 'an', 'with', 'this']


In [51]:
nlp.Defaults.stop_words.add("btw")
nlp.vocab["btw"].is_stop = True

In [56]:
nlp.vocab["btw"].is_stop

True

## Alphanumeric

In [34]:
print("Alpha Numeric: ", [token.text for token in doc if token.is_alpha])

Alpha Numeric:  ['When', 'Sebastian', 'Thrun', 'started', 'working', 'on', 'self', 'driving', 'cars', 'at', 'Google', 'in', 'few', 'people', 'outside', 'of', 'the', 'company', 'took', 'him', 'seriously', 'I', 'can', 'tell', 'you', 'very', 'senior', 'CEOs', 'of', 'major', 'American', 'car', 'companies', 'would', 'shake', 'my', 'hand', 'and', 'turn', 'away', 'because', 'I', 'was', 'worth', 'talking', 'to', 'said', 'Thrun', 'in', 'an', 'interview', 'with', 'Recode', 'earlier', 'this', 'week']


## Capitalization

In [35]:
print("Capitalized: ", [token.text for token in doc if token.is_title])

Capitalized:  ['When', 'Sebastian', 'Thrun', 'Google', 'I', 'American', 'I', 'Thrun', 'Recode']


## Ponctuation

In [37]:
print("Pontuation: ", [token.text for token in doc if token.is_punct])

Pontuation:  ['-', ',', '.', '“', ',', '”', ',', '.']


## Numerical

In [38]:
print("Numerical: ", [token.text for token in doc if token.like_num])

Numerical:  ['2007']


## Sentence

In [39]:
print("Sentences: ", [sent.text for sent in doc.sents])

Sentences:  ['\n    ', 'When Sebastian Thrun started working on self-driving cars at\n    Google in 2007, few people outside of the company took him\n    seriously.', "“I can tell you very senior CEOs of major American\n    car companies would shake my hand and turn away because I wasn't\n    worth talking to,” said Thrun, in an interview with Recode earlier\n    this week.\n    "]


## Shape

In [41]:
print("Shape: ", [token.shape_ for token in doc])

Shape:  ['\n    ', 'Xxxx', 'Xxxxx', 'Xxxxx', 'xxxx', 'xxxx', 'xx', 'xxxx', '-', 'xxxx', 'xxxx', 'xx', '\n    ', 'Xxxxx', 'xx', 'dddd', ',', 'xxx', 'xxxx', 'xxxx', 'xx', 'xxx', 'xxxx', 'xxxx', 'xxx', '\n    ', 'xxxx', '.', '“', 'X', 'xxx', 'xxxx', 'xxx', 'xxxx', 'xxxx', 'XXXx', 'xx', 'xxxx', 'Xxxxx', '\n    ', 'xxx', 'xxxx', 'xxxx', 'xxxx', 'xx', 'xxxx', 'xxx', 'xxxx', 'xxxx', 'xxxx', 'X', 'xxx', "x'x", '\n    ', 'xxxx', 'xxxx', 'xx', ',', '”', 'xxxx', 'Xxxxx', ',', 'xx', 'xx', 'xxxx', 'xxxx', 'Xxxxx', 'xxxx', '\n    ', 'xxxx', 'xxxx', '.', '\n    ']


## Part of Speech (POS)

In [42]:
print("POS: ", [token.pos_ for token in doc])

POS:  ['SPACE', 'SCONJ', 'PROPN', 'PROPN', 'VERB', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'VERB', 'NOUN', 'ADP', 'SPACE', 'PROPN', 'ADP', 'NUM', 'PUNCT', 'ADJ', 'NOUN', 'ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'PRON', 'SPACE', 'ADV', 'PUNCT', 'PUNCT', 'PRON', 'AUX', 'VERB', 'PRON', 'ADV', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'ADJ', 'SPACE', 'NOUN', 'NOUN', 'AUX', 'VERB', 'PRON', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'SCONJ', 'PRON', 'AUX', 'PART', 'SPACE', 'ADJ', 'VERB', 'ADP', 'PUNCT', 'PUNCT', 'VERB', 'PROPN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'ADV', 'SPACE', 'DET', 'NOUN', 'PUNCT', 'SPACE']


## Dependency Parsing

In [43]:
print("Dependency: ", [token.dep_ for token in doc])

Dependency:  ['dep', 'advmod', 'compound', 'nsubj', 'advcl', 'xcomp', 'prep', 'npadvmod', 'punct', 'amod', 'pobj', 'prep', 'dep', 'pobj', 'prep', 'pobj', 'punct', 'amod', 'nsubj', 'advmod', 'prep', 'det', 'pobj', 'ROOT', 'dobj', 'dep', 'advmod', 'punct', 'punct', 'nsubj', 'aux', 'ccomp', 'dative', 'advmod', 'amod', 'nsubj', 'prep', 'amod', 'amod', 'dep', 'compound', 'pobj', 'aux', 'ccomp', 'poss', 'dobj', 'cc', 'conj', 'advmod', 'mark', 'nsubj', 'advcl', 'neg', 'dep', 'acomp', 'xcomp', 'prep', 'punct', 'punct', 'ROOT', 'nsubj', 'punct', 'prep', 'det', 'pobj', 'prep', 'pobj', 'advmod', 'dep', 'det', 'npadvmod', 'punct', 'dep']


## Lemmatization

In [45]:
print("Lemma: ", [token.lemma_ for token in doc])

Lemma:  ['\n    ', 'when', 'Sebastian', 'Thrun', 'start', 'work', 'on', 'self', '-', 'drive', 'car', 'at', '\n    ', 'Google', 'in', '2007', ',', 'few', 'people', 'outside', 'of', 'the', 'company', 'take', 'he', '\n    ', 'seriously', '.', '"', 'I', 'can', 'tell', 'you', 'very', 'senior', 'ceo', 'of', 'major', 'american', '\n    ', 'car', 'company', 'would', 'shake', 'my', 'hand', 'and', 'turn', 'away', 'because', 'I', 'be', "n't", '\n    ', 'worth', 'talk', 'to', ',', '"', 'say', 'Thrun', ',', 'in', 'an', 'interview', 'with', 'Recode', 'early', '\n    ', 'this', 'week', '.', '\n    ']


## Morphologic

In [46]:
print("Morph: ", [token.morph for token in doc])

Morph:  [, , Number=Sing, Number=Sing, Tense=Past|VerbForm=Fin, Aspect=Prog|Tense=Pres|VerbForm=Part, , Number=Sing, PunctType=Dash, Aspect=Prog|Tense=Pres|VerbForm=Part, Number=Plur, , , Number=Sing, , NumType=Card, PunctType=Comm, Degree=Pos, Number=Plur, , , Definite=Def|PronType=Art, Number=Sing, Tense=Past|VerbForm=Fin, Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs, , , PunctType=Peri, PunctSide=Ini|PunctType=Quot, Case=Nom|Number=Sing|Person=1|PronType=Prs, VerbForm=Fin, VerbForm=Inf, Person=2|PronType=Prs, , Degree=Pos, Number=Plur, , Degree=Pos, Degree=Pos, , Number=Sing, Number=Plur, VerbForm=Fin, VerbForm=Inf, Number=Sing|Person=1|Poss=Yes|PronType=Prs, Number=Sing, ConjType=Cmp, VerbForm=Inf, , , Case=Nom|Number=Sing|Person=1|PronType=Prs, Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin, Polarity=Neg, , Degree=Pos, Aspect=Prog|Tense=Pres|VerbForm=Part, , PunctType=Comm, PunctSide=Fin|PunctType=Quot, Tense=Past|VerbForm=Fin, Number=Sing, PunctType=Comm, , Defin

# 🏷️ Entity

In [50]:
print("Entity: ", [(ent.text, ent.label_) for ent in doc.ents])

Entity:  [('Sebastian Thrun', 'PERSON'), ('2007', 'DATE'), ('American\n    ', 'ORG'), ('Thrun', 'GPE'), ('Recode earlier\n    ', 'ORG'), ('this week', 'DATE')]


# ⚗️ Treating Stop Words

In [60]:
stop_words = set(nlp.Defaults.stop_words)
doc_tokens = set([token.text for token in doc])

tokens_without_sw = doc_tokens - stop_words

print("Tokens with stop words: ", doc_tokens)
print("Tokens without stop words: ", tokens_without_sw)

Tokens with stop words:  {'major', 'talking', 'Sebastian', 'him', 'with', 'company', 'senior', '-', 'to', 'my', 'American', 'turn', 'few', 'an', 'outside', 'earlier', 'this', 'CEOs', 'on', '”', 'seriously', 'working', ',', "n't", 'I', '.', '2007', 'was', 'car', 'said', '\n    ', 'you', 'self', 'can', 'When', 'hand', 'because', 'worth', 'driving', '“', 'Recode', 'and', 'very', 'companies', 'interview', 'week', 'away', 'the', 'shake', 'would', 'tell', 'Google', 'at', 'Thrun', 'cars', 'took', 'started', 'in', 'of', 'people'}
Tokens without stop words:  {'major', 'talking', 'Sebastian', 'company', 'senior', '-', 'American', 'turn', 'earlier', 'outside', 'CEOs', '”', 'seriously', 'working', ',', 'I', '.', '2007', 'car', 'said', '\n    ', 'worth', 'self', 'When', 'hand', 'driving', '“', 'Recode', 'companies', 'interview', 'week', 'away', 'shake', 'tell', 'Google', 'Thrun', 'cars', 'took', 'started', 'people'}


# 📖 Vocab