# 📚 Import Libraries

In [1]:
import spacy

# 🌱 NLP spaCy Instance

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
type(nlp)

spacy.lang.en.English

In [4]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


# ⚙️ Processing a Text

In [10]:
text = (
    """
    When Sebastian Thrun started working on self-driving cars at
    Google in 2007, few people outside of the company took him
    seriously. “I can tell you very senior CEOs of major American
    car companies would shake my hand and turn away because I wasn't
    worth talking to,” said Thrun, in an interview with Recode earlier
    this week.
    """
)

In [11]:
doc = nlp(text)

In [14]:
type(doc)

spacy.tokens.doc.Doc

In [30]:
len(doc)

73

In [13]:
len(doc.vocab)

817

In [31]:
doc[2:5]

Sebastian Thrun started

# 🎫 Tokens

## Text

In [32]:
print("Text:", [token.text for token in doc])

Text: ['\n    ', 'When', 'Sebastian', 'Thrun', 'started', 'working', 'on', 'self', '-', 'driving', 'cars', 'at', '\n    ', 'Google', 'in', '2007', ',', 'few', 'people', 'outside', 'of', 'the', 'company', 'took', 'him', '\n    ', 'seriously', '.', '“', 'I', 'can', 'tell', 'you', 'very', 'senior', 'CEOs', 'of', 'major', 'American', '\n    ', 'car', 'companies', 'would', 'shake', 'my', 'hand', 'and', 'turn', 'away', 'because', 'I', 'was', "n't", '\n    ', 'worth', 'talking', 'to', ',', '”', 'said', 'Thrun', ',', 'in', 'an', 'interview', 'with', 'Recode', 'earlier', '\n    ', 'this', 'week', '.', '\n    ']


## Stop Words


In [52]:
print("Stop Words:", [token.text for token in doc if token.is_stop])

Stop Words: ['When', 'on', 'at', 'in', 'few', 'of', 'the', 'him', 'I', 'can', 'you', 'very', 'of', 'would', 'my', 'and', 'because', 'I', 'was', "n't", 'to', 'in', 'an', 'with', 'this']


In [51]:
nlp.Defaults.stop_words.add("btw")
nlp.vocab["btw"].is_stop = True

In [56]:
nlp.vocab["btw"].is_stop

True

## Alphanumeric

In [34]:
print("Alpha Numeric: ", [token.text for token in doc if token.is_alpha])

Alpha Numeric:  ['When', 'Sebastian', 'Thrun', 'started', 'working', 'on', 'self', 'driving', 'cars', 'at', 'Google', 'in', 'few', 'people', 'outside', 'of', 'the', 'company', 'took', 'him', 'seriously', 'I', 'can', 'tell', 'you', 'very', 'senior', 'CEOs', 'of', 'major', 'American', 'car', 'companies', 'would', 'shake', 'my', 'hand', 'and', 'turn', 'away', 'because', 'I', 'was', 'worth', 'talking', 'to', 'said', 'Thrun', 'in', 'an', 'interview', 'with', 'Recode', 'earlier', 'this', 'week']


## Capitalization

In [35]:
print("Capitalized: ", [token.text for token in doc if token.is_title])

Capitalized:  ['When', 'Sebastian', 'Thrun', 'Google', 'I', 'American', 'I', 'Thrun', 'Recode']


## Ponctuation

In [37]:
print("Pontuation: ", [token.text for token in doc if token.is_punct])

Pontuation:  ['-', ',', '.', '“', ',', '”', ',', '.']


## Numerical

In [38]:
print("Numerical: ", [token.text for token in doc if token.like_num])

Numerical:  ['2007']


## Sentence

In [39]:
print("Sentences: ", [sent.text for sent in doc.sents])

Sentences:  ['\n    ', 'When Sebastian Thrun started working on self-driving cars at\n    Google in 2007, few people outside of the company took him\n    seriously.', "“I can tell you very senior CEOs of major American\n    car companies would shake my hand and turn away because I wasn't\n    worth talking to,” said Thrun, in an interview with Recode earlier\n    this week.\n    "]


## Shape

In [41]:
print("Shape: ", [token.shape_ for token in doc])

Shape:  ['\n    ', 'Xxxx', 'Xxxxx', 'Xxxxx', 'xxxx', 'xxxx', 'xx', 'xxxx', '-', 'xxxx', 'xxxx', 'xx', '\n    ', 'Xxxxx', 'xx', 'dddd', ',', 'xxx', 'xxxx', 'xxxx', 'xx', 'xxx', 'xxxx', 'xxxx', 'xxx', '\n    ', 'xxxx', '.', '“', 'X', 'xxx', 'xxxx', 'xxx', 'xxxx', 'xxxx', 'XXXx', 'xx', 'xxxx', 'Xxxxx', '\n    ', 'xxx', 'xxxx', 'xxxx', 'xxxx', 'xx', 'xxxx', 'xxx', 'xxxx', 'xxxx', 'xxxx', 'X', 'xxx', "x'x", '\n    ', 'xxxx', 'xxxx', 'xx', ',', '”', 'xxxx', 'Xxxxx', ',', 'xx', 'xx', 'xxxx', 'xxxx', 'Xxxxx', 'xxxx', '\n    ', 'xxxx', 'xxxx', '.', '\n    ']


## Part of Speech (POS)

In [42]:
print("POS: ", [token.pos_ for token in doc])

POS:  ['SPACE', 'SCONJ', 'PROPN', 'PROPN', 'VERB', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'VERB', 'NOUN', 'ADP', 'SPACE', 'PROPN', 'ADP', 'NUM', 'PUNCT', 'ADJ', 'NOUN', 'ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'PRON', 'SPACE', 'ADV', 'PUNCT', 'PUNCT', 'PRON', 'AUX', 'VERB', 'PRON', 'ADV', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'ADJ', 'SPACE', 'NOUN', 'NOUN', 'AUX', 'VERB', 'PRON', 'NOUN', 'CCONJ', 'VERB', 'ADV', 'SCONJ', 'PRON', 'AUX', 'PART', 'SPACE', 'ADJ', 'VERB', 'ADP', 'PUNCT', 'PUNCT', 'VERB', 'PROPN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'ADV', 'SPACE', 'DET', 'NOUN', 'PUNCT', 'SPACE']


## Dependency Parsing

In [43]:
print("Dependency: ", [token.dep_ for token in doc])

Dependency:  ['dep', 'advmod', 'compound', 'nsubj', 'advcl', 'xcomp', 'prep', 'npadvmod', 'punct', 'amod', 'pobj', 'prep', 'dep', 'pobj', 'prep', 'pobj', 'punct', 'amod', 'nsubj', 'advmod', 'prep', 'det', 'pobj', 'ROOT', 'dobj', 'dep', 'advmod', 'punct', 'punct', 'nsubj', 'aux', 'ccomp', 'dative', 'advmod', 'amod', 'nsubj', 'prep', 'amod', 'amod', 'dep', 'compound', 'pobj', 'aux', 'ccomp', 'poss', 'dobj', 'cc', 'conj', 'advmod', 'mark', 'nsubj', 'advcl', 'neg', 'dep', 'acomp', 'xcomp', 'prep', 'punct', 'punct', 'ROOT', 'nsubj', 'punct', 'prep', 'det', 'pobj', 'prep', 'pobj', 'advmod', 'dep', 'det', 'npadvmod', 'punct', 'dep']


## Lemmatization

In [45]:
print("Lemma: ", [token.lemma_ for token in doc])

Lemma:  ['\n    ', 'when', 'Sebastian', 'Thrun', 'start', 'work', 'on', 'self', '-', 'drive', 'car', 'at', '\n    ', 'Google', 'in', '2007', ',', 'few', 'people', 'outside', 'of', 'the', 'company', 'take', 'he', '\n    ', 'seriously', '.', '"', 'I', 'can', 'tell', 'you', 'very', 'senior', 'ceo', 'of', 'major', 'american', '\n    ', 'car', 'company', 'would', 'shake', 'my', 'hand', 'and', 'turn', 'away', 'because', 'I', 'be', "n't", '\n    ', 'worth', 'talk', 'to', ',', '"', 'say', 'Thrun', ',', 'in', 'an', 'interview', 'with', 'Recode', 'early', '\n    ', 'this', 'week', '.', '\n    ']


## Morphologic

In [46]:
print("Morph: ", [token.morph for token in doc])

Morph:  [, , Number=Sing, Number=Sing, Tense=Past|VerbForm=Fin, Aspect=Prog|Tense=Pres|VerbForm=Part, , Number=Sing, PunctType=Dash, Aspect=Prog|Tense=Pres|VerbForm=Part, Number=Plur, , , Number=Sing, , NumType=Card, PunctType=Comm, Degree=Pos, Number=Plur, , , Definite=Def|PronType=Art, Number=Sing, Tense=Past|VerbForm=Fin, Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs, , , PunctType=Peri, PunctSide=Ini|PunctType=Quot, Case=Nom|Number=Sing|Person=1|PronType=Prs, VerbForm=Fin, VerbForm=Inf, Person=2|PronType=Prs, , Degree=Pos, Number=Plur, , Degree=Pos, Degree=Pos, , Number=Sing, Number=Plur, VerbForm=Fin, VerbForm=Inf, Number=Sing|Person=1|Poss=Yes|PronType=Prs, Number=Sing, ConjType=Cmp, VerbForm=Inf, , , Case=Nom|Number=Sing|Person=1|PronType=Prs, Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin, Polarity=Neg, , Degree=Pos, Aspect=Prog|Tense=Pres|VerbForm=Part, , PunctType=Comm, PunctSide=Fin|PunctType=Quot, Tense=Past|VerbForm=Fin, Number=Sing, PunctType=Comm, , Defin

## Hash

In [64]:
print("Hash: ", [token.orth for token in doc])

Hash:  [16653187200892370574, 10109588199364727116, 12362118327459407872, 16657122835442862368, 17976686883172633439, 16009546102277334650, 5640369432778651323, 10899396943792836290, 9153284864653046197, 10545042384170783628, 13017550054122039847, 11667289587015813222, 16653187200892370574, 11578853341595296054, 3002984154512732771, 4146535246006245184, 2593208677638477497, 11866476999679706272, 7593739049417968140, 12341974070768608367, 886050111519832510, 7425985699627899538, 6905553075311563409, 1524062747651706878, 1739263527992748485, 16653187200892370574, 6593976590948586841, 12646065887601541794, 16073121960489476521, 4690420944186131903, 6635067063807956629, 63172552626595070, 7624161793554793053, 9548244504980166557, 17934676104927248284, 7691720269263716612, 886050111519832510, 8334407161178086546, 8947108287989031391, 16653187200892370574, 17545852598994811774, 8026612326651866097, 6992604926141104606, 9100562122183665148, 227504873216781231, 10690717480206833971, 2283656566

# 🏷️ Entity

In [50]:
print("Entity: ", [(ent.text, ent.label_) for ent in doc.ents])

Entity:  [('Sebastian Thrun', 'PERSON'), ('2007', 'DATE'), ('American\n    ', 'ORG'), ('Thrun', 'GPE'), ('Recode earlier\n    ', 'ORG'), ('this week', 'DATE')]


# ⚗️ Treating Stop Words

In [60]:
stop_words = set(nlp.Defaults.stop_words)
doc_tokens = set([token.text for token in doc])

tokens_without_sw = doc_tokens - stop_words

print("Tokens with stop words: ", doc_tokens)
print("Tokens without stop words: ", tokens_without_sw)

Tokens with stop words:  {'major', 'talking', 'Sebastian', 'him', 'with', 'company', 'senior', '-', 'to', 'my', 'American', 'turn', 'few', 'an', 'outside', 'earlier', 'this', 'CEOs', 'on', '”', 'seriously', 'working', ',', "n't", 'I', '.', '2007', 'was', 'car', 'said', '\n    ', 'you', 'self', 'can', 'When', 'hand', 'because', 'worth', 'driving', '“', 'Recode', 'and', 'very', 'companies', 'interview', 'week', 'away', 'the', 'shake', 'would', 'tell', 'Google', 'at', 'Thrun', 'cars', 'took', 'started', 'in', 'of', 'people'}
Tokens without stop words:  {'major', 'talking', 'Sebastian', 'company', 'senior', '-', 'American', 'turn', 'earlier', 'outside', 'CEOs', '”', 'seriously', 'working', ',', 'I', '.', '2007', 'car', 'said', '\n    ', 'worth', 'self', 'When', 'hand', 'driving', '“', 'Recode', 'companies', 'interview', 'week', 'away', 'shake', 'tell', 'Google', 'Thrun', 'cars', 'took', 'started', 'people'}


# 📖 Vocab

In [65]:
nlp.vocab.strings["major"]

8334407161178086546

In [66]:
nlp.vocab.strings[8334407161178086546]

'major'

In [69]:
print(nlp.vocab["major"].text)
print(nlp.vocab["major"].orth)
print(nlp.vocab["major"].is_alpha)
print(nlp.vocab["major"].is_lower)

major
8334407161178086546
True
True


# 🔗 Word Embedding

## Word Representation in Embedding Space

In [75]:
nlp("major").vector.shape

(300,)

In [76]:
nlp("major").vector

array([  2.1326  ,   0.8682  ,   3.2518  ,  -0.15306 ,   5.3951  ,
         1.268   ,   0.62323 ,   6.0236  ,   1.9574  ,  -6.191   ,
         4.1726  ,   2.7965  ,  -5.2525  ,   0.53829 ,   1.865   ,
         7.8972  ,   3.1153  ,  -0.70744 ,   2.3526  ,   1.1263  ,
         0.30384 ,  -0.51894 ,  -0.28456 ,   1.1389  ,   0.54148 ,
        -0.66436 ,  -4.2116  ,  -3.2915  ,   4.7275  ,   0.70571 ,
         1.2935  ,   1.4519  ,  -0.74874 ,  -1.5491  ,  -5.0295  ,
        -4.0992  ,  -0.53117 ,   2.06    ,  -3.6052  ,   0.503   ,
         3.234   ,  -0.34483 ,   1.4491  ,  -0.80252 ,   0.89753 ,
         1.8189  ,  -4.0104  ,  -0.15509 ,  -0.39501 ,   2.9293  ,
        -5.7403  ,   2.0035  ,  -2.4766  ,  -1.6676  ,  -0.99693 ,
         0.034003,   1.4266  ,   1.1297  ,   3.5961  ,   1.4989  ,
         1.6947  ,  -0.49486 ,  -1.5931  ,  -0.97236 ,   3.2836  ,
         3.6151  ,  -0.51977 ,   0.74662 ,   2.6169  ,   2.2358  ,
        -0.0439  ,  -1.5985  ,  -6.3017  ,  -2.6076  ,   0.894

## Average Word Respresentation in Embedding Space

In [77]:
nlp("When Sebastian Thrun started working on self-driving cars").vector.shape

(300,)

In [78]:
nlp("When Sebastian Thrun started working on self-driving cars").vector

array([-1.9587681 ,  0.22768912, -3.7847328 ,  1.8580348 ,  3.305572  ,
       -0.91748303,  1.4065901 ,  3.6848862 , -0.75567305,  0.6324701 ,
        1.2118471 ,  3.6899238 , -3.6640441 ,  1.8474171 ,  0.2004298 ,
        1.1258616 ,  3.171399  ,  0.01279705,  0.74492204,  0.8504612 ,
        1.1084371 ,  1.3266821 ,  0.96041584, -1.0251776 ,  0.7215211 ,
       -0.73542607, -3.4276137 , -2.072732  ,  0.6211239 ,  0.77024204,
       -2.613421  , -0.18507203,  0.7943541 , -2.3119862 ,  0.6918465 ,
       -1.874637  , -0.194189  ,  0.893058  , -0.14742604, -1.57138   ,
       -0.11396106, -1.0899041 ,  0.99155253,  2.362828  , -0.85148495,
       -2.2445302 , -0.9464598 , -2.324143  ,  1.4878075 ,  1.9275539 ,
        0.893494  ,  1.35953   , -2.156079  , -4.131925  , -1.293541  ,
        1.1010559 ,  0.39936298, -0.15898699,  0.8579501 ,  0.55813897,
        1.0692562 ,  0.31750196, -2.045574  ,  0.507221  , -0.66598   ,
        0.07068102, -2.3957229 , -2.33611   , -0.8478979 ,  1.54

# 🧮 Similarity

In [84]:
doc1 = nlp("When Sebastian Thrun started working on self-driving cars")
doc2 = nlp("When Sebastian started working on self-driving cars")
doc3 = nlp("When Sebastian started working on self-driving cars")
doc4 = nlp("When you drive a car, you must be careful.")

In [95]:
for idx_i, doc_i in enumerate([doc1, doc2, doc3, doc4]):
    print(f"doc{ idx_i + 1 } similarity with:")

    for idx_j, doc_j in enumerate([doc1, doc2, doc3, doc4]):
        print(f"- doc{ idx_j + 1 } = { doc_i.similarity(doc_j) }") if idx_i != idx_j else None

    print("\n")

doc1 similarity with:
- doc2 = 0.5630191054503384
- doc3 = 0.9992232075383632
- doc4 = 0.5630191054503384


doc2 similarity with:
- doc1 = 0.5630191054503384
- doc3 = 0.5699144521054758
- doc4 = 1.0


doc3 similarity with:
- doc1 = 0.9992232075383632
- doc2 = 0.5699144521054758
- doc4 = 0.5699144521054758


doc4 similarity with:
- doc1 = 0.5630191054503384
- doc2 = 1.0
- doc3 = 0.5699144521054758


