[Reference](https://medium.com/nlplanet/two-minutes-nlp-spacy-cheat-sheet-21471dac7837)

In [3]:
!python -m spacy download en_core_web_sm

2023-11-21 01:37:03.252989: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-21 01:37:03.253045: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-21 01:37:03.253069: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 39.7 MB/s eta 0:00:00
✔ Download and installation successful
You can now l

# Tokenization

In [2]:
import spacy

# Load the small English pipeline and split a sample sentence into tokens.
nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat is on the table")

# One token text per output line.
print(*(token.text for token in doc), sep="\n")

The
cat
is
on
the
table


# POS Tagging

In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat is on the table")

# For each token, show its text followed by the `pos_` and `tag_` attributes.
pos_lines = [f"{token.text} --- POS: {token.pos_}, {token.tag_}" for token in doc]
print("\n".join(pos_lines))

The --- POS: DET, DT
cat --- POS: NOUN, NN
is --- POS: AUX, VBZ
on --- POS: ADP, IN
the --- POS: DET, DT
table --- POS: NOUN, NN


# Dependency Parsing

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat is on the table")

# Print the dependency label (`dep_`) assigned to every token, one per line.
print(*(f"{token.text} --- dependency label: {token.dep_}" for token in doc), sep="\n")

The --- dependency label: det
cat --- dependency label: nsubj
is --- dependency label: ROOT
on --- dependency label: prep
the --- dependency label: det
table --- dependency label: pobj


# Stopwords

In [6]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat is on the table")

# Walk the document by position and report each token's `is_stop` flag.
for index in range(len(doc)):
    token = doc[index]
    print(f"{token.text} --- is stopword: {token.is_stop}")

The --- is stopword: True
cat --- is stopword: False
is --- is stopword: True
on --- is stopword: True
the --- is stopword: True
table --- is stopword: False


# Lemmatization

In [7]:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat is on the table")

# Pair every token's surface form with its lemma (`lemma_`), one line per token.
lemma_lines = (f"{token.text} --- lemma: {token.lemma_}" for token in doc)
for line in lemma_lines:
    print(line)

The --- lemma: the
cat --- lemma: cat
is --- lemma: be
on --- lemma: on
the --- lemma: the
table --- lemma: table


# Named Entity Recognition (NER)

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Elon Musk cofounded the electronic-payment firm PayPal and formed SpaceX.")

# For every recognized entity, print its text, start/end character offsets
# within the input string, and its entity label.
for ent in doc.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

Elon Musk 0 9 PERSON
PayPal 48 54 ORG


# Word embeddings

In [9]:
!python -m spacy download en_core_web_md

2023-11-21 01:42:52.549181: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-21 01:42:52.549252: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-21 01:42:52.549281: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully instal

In [10]:
import spacy

nlp = spacy.load("en_core_web_md")
tokens = nlp("The cat is on the aofafgag")

# For each token, report whether it has a vector (`has_vector`) and whether it
# is out-of-vocabulary (`is_oov`); the last word is a made-up string.
for token in tokens:
    print(token.text, token.has_vector, token.is_oov)

# Collect every token's vector, in document order, for use in later cells.
vectors = [token.vector for token in tokens]

The True False
cat True False
is True False
on True False
the True False
aofafgag False True


In [11]:
# Display the vector collected for the first token of the sentence ("The").
first_vector = vectors[0]
print(first_vector)

[-7.2681e+00 -8.5717e-01  5.8105e+00  1.9771e+00  8.8147e+00 -5.8579e+00
  3.7143e+00  3.5850e+00  4.7987e+00 -4.4251e+00  1.7461e+00 -3.7296e+00
 -5.1407e+00 -1.0792e+00 -2.5555e+00  3.0755e+00  5.0141e+00  5.8525e+00
  7.3378e+00 -2.7689e+00 -5.1641e+00 -1.9879e+00  2.9782e+00  2.1024e+00
  4.4306e+00  8.4355e-01 -6.8742e+00 -4.2949e+00 -1.7294e-01  3.6074e+00
  8.4379e-01  3.3419e-01 -4.8147e+00  3.5683e-02 -1.3721e+01 -4.6528e+00
 -1.4021e+00  4.8342e-01  1.2549e+00 -4.0644e+00  3.3278e+00 -2.1590e-01
 -5.1786e+00  3.5360e+00 -3.1575e+00 -3.5273e+00 -3.6753e+00  1.5863e+00
 -8.1594e+00 -3.4657e+00  1.5262e+00  4.8135e+00 -3.8428e+00 -3.9082e+00
  6.7549e-01 -3.5787e-01 -1.7806e+00  3.5284e+00 -5.1114e-02 -9.7150e-01
 -9.0553e-01 -1.5570e+00  1.2038e+00  4.7708e+00  9.8561e-01 -2.3186e+00
 -7.4899e+00 -9.5389e+00  8.5572e+00  2.7420e+00 -3.6270e+00  2.7456e+00
 -6.9574e+00 -1.7190e+00 -2.9145e+00  1.1838e+00  3.7864e+00  2.0413e+00
 -3.5808e+00  1.4319e+00  2.0528e-01 -7.0640e-01 -5

# Sentence similarity

In [12]:
import spacy

# The medium ("md") package is loaded here on purpose: a larger package than
# the small one should be used for similarity.
nlp = spacy.load("en_core_web_md")

# Process the three sentences in order with the same pipeline.
doc1, doc2, doc3 = (
    nlp(text)
    for text in (
        "I like salty fries and hamburgers.",
        "Fast food tastes very good.",
        "Where is the cat.",
    )
)

# Similarity score between the first and second sentence.
similarity_1_2 = doc1.similarity(doc2)
print(similarity_1_2)

0.691649353055761


In [13]:
# Similarity score between the first and third sentence
# (doc1 and doc3 come from the previous cell).
similarity_1_3 = doc1.similarity(doc3)
print(similarity_1_3)

0.48942441701454426
