In [1]:
import spacy
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets. 
"""

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
token_list = [token for token in doc]
token_list
# Tokenization of the text

[,
 Dave,
 watched,
 as,
 the,
 forest,
 burned,
 up,
 on,
 the,
 hill,
 ,,
 ,
 only,
 a,
 few,
 miles,
 from,
 his,
 house,
 .,
 The,
 car,
 had,
 ,
 been,
 hastily,
 packed,
 and,
 Marta,
 was,
 inside,
 trying,
 to,
 round,
 ,
 up,
 the,
 last,
 of,
 the,
 pets,
 .,
 ",
 Where,
 could,
 she,
 be,
 ?,
 ",
 he,
 wondered,
 ,
 as,
 he,
 continued,
 to,
 wait,
 for,
 Marta,
 to,
 appear,
 with,
 the,
 pets,
 .,
 ]

In [2]:
# Removes stop words
filtered_tokens = [token for token in doc if not token.is_stop]
filtered_tokens

[,
 Dave,
 watched,
 forest,
 burned,
 hill,
 ,,
 ,
 miles,
 house,
 .,
 car,
 ,
 hastily,
 packed,
 Marta,
 inside,
 trying,
 round,
 ,
 pets,
 .,
 ",
 ?,
 ",
 wondered,
 ,
 continued,
 wait,
 Marta,
 appear,
 pets,
 .,
 ]

In [3]:
# Normalizing words
lemmas = [
    f"Token: {token}, lemma: {token.lemma}"
    for token in filtered_tokens
]
lemmas

['Token: \n, lemma: 962983613142996970',
 'Token: Dave, lemma: 15237984737769454380',
 'Token: watched, lemma: 2054481287215635300',
 'Token: forest, lemma: 12560106647199032635',
 'Token: burned, lemma: 12905682277821018784',
 'Token: hill, lemma: 1647358963876657122',
 'Token: ,, lemma: 2593208677638477497',
 'Token: \n, lemma: 962983613142996970',
 'Token: miles, lemma: 15996833532744392865',
 'Token: house, lemma: 9471806766518506264',
 'Token: ., lemma: 12646065887601541794',
 'Token: car, lemma: 17545852598994811774',
 'Token: \n, lemma: 962983613142996970',
 'Token: hastily, lemma: 16524687012062183671',
 'Token: packed, lemma: 11929990034961539164',
 'Token: Marta, lemma: 3686051643097225522',
 'Token: inside, lemma: 3410355712981309345',
 'Token: trying, lemma: 4812066089261065646',
 'Token: round, lemma: 10404471077220350636',
 'Token: \n, lemma: 962983613142996970',
 'Token: pets, lemma: 8199115189604440881',
 'Token: ., lemma: 12646065887601541794',
 'Token: ", lemma: 15884

In [4]:
# Vectorizing text
filtered_tokens[1].vector

array([ 1.8371646 ,  1.4529226 , -1.6147211 ,  0.678362  , -0.6594443 ,
        1.6417935 ,  0.5796405 ,  2.3021278 , -0.13260496,  0.5750932 ,
        1.5654886 , -0.6938864 , -0.59607106, -1.5377437 ,  1.9425622 ,
       -2.4552505 ,  1.2321601 ,  1.0434952 , -1.5102385 , -0.5787632 ,
        0.12055647,  3.6501784 ,  2.6160972 , -0.5710199 , -1.5221789 ,
        0.00629176,  0.22760668, -1.922073  , -1.6252862 , -4.226225  ,
       -3.495663  , -3.312053  ,  0.81387717, -0.00677544, -0.11603224,
        1.4620426 ,  3.0751472 ,  0.35958546, -0.22527039, -2.743926  ,
        1.269633  ,  4.606786  ,  0.34034157, -2.1272311 ,  1.2619178 ,
       -4.209798  ,  5.452852  ,  1.6940253 , -2.5972986 ,  0.95049495,
       -1.910578  , -2.374927  , -1.4227567 , -2.2528825 , -1.799806  ,
        1.607501  ,  2.9914255 ,  2.8065152 , -1.2510269 , -0.54964066,
       -0.49980402, -1.3882618 , -0.470479  , -2.9670253 ,  1.7884955 ,
        4.5282774 , -1.2602427 , -0.14885521,  1.0419178 , -0.08