In [5]:
import spacy
import pandas as pd

In [8]:
# Load the small English pipeline and parse a single example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One list per token attribute; each list follows the token order of `doc`.
text = [token.text for token in doc]
lemma_ = [token.lemma_ for token in doc]
pos_ = [token.pos_ for token in doc]
tag_ = [token.tag_ for token in doc]
dep_ = [token.dep_ for token in doc]
shape_ = [token.shape_ for token in doc]
is_alpha = [token.is_alpha for token in doc]
is_stop = [token.is_stop for token in doc]

# Assemble the attribute lists into a table with one row per token.
df = pd.DataFrame(dict(
    Text=text,
    Lemma=lemma_,
    Pos=pos_,
    Tag=tag_,
    Dep=dep_,
    Shape=shape_,
    Alpha=is_alpha,
    Stop=is_stop,
))

# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0,Text,Lemma,Pos,Tag,Dep,Shape,Alpha,Stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,compound,X.X.,False,False
6,startup,startup,NOUN,NN,dobj,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


Text: The original word text.

Lemma: The base form of the word.

POS: The simple part-of-speech tag.

Tag: The detailed part-of-speech tag.

Dep: Syntactic dependency, i.e. the relation between tokens.

Shape: The word shape – capitalization, punctuation, digits.

is alpha: Does the token consist only of alphabetic characters?

is stop: Is the token part of a stop list, i.e. the most common words of the language?

## Named Entities

In [9]:
# Print each named entity found in `doc`: its text, its start and end
# character offsets, and its entity label.
for ent in doc.ents:
    first_char, last_char = ent.start_char, ent.end_char
    print(ent.text, first_char, last_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


## Word Vectors and Similarity

In [10]:
# Parse three common words plus one made-up string, then report the
# vector-related attributes of each token.
tokens = nlp("dog cat banana afskfsd")

for token in tokens:
    vector_info = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*vector_info)

dog True 19.266302 True
cat True 19.220264 True
banana True 17.748499 True
afskfsd True 20.882004 True


Text: The original token text.
    
has vector: Does the token have a vector representation?

Vector norm: The L2 norm of the token’s vector (the square root of the sum of the values squared).
    
OOV: Is the token out-of-vocabulary?

In [11]:
# Print the similarity score for every ordered pair of tokens, including
# each token paired with itself.
# NOTE(review): the pipeline loaded above is the small "en_core_web_sm"
# model; spaCy's docs describe the small models as shipping without static
# word vectors, so these scores may be less meaningful than with a md/lg
# model - confirm before relying on them.
tokens = nlp("dog cat banana")

for token1 in tokens:
    for token2 in tokens:
        score = token1.similarity(token2)
        print(token1.text, token2.text, score)

dog dog 1.0
dog cat 0.47130835
dog banana 0.32758623
cat dog 0.47130835
cat cat 1.0
cat banana 0.35478392
banana dog 0.32758623
banana cat 0.35478392
banana banana 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
