## CREATED BY HCY 20200727
## BACKGROUND 学习 spaCy 模块

In [1]:
# One-time environment setup (left commented out; run manually when needed).
# !pip install -U spacy
# !pip install -U spacy-lookups-data
# !python -m spacy download en_core_web_sm
# NOTE(review): a later cell loads 'en_core_web_lg', which is not downloaded above:
# !python -m spacy download en_core_web_lg

In [2]:
import spacy

In [3]:
# Load the small English pipeline; `nlp` is reused by the cells that follow.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Run the pipeline on a sample sentence. 'starup' is spelled this way in the
# input, so it appears verbatim in the recorded outputs.
doc = nlp('Apple is looking at buying U.K. starup for $1 billion')

In [5]:
# Show the tokenization: one token text per line.
for tok in doc:
    print(tok.text)

Apple
is
looking
at
buying
U.K.
starup
for
$
1
billion


In [6]:
# Same sentence with a contraction; the recorded output shows "isn't" being
# split into the tokens "is" and "n't".
doc = nlp("Apple isn't looking at buying U.K. starup for $1 billion")

In [7]:
# Show the tokenization of the contraction sentence: one token text per line.
for tok in doc:
    print(tok.text)

Apple
is
n't
looking
at
buying
U.K.
starup
for
$
1
billion


## Part-of-Speech (POS) Tagging

In [8]:
doc

Apple isn't looking at buying U.K. starup for $1 billion

In [9]:
# Pair every token's surface text with its lemma.
for tok in doc:
    surface, lemma = tok.text, tok.lemma_
    print(surface, lemma)

Apple Apple
is be
n't not
looking look
at at
buying buy
U.K. U.K.
starup starup
for for
$ $
1 1
billion billion


In [10]:
# Fixed-width table: text (15 columns), lemma (10), coarse POS tag (10).
row_template = '{:15} {:10} {:10}'
for tok in doc:
    print(row_template.format(tok.text, tok.lemma_, tok.pos_))

Apple           Apple      PROPN     
is              be         AUX       
n't             not        PART      
looking         look       VERB      
at              at         ADP       
buying          buy        VERB      
U.K.            U.K.       PROPN     
starup          starup     NOUN      
for             for        ADP       
$               $          SYM       
1               1          NUM       
billion         billion    NUM       


In [11]:
# Fixed-width dump of several token attributes, one row per token.
# The two boolean flags are formatted with a numeric width, so they render as
# right-aligned 1/0 (as seen in the recorded output).
for tok in doc:
    fields = (
        format(tok.text, '15'),
        format(tok.lemma_, '10'),
        format(tok.pos_, '10'),
        format(tok.tag_, '15'),
        format(tok.dep_, '10'),
        format(tok.shape_, '10'),
        format(tok.is_alpha, '10'),
        format(tok.is_stop, '10'),
    )
    print(' '.join(fields))

Apple           Apple      PROPN      NNP             nsubj      Xxxxx               1          0
is              be         AUX        VBZ             aux        xx                  1          1
n't             not        PART       RB              neg        x'x                 0          1
looking         look       VERB       VBG             ROOT       xxxx                1          0
at              at         ADP        IN              prep       xx                  1          1
buying          buy        VERB       VBG             pcomp      xxxx                1          0
U.K.            U.K.       PROPN      NNP             compound   X.X.                0          0
starup          starup     NOUN       NN              dobj       xxxx                1          0
for             for        ADP        IN              prep       xxx                 1          1
$               $          SYM        $               quantmod   $                   0          0
1               1   

In [12]:
# Render `doc` with displaCy using the dependency ("dep") style.
from spacy import displacy
displacy.render(doc, style="dep")

In [13]:
# Token-to-token similarity demo.
import spacy
nlp = spacy.load('en_core_web_lg')  # make sure to use the larger model here
tokens = nlp("dog cat banana")

# Print the similarity score for every ordered pair of tokens, self-pairs included.
for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(left.text, right.text, score)

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327643
cat dog 0.80168545
cat cat 1.0
cat banana 0.28154364
banana dog 0.24327643
banana cat 0.28154364
banana banana 1.0


In [43]:
# Dump a handful of attributes per token. The separator reproduces the
# space-tab-space spacing between columns.
for tok in tokens:
    fields = (tok.orth, tok.i, tok.ent_type, tok.lemma_, tok.norm_, tok.pos_, tok.tag_)
    print(*fields, sep=" \t ")

7562983679033046312 	 0 	 0 	 dog 	 dog 	 NOUN 	 NN
5439657043933447811 	 1 	 0 	 cat 	 cat 	 NOUN 	 NN
2525716904149915114 	 2 	 0 	 banana 	 banana 	 PROPN 	 NNP


## Doc 类

In [44]:
# Switch back to the small pipeline and parse a short sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp("I like apples and oranges")

# Slicing a Doc is half-open: the start index is included, the end index is not.
span = doc[:3]
span

I like apples

#### 获得文本的命名实体
##### doc.ents
#### 获得文本的名词块
##### doc.noun_chunks
#### 获得文本的句子
##### doc.sents
#### 查看doc的文本
##### doc.text

In [46]:
# Print four views of the Doc in turn. As the recorded output shows, doc.ents is
# an (empty) tuple while noun_chunks and sents are generators, so printing them
# shows the generator object rather than its items.
for view in (doc.ents, doc.noun_chunks, doc.sents, doc.text):
    print(view)

()
<generator object at 0x000002384EBDADC8>
<generator object at 0x000002384F090168>
I like apples and oranges


# Span 类

### Span对象是Doc对象的一个切片，Span对象的属性：

#### start：span的第一个token在doc中的索引
#### end：span结束位置在doc中的索引（不包含该位置，即最后一个token的索引加1）
#### text：span的文本
#### orth、orth_：span的文本
#### lemma_：span的lemma

In [47]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("I love dogs and cats")

# Each row pairs a token's text with the type of the slice doc[0:3].
for tok in doc:
    slice_type = type(doc[0:3])
    print(tok.text, slice_type)

I <class 'spacy.tokens.span.Span'>
love <class 'spacy.tokens.span.Span'>
dogs <class 'spacy.tokens.span.Span'>
and <class 'spacy.tokens.span.Span'>
cats <class 'spacy.tokens.span.Span'>


### Matcher 类似于正则表达式，返回text或者sentence中指定格式的内容 

In [58]:
nlp = spacy.load("en_core_web_sm")

In [81]:
doc = nlp("Hello, World!")

In [82]:
doc

Hello, World!

In [83]:
# List the tokens of the current doc, one per line.
for tok in doc:
    print(tok)

Hello
,
World
!


In [84]:
# 'OP' is the per-token quantifier of a Matcher pattern:
#   '!'  negate the pattern by requiring it to match exactly 0 times
#   '?'  make the pattern optional: match 0 or 1 times
#   '+'  require the pattern to match 1 or more times
#   '*'  allow the pattern to match 0 or more times
optional_hello = {"LOWER": "hello", 'OP': '?'}
optional_punct = {"IS_PUNCT": True, 'OP': '?'}
required_world = {"LOWER": "world"}
pattern = [optional_hello, optional_punct, required_world]

In [85]:
from spacy.matcher import Matcher

# Register the token pattern under the key 'HelloWorld'.
# Patterns are passed as a list (here a single pattern). This call form is the
# one spaCy v3 requires and, per the spaCy docs, is also accepted from v2.2.2 on;
# the older positional form add(key, None, pattern) is rejected by v3.
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [86]:
# Run the matcher over the doc; each result is a (match_id, start, end) tuple.
matches = matcher(doc)
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [87]:
# Resolve each match hash back to its string key and show the matched text.
for match_id, start, end in matches:
    rule_name = nlp.vocab.strings[match_id]
    matched = doc[start:end]
    print(match_id, rule_name, start, end, matched.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


## 正则表达式

In [113]:
# Sample text containing a literal backslash followed by 's'. In a normal string
# literal "\s" is an invalid escape sequence (it triggers a warning on current
# Python versions); the raw prefix produces the same value without relying on it.
text = r"1234my phone number is 1256. \s Ohh its wrong! Correct one is 1234567890. call me!"

In [89]:
import re

In [90]:
# Two consecutive ASCII letters.
# NOTE(review): this regex is not used by any later visible cell, and `pattern`
# is reassigned to a Matcher pattern further down.
pattern = re.compile(r'[a-zA-Z]{2}')

In [102]:
# Exactly four digits anchored at the start of the string.
pattern1 = re.compile(r'^\d{4}')

In [104]:
# All matches of the leading-four-digits pattern in `text`.
pattern1.findall(text)

['1234']

In [123]:
# Matches a literal backslash followed by 's'. With the raw prefix the pattern is
# written r'\\s'; the non-raw spelling '\\\s' yields the same pattern text.
pattern2 = re.compile(r'\\s')

In [124]:
# All occurrences of the literal backslash-s sequence in `text`.
pattern2.findall(text)

['\\s']

In [126]:
text = "you can get free-videos on kgp-talkie"

# Pull out the hyphen-joined word pairs.
hyphenated = re.compile(r'[\w]+-[\w]+')
hyphenated.findall(text)

['free-videos', 'kgp-talkie']

## spaCy 正则表达式

In [127]:
# Sample text that mentions "Google I/O" twice.
text = (
    "Google announced a new Pixel at Google I/O. "
    "Google I/O is a great place to get all updates from Google"
)

In [128]:
text

'Google announced a new Pixel at Google I/O. Google I/O is a great place to get all updates from Google'

In [154]:
# Token-by-token pattern: 'Google', 'I', '/', then one or more 'O' tokens.
pattern = [{'TEXT': literal} for literal in ('Google', 'I', '/')]
pattern.append({'TEXT': 'O', 'OP': '+'})

In [155]:
def callback_method(matcher, doc, i, matches):
    """Matcher on-match callback: print the text of the i-th match.

    Args:
        matcher: The matcher that triggered the callback (unused here).
        doc: The document being matched; sliced with the match offsets.
        i: Index of the current match within ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first, stop = matches[i]
    print(doc[first:stop].text)

In [156]:
# Fresh matcher for the 'Google' rule; callback_method is invoked for each match.
# Patterns are passed as a list and the callback as on_match=. This call form is
# the one spaCy v3 requires and, per the spaCy docs, is also accepted from
# v2.2.2 on; the older add(key, callback, pattern) form is rejected by v3.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [157]:
doc = nlp(text)

In [158]:
# Run the matcher: the callback prints each matched span, and the call itself
# returns the list of (match_id, start, end) tuples shown as the cell result.
matcher(doc)

Google I/O


[(11578853341595296054, 10, 14)]