## CREATED BY HCY 20200727
## BACKGROUND 学习Spacy模块

In [1]:
# One-time environment setup: uncomment and run once per environment.
# !pip install -U spacy
# !pip install -U spacy-lookups-data
# !python -m spacy download en_core_web_sm
# Later cells also load 'en_core_web_lg', which needs its own download:
# !python -m spacy download en_core_web_lg

In [2]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp('Apple is looking at buying U.K. starup for $1 billion')

In [5]:
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
starup
for
$
1
billion


In [6]:
doc = nlp("Apple isn't looking at buying U.K. starup for $1 billion")

In [7]:
for token in doc:
    print(token.text)

Apple
is
n't
looking
at
buying
U.K.
starup
for
$
1
billion


## Part-of-Speech (POS) Tagging

In [8]:
doc

Apple isn't looking at buying U.K. starup for $1 billion

In [9]:
for token in doc:
    print(token.text, token.lemma_)

Apple Apple
is be
n't not
looking look
at at
buying buy
U.K. U.K.
starup starup
for for
$ $
1 1
billion billion


In [10]:
for token in doc:
    print(f'{token.text:{15}} {token.lemma_:{10}} {token.pos_:{10}}')

Apple           Apple      PROPN     
is              be         AUX       
n't             not        PART      
looking         look       VERB      
at              at         ADP       
buying          buy        VERB      
U.K.            U.K.       PROPN     
starup          starup     NOUN      
for             for        ADP       
$               $          SYM       
1               1          NUM       
billion         billion    NUM       


In [11]:
# Print one aligned row per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, word shape, and the is_alpha / is_stop flags.
# The nested {N} in each format spec is the column width. Because a width is
# given, the boolean flags are rendered as 1/0 rather than True/False.
for token in doc:
    print(f'{token.text:{15}} {token.lemma_:{10}} {token.pos_:{10}} {token.tag_:{15}} {token.dep_:{10}} {token.shape_:{10}} {token.is_alpha:{10}} {token.is_stop:{10}}')

Apple           Apple      PROPN      NNP             nsubj      Xxxxx               1          0
is              be         AUX        VBZ             aux        xx                  1          1
n't             not        PART       RB              neg        x'x                 0          1
looking         look       VERB       VBG             ROOT       xxxx                1          0
at              at         ADP        IN              prep       xx                  1          1
buying          buy        VERB       VBG             pcomp      xxxx                1          0
U.K.            U.K.       PROPN      NNP             compound   X.X.                0          0
starup          starup     NOUN       NN              dobj       xxxx                1          0
for             for        ADP        IN              prep       xxx                 1          1
$               $          SYM        $               quantmod   $                   0          0
1               1   

In [12]:
from spacy import displacy
displacy.render(doc, style="dep")

In [13]:
import spacy

# The similarity demo switches to the large English model
# (the author's note: make sure to use the larger model here).
nlp = spacy.load('en_core_web_lg')
tokens = nlp("dog cat banana")

# Compare every ordered pair of tokens, including each token with itself.
for left in tokens:
    for right in tokens:
        print(left.text, right.text, left.similarity(right))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327643
cat dog 0.80168545
cat cat 1.0
cat banana 0.28154364
banana dog 0.24327643
banana cat 0.28154364
banana banana 1.0


In [14]:
# For each token show: hash of the text, index in the doc, entity-type id,
# lemma, normalised form, coarse POS and fine-grained tag.
for token in tokens:
    fields = (token.orth, token.i, token.ent_type, token.lemma_,
              token.norm_, token.pos_, token.tag_)
    # sep=" \t " gives the same spacing as interleaving "\t" arguments.
    print(*fields, sep=" \t ")

7562983679033046312 	 0 	 0 	 dog 	 dog 	 NOUN 	 NN
5439657043933447811 	 1 	 0 	 cat 	 cat 	 NOUN 	 NN
2525716904149915114 	 2 	 0 	 banana 	 banana 	 PROPN 	 NNP


##  Vocab 类

In [41]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [46]:
# `apple` is a Lexeme object. The vocab also has a `strings` attribute that maps
# words to 64-bit hash values, so that each word is stored only once in spaCy.
apple = nlp.vocab[u'apple']

### 1，Lexeme类型

In [50]:
# A Lexeme is an entry in the vocabulary (Vocab); similarity() computes how
# similar two entries are.
import spacy
nlp = spacy.load("en_core_web_lg")

# Look both entries up in one pass, then compare them.
apple, orange = (nlp.vocab[word] for word in ("apple", "orange"))
apple_oranges = apple.similarity(orange)

#### Lexeme对象的属性，通常属性是成对存在的，不带下划线的是属性的ID形式，带下划线的是属性的文本形式：

* text：文本内容（Verbatim text content）
* orth、orth_：文本ID和文本内容
* lower、lower_：文本的小写
* is_alpha、is_ascii、is_digit、is_lower、is_upper、is_title、is_punct、is_space：指示文本的类型，返回值是boolean类型
* like_url、like_num、like_email：指示文本是否是url、数字和email，返回值是boolean类型
* sentiment：标量值，用于指示词汇的积极性
* cluster：布朗Cluster ID

### 2，StringStore类型
* StringStore类是一个string-to-int的对象，通过64位的哈希值来查找词汇，或者把词汇映射到64位的哈希值：

In [53]:
from spacy.strings import StringStore
stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore["apple"]
apple_hash

8566208034543834098

* Vocab的strings属性是一个StringStore对象，用于存储共享的词汇数据：

In [55]:
# Round trip through the vocab's shared StringStore: text -> 64-bit hash -> text.
apple_id = nlp.vocab.strings['apple']
print(apple_id)
# Use the hash obtained above rather than a hard-coded literal, so the lookup
# cannot drift out of sync with the string being demonstrated.
apple_text = nlp.vocab.strings[apple_id]
print(apple_text)

8566208034543834098
apple


### 3，Vocab类
* 在初始化Vocab类时，传递参数strings是list或者StringStore对象，得到Vocab对象：

In [56]:
from spacy.vocab import Vocab
vocab = Vocab(strings=["apple", "orange"])
vocab.strings[u"apple"]

8566208034543834098

## 二，Token类
* Token是一个单词、标点符号、空格等，在自然语言处理中，把一个单词，一个标点符号，一个空格等叫做一个token

In [58]:
import spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp("I like apples and oranges")
token_apple = doc[2]
token_orange = doc[4]
print(token_apple, token_orange)

apples oranges


### 1，Token对象的函数

In [59]:
# Semantic similarity between two tokens.
token_apple.similarity(token_orange)

0.77809423

In [62]:
# Get a token adjacent to this one in the text; by default nbor() returns the
# next token.
print(token_apple.nbor())

and


In [63]:
# Get the tokens that are conjoined (coordinated) with this token in the text.
print(token_apple.conjuncts)

(oranges,)


### 2，Token对象的属性

Token对象，除了具有Lexeme对象属性之外，还具有Token对象特有的属性：

* doc：父doc
* sent：token所在的Span对象
* text：文本
* orth、orth_：文本ID和文本
* i：token在父doc中的索引
* ent_type、ent_type_：命名实体类型
* lemma、lemma_：token的基本形式（base form）
* norm、norm_：token的标准化形式
* pos、pos_：token的词性（Coarse-grained POS）
* tag、tag_：token的词性（Fine-grained POS）
* lower、lower_：token的小写形式
* is_alpha、is_ascii、is_digit、is_lower、is_upper、is_title、is_punct、is_space
* like_url、like_num、like_email
* sentiment

In [65]:
print(token_apple.doc)

I like apples and oranges


In [66]:
print([token.text for token in doc])

['I', 'like', 'apples', 'and', 'oranges']


In [70]:
span_apple = token_apple.sent
span_apple

I like apples and oranges

In [71]:
print(token_apple.orth, token_apple.orth_)

14374618037326464786 apples


In [85]:
print(token_apple.i)
span = spacy.tokens.Span(doc, 0, 3)
print(span.start, span.end, span.start_char, span.end_char)

2
0 3 0 13


In [87]:
print(token_apple.ent_type, token_apple.ent_type_)

0 


In [88]:
print(token_apple.lemma, token_apple.lemma_)

8566208034543834098 apple


In [89]:
print(token_apple.norm, token_apple.norm_)

14374618037326464786 apples


In [90]:
print(token_apple.pos, token_apple.pos_)  # part of speech, coarse-grained
print(token_apple.tag, token_apple.tag_)  # part of speech, fine-grained

92 NOUN
783433942507015291 NNS


## DOC类
* 对一个文本数据进行分词之后，Doc对象是token的序列，Span对象是Doc对象的一个切片：

In [15]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("I like apples and oranges")

span = doc[0:3]  # half-open slice: start index included, end index excluded
span

I like apples

* 获得文本的命名实体
* doc.ents
* 获得文本的名词块
* doc.noun_chunks
* 获得文本的句子
* doc.sents
* 查看doc的文本
* doc.text

In [16]:
print(doc.ents)         # named entities; an empty tuple for this sentence
print(doc.noun_chunks)  # a generator, so print shows the generator object, not the chunks
print(doc.sents)        # also a generator; wrap in list(...) to see the contents
print(doc.text)

()
<generator object at 0x000001F96BB11E58>
<generator object at 0x000001F903A98798>
I like apples and oranges


## Span 类

### Span对象是Doc对象的一个切片，Span对象的属性：

#### start：span的第一个token在doc中的索引
#### end：span最后一个token之后的位置在doc中的索引（前闭后开，不包含end本身）
#### text：span的文本
#### orth、orth_：span的文本
#### lemma_：span的lemma

In [17]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("I love dogs and cats")

# Slicing a Doc yields a Span, as the printed type shows.
# NOTE(review): type(doc[0:3]) does not depend on `token`, so every row prints
# the same type. Confirm this is the intended demonstration.
for token in doc:
    print(token.text, type(doc[0:3]))

I <class 'spacy.tokens.span.Span'>
love <class 'spacy.tokens.span.Span'>
dogs <class 'spacy.tokens.span.Span'>
and <class 'spacy.tokens.span.Span'>
cats <class 'spacy.tokens.span.Span'>


### Matcher 类似于正则表达式，返回text或者sentence中指定格式的内容 

In [18]:
nlp = spacy.load("en_core_web_sm")

In [19]:
doc = nlp("Hello, World!")

In [20]:
doc

Hello, World!

In [21]:
for token in doc:
    print(token)

Hello
,
World
!


In [22]:
# 'OP' is the per-token quantifier of a Matcher pattern:
#   '!'  negate the token spec by requiring it to match exactly 0 times
#   '?'  optional: match 0 or 1 times
#   '+'  match 1 or more times
#   '*'  match 0 or more times
optional_hello = {"LOWER": "hello", 'OP': '?'}
optional_punct = {"IS_PUNCT": True, 'OP': '?'}
required_world = {"LOWER": "world"}

# Optional "hello", optional punctuation, then a mandatory "world".
pattern = [optional_hello, optional_punct, required_world]

In [23]:
from spacy.matcher import Matcher

# The matcher shares the pipeline's vocab; patterns are registered under a string key.
matcher = Matcher(nlp.vocab)
# Pass the patterns as a list (supported since spaCy 2.2.2). The legacy
# add(key, on_match, *patterns) signature is rejected by spaCy 3.
matcher.add('HelloWorld', [pattern])

In [24]:
matches = matcher(doc)
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [25]:
# Each match is (hash of the pattern key, start token index, end token index).
# The key text is recovered from the vocab's string store and the matched
# text from the corresponding Doc slice.
for match_id, start, end in matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, doc[start:end].text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


## 正则表达式

In [26]:
# Raw string: the sample deliberately contains a literal backslash followed by 's'.
# Written as "\s" in a normal string this is an invalid escape sequence, which
# newer Python versions warn about; the raw form yields the identical value.
text = r"1234my phone number is 1256. \s Ohh its wrong! Correct one is 1234567890. call me!"

In [27]:
import re

In [28]:
pattern = re.compile(r'[a-zA-Z]{2}')

In [29]:
pattern1 = re.compile(r'^\d{4}')

In [30]:
re.findall(pattern1, text)

['1234']

In [31]:
pattern2 = re.compile(r'\\s')   # raw string: the regex \\s matches a literal backslash followed by 's'; without the r prefix the same pattern is written '\\\\s'

In [32]:
re.findall(pattern2, text)

['\\s']

In [33]:
text = "you can get free-videos on kgp-talkie"

re.findall(r'[\w]+-[\w]+', text)

['free-videos', 'kgp-talkie']

## Spacy 正则表达式

In [34]:
text = "Google announced a new Pixel at Google I/O. Google I/O is a great place to get all updates from Google"

In [35]:
text

'Google announced a new Pixel at Google I/O. Google I/O is a great place to get all updates from Google'

In [36]:
# Token-level pattern for "Google I/O": the tokens 'Google', 'I', '/' and then
# one or more 'O' tokens ('OP': '+').
# NOTE(review): the sample text contains "Google I/O" twice but only one match
# is reported. Presumably the first occurrence tokenizes differently because of
# the trailing period; verify.
pattern = [{'TEXT': 'Google'}, {'TEXT': 'I'}, {'TEXT': '/'}, {'TEXT': 'O', 'OP': '+'}]

In [37]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the i-th match.

    Parameters follow the Matcher on_match callback signature:
    matcher -- the matcher that produced the matches (unused here)
    doc     -- the document the matcher ran on
    i       -- index of the current match within ``matches``
    matches -- list of (match_id, start, end) tuples
    """
    # Only the token boundaries are needed; the match id is ignored.
    _, start, end = matches[i]
    print(doc[start:end].text)

In [38]:
# Fresh matcher for the "Google I/O" pattern; the callback prints each match.
matcher = Matcher(nlp.vocab)
# Pass the patterns as a list and the callback by keyword (supported since
# spaCy 2.2.2). The legacy add(key, on_match, *patterns) signature is rejected
# by spaCy 3.
matcher.add('Google', [pattern], on_match=callback_method)

In [39]:
doc = nlp(text)

In [40]:
matcher(doc)

Google I/O


[(11578853341595296054, 10, 14)]