# spaCy 介绍
https://spacy.io/
比NLTK效率更高，可以与深度学习结合


In [1]:
# Import spaCy and load the small English pipeline.
# Before the first run, download the model from an administrator prompt/cmd:
#   python -m spacy download en_core_web_sm
# (the package name must match the one passed to spacy.load below)

import spacy
nlp = spacy.load('en_core_web_sm')

KeyboardInterrupt: 

## 文本处理

In [None]:
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.') # the returned doc is already tokenized

In [None]:
# Tokenization: iterating over the doc yields its tokens one at a time
for token in doc:
    print(token)

Weather
is
good
,
very
windy
and
sunny
.
We
have
no
classes
in
the
afternoon
.


In [None]:
# Sentence segmentation: doc.sents yields the sentences one at a time
for sentence in doc.sents:
    print(sentence)

Weather is good, very windy and sunny.
We have no classes in the afternoon.


## 词性
[参考链接](http://www.winwaed.com/blog/2011/11/08/part-of-speech-tags/).

In [None]:
# Print every token next to its part-of-speech tag (token.pos_)
for token in doc:
    print(f'{token}-{token.pos_}')

Weather-NOUN
is-AUX
good-ADJ
,-PUNCT
very-ADV
windy-ADJ
and-CCONJ
sunny-ADJ
.-PUNCT
We-PRON
have-VERB
no-DET
classes-NOUN
in-ADP
the-DET
afternoon-NOUN
.-PUNCT


## 命名实体识别

In [None]:
# Named entities recognized in the text are exposed through doc_2.ents;
# each one is printed together with its label (ent.label_)
doc_2 = nlp('I went to Paris where I met my old friend Jack from uni.')
for ent in doc_2.ents: # ent: one recognized entity
    print(f'{ent}-{ent.label_}')

Paris-GPE
Jack-PERSON


In [None]:
from spacy import displacy

# Visualize the entities of doc_2 (style='ent') directly in the notebook output
displacy.render(doc_2,style='ent',jupyter=True)

### 案例：找出书中所有人物的名字

In [None]:
# Read the corpus
def read_file(file_name, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        file_name: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'utf-8' so the result does not depend on the platform's
            locale encoding (which differs between operating systems).

    Returns:
        The decoded file contents as one ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    with open(file_name, 'r', encoding=encoding) as file:
        return file.read()

In [None]:
# Load the book text and run it through the spaCy pipeline
text = read_file('./data/pride_and_prejudice.txt')
processed_text = nlp(text)

In [None]:
# Basic statistics: materialize the sentences and report how many there are
sentences = list(processed_text.sents)
print(len(sentences))

5311


In [None]:
# Preview the first five sentences
sentences[:5]

[The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen
 
 This eBook is for the use of anyone anywhere at no cost and with
 almost no restrictions whatsoever.  ,
 You may copy it, give it away or
 re-use it under the terms of the Project Gutenberg License included
 with this eBook or online at www.gutenberg.org
 
 
 Title: Pride and Prejudice
 
 Author: Jane Austen
 
 Posting Date: August 26, 2008 [EBook #1342]
 Release Date: June, 1998
 Last updated: February 15, 2015]
 
 Language: English
 
 
 ***,
 START OF THIS PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***
 
 
 
 
 Produced by Anonymous Volunteers
 
 
 
 
 
 PRIDE AND PREJUDICE
 
 By Jane Austen
 
 
 
 Chapter 1
 
 
 It is a truth universally acknowledged, that a single man in possession
 of a good fortune, must be in want of a wife.
 ,
 However little known the feelings or views of such a man may be on his
 first entering a neighbourhood, this truth is so well fixed in the minds
 of the surrounding families, that he

In [None]:
from collections import Counter # counts occurrences of items

In [None]:
# Count how often each person name occurs
def find_person(doc, top_n=10):
    """Return the most frequently mentioned PERSON entities in ``doc``.

    Args:
        doc: Object exposing an ``ents`` iterable whose items carry
            ``label_`` and ``lemma_`` attributes (e.g. the result of
            calling the spaCy pipeline on a text).
        top_n: How many of the most common names to return. Defaults to 10.

    Returns:
        A list of ``(lemma, count)`` tuples, most frequent first.
    """
    # Iterate over the ``doc`` argument, not the notebook-global
    # ``processed_text``, so the function works for any document passed in.
    person_counts = Counter(
        ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON'
    )
    return person_counts.most_common(top_n)

# Most frequent PERSON entities in the processed book text
print(find_person(processed_text))

[('Darcy', 398), ('Elizabeth', 324), ('Jane', 267), ('Bennet', 256), ('Collins', 174), ('Bingley', 161), ('Wickham', 108), ('Lizzy', 92), ('Gardiner', 91), ('Lady Catherine', 81)]


### 案例：恐怖袭击文本分析

In [None]:
def read_file_to_list(file_name, encoding='utf-8'):
    """Return the lines of a text file as a list of strings.

    Args:
        file_name: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'utf-8' so the result does not depend on the platform's
            locale encoding (which differs between operating systems).

    Returns:
        A list with one ``str`` per line; each line keeps its trailing
        newline character, if it had one.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    with open(file_name, 'r', encoding=encoding) as file:
        return file.readlines()

In [None]:
# One list entry per line of the dataset file.
# NOTE(review): 'terrosim' looks like a typo for 'terrorism'; the name is kept
# because other cells refer to it.
terrosim_articles = read_file_to_list('data/rand-terrorism-dataset.txt')

In [None]:
# Preview the first five entries
terrosim_articles[:5]

['CHILE.  An explosion from a single stick of dynamite went off on the patio of the Santiago Binational Center, causing $21,000 in damages.\n',
 'ISRAEL.  Palestinian terrorists fired five mortar shells into the collective settlement at Masada, causing slight damage but no injuries.\n',
 'GUATEMALA.  A bomb was thrown over the wall surrounding the U.S. Marines guards house in Guatemala City, causing damage but no injuries.\n',
 'FRANCE.  Five French students bombed the Paris offices of   Chase Manhattan Bank before dawn.  Trans-World Airways and the Bank of America were also bombed.   They claimed to be protesting the U.S. involvement in the Vietnam war.\n',
 'UNITED STATES - Unidentified anti-Castro Cubans attempted to bomb the Miami branch of the Spanish National Tourist Office.\n']

In [None]:
# Group names of interest, all lowercase.
# NOTE(review): the spaces around '-' presumably mirror how the tokenizer
# splits hyphenated names; confirm against the code that matches on them.
common_terrorist_groups = [
    'taliban', 'al - qaeda', 'hamas',
    'fatah', 'plo', 'bilad al - rafidayn',
]

# Location names of interest (countries, regions and cities), all lowercase.
common_locations = [
    'iraq', 'baghdad', 'kirkuk', 'mosul',
    'afghanistan', 'kabul', 'basra',
    'palestine', 'gaza', 'israel',
    'istanbul', 'beirut', 'pakistan',
]