## spaCy
1. [Linguistic Features](https://spacy.io/usage/linguistic-features)

In [107]:
# One-time environment setup, kept commented out so that re-running the
# notebook does not reinstall anything. Uncomment only the lines you need.
# The captured output below comes from installing en_core_web_lg 2.1.0.
# NOTE(review): the commands are unpinned and use `!pip`; consider
# `%pip install spacy==<version>` so the install targets the kernel's environment.
#!pip install spacy
#!python -m spacy download en_core_web_sm --user
#!python -m spacy download en_core_web_md --user
#!python -m spacy download en_core_web_lg --user
#!python -m spacy download en --user

Collecting en_core_web_lg==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz#egg=en_core_web_lg==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py): started
  Building wheel for en-core-web-lg (setup.py): still running...
  Building wheel for en-core-web-lg (setup.py): still running...
  Building wheel for en-core-web-lg (setup.py): still running...
  Building wheel for en-core-web-lg (setup.py): finished with status 'done'
  Stored in directory: C:\Users\qqoao\AppData\Local\Temp\pip-ephem-wheel-cache-01aqp9j7\wheels\b4\d7\70\426d313a459f82ed5e06cc36a50e2bb2f0ec5cb31d8e0bdf09
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.1.0
[+] Download and installation suc

In [2]:
import spacy

# Load the small English pipeline once; `nlp` is the callable the cells below
# use to turn raw text into a processed Doc.
# The original note lists tokenizer, tagger, parser, NER and word vectors.
# NOTE(review): confirm which of these the `sm` package actually ships.
nlp = spacy.load("en_core_web_sm")

In [46]:
# Sample passage used by the examples that follow. It is assembled from
# line-sized fragments; each fragment carries its own trailing space, so the
# pieces are joined with an empty separator.
text = "".join([
    "When Sebastian Thrun started working on self-driving cars at ",
    "Google in 2007, few people outside of the company took him ",
    "seriously. “I can tell you very senior CEOs of major American ",
    "car companies would shake my hand and turn away because I wasn’t ",
    "worth talking to,” said Thrun, in an interview with Recode earlier ",
    "this week.",
])

# Run the whole passage through the pipeline in one call.
doc = nlp(text)

In [47]:
# Bare last expression: the notebook displays the processed Doc, which is
# rendered as its text (see the output below).
doc

When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.

In [72]:
# Lemmas of every token whose coarse part-of-speech tag is VERB.
# Swap 'VERB' for NOUN, ADJ, ADV or ADP to list another word class.
verb_lemmas = []
for tok in doc:
    if tok.pos_ == 'VERB':
        verb_lemmas.append(tok.lemma_)
print(verb_lemmas)

['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'be', 'talk', 'say']


In [5]:
# Syntax overview of the passage: the text of each noun chunk, followed by
# the lemma of each token tagged VERB.
noun_phrases = [np_span.text for np_span in doc.noun_chunks]
verb_lemmas = [tok.lemma_ for tok in doc if tok.pos_ == "VERB"]

print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'be', 'talk', 'say']


In [6]:
# Named entities found in the passage, printed one per line as
# "<entity text> <entity label>".
for span in doc.ents:
    mention, label = span.text, span.label_
    print(mention, label)

Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


In [23]:
import pandas as pd

In [41]:
# Dependency-parse table: one row per token, showing the token, its syntactic
# head, the dependency relation between them, the head's part of speech and
# the token's immediate syntactic children.
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")

# Collect all rows first and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies the whole
# frame on every iteration.
records = [
    {'0_TEXT': token.text,
     '1_HEAD TEXT': token.head.text,
     '2_DEP': token.dep_,
     '3_HEAD POS': token.head.pos_,
     '4_CHILDREN': list(token.children)}
    for token in doc
]

# The explicit column list pins the numbered left-to-right order and keeps
# the same columns even when the document yields no tokens.
df = pd.DataFrame(
    records,
    columns=['0_TEXT', '1_HEAD TEXT', '2_DEP', '3_HEAD POS', '4_CHILDREN'],
)

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability]
insurance compound liability NOUN []
liability dobj shift VERB [insurance, toward]
toward prep liability NOUN [manufacturers]
manufacturers pobj toward ADP []


In [42]:
df

Unnamed: 0,0_TEXT,1_HEAD TEXT,2_DEP,3_HEAD POS,4_CHILDREN
0,Autonomous,cars,amod,NOUN,[]
1,cars,shift,nsubj,VERB,[Autonomous]
2,shift,shift,ROOT,VERB,"[cars, liability]"
3,insurance,liability,compound,NOUN,[]
4,liability,shift,dobj,VERB,"[insurance, toward]"
5,toward,liability,prep,NOUN,[manufacturers]
6,manufacturers,toward,pobj,ADP,[]


In [73]:
from spacy.symbols import nsubj, VERB

In [74]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")

# Find verbs that have a subject by walking the parse bottom-up: every token
# attached as a nominal subject (nsubj) to a head tagged VERB contributes
# that head. Using a set means each verb is reported once.
verbs = {
    tok.head
    for tok in doc
    if tok.dep == nsubj and tok.head.pos == VERB
}
print(verbs)

{shift}


In [81]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

# One line per entity: text, start and end character offsets into the
# original string, and the entity label.
for span in doc.ents:
    fields = (span.text, span.start_char, span.end_char, span.label_)
    print(*fields)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


##### IOB Scheme  
    I – Token is inside an entity.  
    O – Token is outside an entity.  
    B – Token is the beginning of an entity.  

In [105]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"San Francisco considers banning sidewalk delivery robots")

# Document level: one (text, start offset, end offset, label) tuple per
# entity span found in the sentence.
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]

# Build the frame in a single call with the column names supplied up front.
# DataFrame.append was removed in pandas 2.0, and assigning `df.columns`
# afterwards fails with a length mismatch when no entities are found.
df = pd.DataFrame(ents, columns=['0_TEXT', '1_start', '2_end', '3_Label'])
df

Unnamed: 0,0_TEXT,1_start,2_end,3_Label
0,San Francisco,0,13,GPE


In [104]:
# Token level: IOB tag and entity type for the first three tokens of `doc`
# ("San", "Francisco", "considers" for the sentence processed above).
# The rows are generated in one loop instead of three copy-pasted dicts, and
# the frame is built once because DataFrame.append was removed in pandas 2.0.
records = [
    {'0_TEXT': token.text, '1_IOB': token.ent_iob_, '2_Type': token.ent_type_}
    for token in doc[:3]
]

# The explicit column list pins the numbered left-to-right order.
df = pd.DataFrame(records, columns=['0_TEXT', '1_IOB', '2_Type'])

df

Unnamed: 0,0_TEXT,1_IOB,2_Type
0,San,B,GPE
1,Francisco,I,GPE
2,considers,O,
