# spaCy

Make sure to install spaCy and to download the English model (`en_core_web_sm`) before running this notebook: https://spacy.io/usage

### Segmenting a string into sentences

In [2]:
import spacy

# English pipeline (small model)
nlp = spacy.load("en_core_web_sm")
text = "This is a sentence. This is another sentence"

# Process the raw string with the spaCy pipeline
sp_text = nlp(text)

# Iterate over the sentences found in the processed text,
# printing each one preceded by a blank line
for sent in sp_text.sents:
    print("\n", sent)


 This is a sentence.

 This is another sentence


In [3]:
# Print every token together with its `idx` attribute
# (the output below shows it is the token's character offset in `text`)
for tok in sp_text:
    offset = tok.idx
    print(tok, offset)

This 0
is 5
a 8
sentence 10
. 18
This 20
is 25
another 28
sentence 36


**Printing spaCy output to a file**

The `__repr__()` method returns the string representation of an object (here: a sentence).

In [4]:
# Collect the string form of each sentence.
# repr(sentence) invokes sentence.__repr__() without calling the dunder directly.
sentences = [repr(sentence) for sentence in sp_text.sents]

# Write the sentences to out.txt. The `with` block closes the file on exit,
# which guarantees the buffered text is flushed to disk; the explicit encoding
# keeps the output independent of the platform's default.
with open("out.txt", "w", encoding="utf-8") as f:
    print("SENTENCES\n\n", "\n".join(sentences), file=f)

### Segmenting a string into tokens (Tokenizing)

In [5]:
import spacy

# English pipeline (small model)
nlp = spacy.load("en_core_web_sm")
text = "Twenty-two years after the original Jurassic Park failed, the new park, also known as Jurassic World, is open for business."

# Process the string; the result can be iterated token by token
sp_text = nlp(text)

# Print the text of every token on its own line
for token in sp_text:
    print(token.text)

Twenty
-
two
years
after
the
original
Jurassic
Park
failed
,
the
new
park
,
also
known
as
Jurassic
World
,
is
open
for
business
.


In [6]:
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

# Build an English language object and take its tokenizer, which comes with
# the default English settings (punctuation rules and exceptions)
nlp = English()
tokenizer = nlp.tokenizer

In [7]:
# Apply the tokenizer obtained from `nlp.tokenizer` directly to `text`;
# the returned object is echoed as the cell's output
tokenizer(text)

Twenty-two years after the original Jurassic Park failed, the new park, also known as Jurassic World, is open for business.

### Detecting stop words

In [8]:
import spacy

# Load the English model
nlp = spacy.load('en_core_web_sm')
text = "Twenty-two years after the original Jurassic Park failed, the new park, also known as Jurassic World, is open for business."

# Run the spaCy pipeline
sp_text = nlp(text)

# Print only the tokens flagged as stop words.
# `token.is_stop` is tested directly rather than compared with `== True`.
for token in sp_text:
    if token.is_stop:
        print(token)

two
after
the
the
also
as
is
for


### POS Tagging

In [9]:
import spacy

# English pipeline (small model)
nlp = spacy.load("en_core_web_sm")

# Sentence used for the tagging demo
sentence = "Amélie is a story about a girl named Amélie whose childhood was suppressed by her Father's mistaken concerns of a heart defect."

# Run the pipeline on the sentence
nlp_sentence = nlp(sentence)

# Build one (token, tag_, pos_) triple per token: `tag_` and `pos_` are the
# token's tag and part of speech as shown in the output below
spacy_pos_tagged = [
    (tok, tok.tag_, tok.pos_)
    for tok in nlp_sentence
]
spacy_pos_tagged

[(Amélie, 'NNP', 'PROPN'),
 (is, 'VBZ', 'VERB'),
 (a, 'DT', 'DET'),
 (story, 'NN', 'NOUN'),
 (about, 'IN', 'ADP'),
 (a, 'DT', 'DET'),
 (girl, 'NN', 'NOUN'),
 (named, 'VBN', 'VERB'),
 (Amélie, 'NNP', 'PROPN'),
 (whose, 'WP$', 'ADJ'),
 (childhood, 'NN', 'NOUN'),
 (was, 'VBD', 'VERB'),
 (suppressed, 'VBN', 'VERB'),
 (by, 'IN', 'ADP'),
 (her, 'PRP$', 'ADJ'),
 (Father, 'NNP', 'PROPN'),
 ('s, 'POS', 'PART'),
 (mistaken, 'JJ', 'ADJ'),
 (concerns, 'NNS', 'NOUN'),
 (of, 'IN', 'ADP'),
 (a, 'DT', 'DET'),
 (heart, 'NN', 'NOUN'),
 (defect, 'NN', 'NOUN'),
 (., '.', 'PUNCT')]

### Lemmatizing

In [10]:
import spacy

# Load the English model without the parser and NER components,
# which are not needed to obtain lemmas
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
word_form = "suppressed"

# Run the pipeline on the single word. The result is a spaCy Doc
# (a sequence of tokens), not a plain list, even for one-word input.
token = nlp(word_form)

# The lemma of the first (and here only) token of that result
token[0].lemma_

'suppress'

### Dependency Parsing

In [11]:
import spacy

# English pipeline (small model), applied to the example sentence
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple bought U.K. startup for $1 billion")

# One line per token: text, lemma, part of speech, tag,
# dependency label and the token's head
for token in doc:
    fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.head,
    )
    print(*fields)

Apple apple PROPN NNP nsubj bought
bought buy VERB VBD ROOT bought
U.K. u.k. PROPN NNP compound startup
startup startup VERB VBG dobj bought
for for ADP IN prep bought
$ $ SYM $ quantmod billion
1 1 NUM CD compound billion
billion billion NUM CD pobj for


### Named Entity Recognition

In [12]:
import spacy
from spacy import displacy
from pprint import pprint

# Load the English model. No extra flag is passed: `entity=True` is not a
# `spacy.load` parameter (newer spaCy releases reject unknown keyword
# arguments), and `doc.ents` is available from the default pipeline.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Amélie is a story about a girl named Amélie whose childhood was suppressed by her Father's mistaken concerns of a heart defect.")

# List the named entities found in the document
print("NEs:", list(doc.ents))

# Render the entities inline in the notebook
displacy.render(doc, style='ent', jupyter=True)

NEs: [Amélie, Amélie]
