# NLP with SpaCy

## Import SpaCy

In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; the same nlp object is reused for every text below
nlp = spacy.load('en_core_web_sm')
# Bare last expression so the notebook displays the loaded object
nlp

<spacy.lang.en.English at 0x200284c0b50>

In [3]:
# The nlp object wraps the whole processing pipeline;
# pipe_names lists the names of the components it contains
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
# Sample text used for the demonstrations in this notebook.
# NOTE(review): the embedded "\n", the missing space after "container," and the
# misspelling "surounded" are left untouched because the recorded outputs below
# (e.g. the "\n" whitespace token and the lemma 'surounde') depend on this exact string.
sample_text = "While running in Central Park, \nI noticed a discarded McDonald's container,surounded by buzzing flies was annoying."
print(sample_text)

While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.


### Docs

In [5]:
# Run the sample text through the nlp pipeline to get a doc
doc = nlp(sample_text)
# Show the type of the object the pipeline returned
type(doc)

spacy.tokens.doc.Doc

In [7]:
# A doc displays the same as the original text: print the raw string first,
# then let the notebook show the doc as the cell's last expression for comparison
print(sample_text)
doc

While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.


While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.

### Token objects within docs

In [8]:
# Iterating over a doc yields its tokens one at a time.
# The "\n" in the sample text comes out as a token of its own, which is
# why the printed output contains blank lines after the first comma.
for token in doc:
    print(token)

While
running
in
Central
Park
,


I
noticed
a
discarded
McDonald
's
container
,
surounded
by
buzzing
flies
was
annoying
.


## Token Attributes

In [10]:
# Index into the doc to pull out a single token (position 1 is "running")
token = doc[1]
token

running

### token.text: original form of the word

In [11]:
print(token.text)

running


### token.lemma_: the base or root form of the word

In [12]:
print(token.lemma_)

run


### token.pos_: The part-of-speech tag associated with the token

In [13]:
print(token.pos_)

VERB


### token.is_stop: Boolean flag to check if the token is a stop word

In [14]:
print(token.is_stop)

False


### token.is_punct: Boolean flag to check if the token is punctuation

In [16]:
print(token.is_punct)

False


### Loop through the tokens, build a dict of attributes for each one, and convert the list to a DataFrame

In [17]:
import pandas as pd

# One record per token; every key is the attribute name with a leading "."
# so the resulting column headers read like attribute accesses.
token_data = [
    {
        ".text": tok.text,
        ".lemma_": tok.lemma_,
        ".pos_": tok.pos_,
        ".is_stop": tok.is_stop,
        ".is_punct": tok.is_punct,
        ".is_space": tok.is_space,
    }
    for tok in doc
]

# Build the DataFrame from all records in one go and preview the first 10 rows
spacy_df = pd.DataFrame(token_data)
spacy_df.head(10)

Unnamed: 0,.text,.lemma_,.pos_,.is_stop,.is_punct,.is_space
0,While,while,SCONJ,True,False,False
1,running,run,VERB,False,False,False
2,in,in,ADP,True,False,False
3,Central,Central,PROPN,False,False,False
4,Park,Park,PROPN,False,False,False
5,",",",",PUNCT,False,True,False
6,\n,\n,SPACE,False,False,True
7,I,I,PRON,True,False,False
8,noticed,notice,VERB,False,False,False
9,a,a,DET,True,False,False


## Preprocessing with SpaCy

### Remove stop words

In [19]:
# empty list to collect tokens after cleaning
cleaned_tokens = []

In [21]:
# Keep the lower-cased text of every token that is not a stop word.
# The list is rebuilt from scratch in this cell (instead of appending to a
# list created in a different cell) so that re-running the cell cannot
# accumulate duplicate entries.
cleaned_tokens = [token.text.lower() for token in doc if not token.is_stop]
print(cleaned_tokens)

['running', 'central', 'park', ',', '\n', 'noticed', 'discarded', 'mcdonald', 'container', ',', 'surounded', 'buzzing', 'flies', 'annoying', '.']


### Remove punctuation and whitespace

In [22]:
## Extending the preprocessing step
# Drop stop words, punctuation, and whitespace tokens (spaces, new lines, etc.)
# and keep the lower-cased .text of everything that remains.
cleaned_tokens = [
    tok.text.lower()
    for tok in doc
    if not (tok.is_stop or tok.is_punct or tok.is_space)
]

print(cleaned_tokens)

['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying']


### Obtain lemmas

In [23]:
## Filter out stop words, punctuation, and whitespace, collecting lemmas rather than raw text
cleaned_lemmas = []
for token in doc:
    # Skip the token if it is a stop word, punctuation, or whitespace
    # (spaces, new lines, etc.)
    if token.is_stop or token.is_punct or token.is_space:
        continue
    # Keep the lower-cased .lemma_ of every remaining token
    cleaned_lemmas.append(token.lemma_.lower())

print(cleaned_lemmas)

['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


### Compare cleaned tokens vs. cleaned lemmas

In [24]:
# Compare text and lemmas.
# print() puts a space between its arguments, which is why each list in the
# output below starts with a leading space on its own line.
print("Tokenized words:\n", cleaned_tokens,"\n")
print("Lemmatized words:\n", cleaned_lemmas)

Tokenized words:
 ['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying'] 

Lemmatized words:
 ['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


## Define function for preprocessing with SpaCy

In [26]:
# define function

def preprocess_doc(doc, remove_stopwords=True, remove_punct=True, use_lemmas=False):
    """Turn a doc into a list of lower-cased token strings.

    Temporary function, for educational purposes (we will make something
    better below).

    Args:
        doc: Iterable of tokens exposing ``is_stop``, ``is_punct``,
            ``is_space``, ``text`` and ``lemma_`` (e.g. the result of
            ``nlp(text)``).
        remove_stopwords: If true, skip tokens flagged as stop words.
        remove_punct: If true, skip tokens flagged as punctuation. The same
            flag also skips whitespace tokens (``is_space``); there is no
            separate switch for whitespace.
        use_lemmas: If true, collect each token's ``lemma_``; otherwise
            collect its ``text``.

    Returns:
        list: The lower-cased strings of the kept tokens, in document order.
    """
    tokens = []
    for token in doc:
        # Drop stop words when requested
        if remove_stopwords and token.is_stop:
            continue

        # remove_punct governs both punctuation and whitespace tokens
        if remove_punct and (token.is_punct or token.is_space):
            continue

        # Choose the output form (lemma or original text) and normalise case
        form = token.lemma_ if use_lemmas else token.text
        tokens.append(form.lower())
    return tokens

In [29]:
# Run the function with both removal flags switched off

# Convert the text to a doc.
doc = nlp(sample_text)
# Tokenizing, keeping stopwords and punctuation
dirty_tokens = preprocess_doc(doc, remove_stopwords=False,remove_punct=False)
print(dirty_tokens)

['while', 'running', 'in', 'central', 'park', ',', '\n', 'i', 'noticed', 'a', 'discarded', 'mcdonald', "'s", 'container', ',', 'surounded', 'by', 'buzzing', 'flies', 'was', 'annoying', '.']


In [28]:
# Run the function with both removal flags switched on
# Tokenizing, removing stopwords and punctuation (remove_punct=True also drops
# whitespace tokens such as "\n", as the output below shows)
cleaned_tokens = preprocess_doc(doc, remove_stopwords=True,remove_punct=True)
print(cleaned_tokens)

['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying']
