## Lab 3: Natural Language Processing

- Spacy documentation: https://spacy.io/usage/spacy-101
- Another library is NLTK (we don't cover in this class): https://www.nltk.org/

In [1]:
import pandas as pd
import spacy #conda install spacy
             #python -m spacy download en_core_web_sm
             #pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

In [36]:
# Source: https://www.nytimes.com/live/2020/10/14/world/covid-coronavirus
text = """Sixteen states each added more not new cases in the seven-day period ending Monday than they had in any \
other weeklong stretch of the pandemic. North Dakota and South Dakota are reporting more new cases per person \
than any state has previously. And in Wisconsin, home to 10 of the country’s 20 metro areas with the highest \
rates of recent cases, crews are preparing a field hospital at the state fairgrounds."""

# Parse text through the `nlp` model
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

### Sentences

In [37]:
sents = list(doc.sents)
for i, sent in enumerate(sents):
    print(i+1, sent)

1 Sixteen states each added more not new cases in the seven-day period ending Monday than they had in any other weeklong stretch of the pandemic.
2 North Dakota and South Dakota are reporting more new cases per person than any state has previously.
3 And in Wisconsin, home to 10 of the country’s 20 metro areas with the highest rates of recent cases, crews are preparing a field hospital at the state fairgrounds.


### Tokens
i.e. a word, punctuation symbol, whitespace, etc.

In [38]:
print ([token.text for token in sents[0]])

['Sixteen', 'states', 'each', 'added', 'more', 'not', 'new', 'cases', 'in', 'the', 'seven', '-', 'day', 'period', 'ending', 'Monday', 'than', 'they', 'had', 'in', 'any', 'other', 'weeklong', 'stretch', 'of', 'the', 'pandemic', '.']


Lemmatization is the method of converting a token to it’s root/base form.

For example: 'played' and 'playing' have the same root: 'play'.

In [39]:
word = doc[5]

print('Original word: {}'.format(doc[3]))

print('Lemmatized word: {}'.format(word.lemma_))

# Attributes of a word
print('\nWord: {} \nPart of speech: {} \nAll alphabet letters? {}\nIs punctuation? {}\nIs stop word? {}\nDependency: {}'\
      .format(word.text, word.pos_, word.is_alpha, word.is_punct, word.is_stop, word.dep_))

Original word: added
Lemmatized word: not

Word: not 
Part of speech: PART 
All alphabet letters? True
Is punctuation? False
Is stop word? True
Dependency: neg


### Entities

In [6]:
entities = [(entity.label_, entity.text) for entity in doc.ents]
print(entities)

[('CARDINAL', 'Sixteen'), ('DATE', 'seven-day'), ('DATE', 'Monday'), ('GPE', 'North Dakota'), ('GPE', 'South Dakota'), ('GPE', 'Wisconsin'), ('CARDINAL', '10'), ('QUANTITY', '20 metro')]


### Cleaning data
One common step in data cleaning is getting rid of stop words. Stop words are words in the English language that don't necessarily add a lot in terms of meaning - e.g., 'and', 'or', 'by', etc. Spacy and NLTK have build in lists of stop words.

In [7]:
word.is_stop

False

In [8]:
# Removing stop words & punctuation
clean_tokens = [t for t in doc if (t.is_stop == False) and (t.is_punct == False)]
print(clean_tokens)

[Sixteen, states, added, new, cases, seven, day, period, ending, Monday, weeklong, stretch, pandemic, North, Dakota, South, Dakota, reporting, new, cases, person, state, previously, Wisconsin, home, 10, country, 20, metro, areas, highest, rates, recent, cases, crews, preparing, field, hospital, state, fairgrounds]


## Preprocessing: Amazon Example
Download dataset from here: https://www.kaggle.com/sid321axn/amazon-alexa-reviews

In [9]:
df_amazon = pd.read_csv("Data/amazon_alexa.tsv", sep="\t")
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [10]:
# example from: https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()
punctuations = string.punctuation

def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens



In [11]:
tokens = {}
for i in range(len(df_amazon)):
    tokens[i] = spacy_tokenizer(df_amazon.verified_reviews[i])

In [12]:
tokens

{0: ['love', 'echo'],
 1: ['loved'],
 2: ['playing',
  'game',
  'answer',
  'question',
  'correctly',
  'alexa',
  'says',
  'got',
  'wrong',
  'answers',
  'like',
  'able',
  'turn',
  'lights',
  'away',
  'home'],
 3: ['lot',
  'fun',
  'thing',
  '4',
  'yr',
  'old',
  'learns',
  'dinosaurs',
  'control',
  'lights',
  'play',
  'games',
  'like',
  'categories',
  'nice',
  'sound',
  'playing',
  'music'],
 4: ['music'],
 5: ['received',
  'echo',
  'gift',
  'needed',
  'bluetooth',
  'play',
  'music',
  'easily',
  'accessible',
  'found',
  'smart',
  'speaker',
  'wait'],
 6: ['having',
  'cellphone',
  'use',
  'features',
  'ipad',
  'use',
  'great',
  'alarm',
  'u',
  'r',
  'deaf',
  'hear',
  'alarm',
  'bedroom',
  'living',
  'room',
  'reason',
  'fun',
  'ask',
  'random',
  'questions',
  'hear',
  'response',
  'smartbon',
  'politics'],
 7: ['think',
  '5th',
  'purchased',
  'working',
  'getting',
  'room',
  'house',
  'like',
  'features',
  'offer',


## After pre-processing, you can analyze the resulting data...
## Some Applications of NLP:
- Sentiment Analysis
- Document Similarity
- Text analysis (e.g, common issues in reviews/surveys, frequently tweeted topics, etc.)

In [14]:
from negspacy.negation import Negex

ModuleNotFoundError: No module named 'negspacy'

In [15]:
pip install negspacy

Collecting negspacy
  Downloading negspacy-0.1.8.tar.gz (9.8 kB)
Building wheels for collected packages: negspacy
  Building wheel for negspacy (setup.py) ... [?25ldone
[?25h  Created wheel for negspacy: filename=negspacy-0.1.8-py3-none-any.whl size=8842 sha256=59a23bcb52f96dbefc1c53d1a8ce3da51a1d22b21129d2afc1846c53630e5d1b
  Stored in directory: /Users/ezhang/Library/Caches/pip/wheels/a9/ef/52/d5274e443e446f41084fb7436bffe4f78b36a5bf637c5e4bd4
Successfully built negspacy
Installing collected packages: negspacy
Successfully installed negspacy-0.1.8
Note: you may need to restart the kernel to use updated packages.
