In [None]:
##############################################################################
## Fundamentals for practical Text Analytics - document/word embeddings
##                                            more spacy, 
##                                            gensim to build our own ----> spacy
##                                            why:  dense numeric representations to capture meaning
##                                                  use downstream -> similarity, clustering, ML
##
##
## Learning goals:
##                 - continue spacy
##                 - foundational understanding of word vectors via Word2Vec
##                 - can roll our own vectors
##                 - generalized, pre-trained word vectors for S|UML tasks (intent classification)
##
##
## Great resources
##                 - https://spacy.io/usage/spacy-101
##                 - https://spacy.io/universe/category/courses
##
##
##############################################################################

In [None]:
# installs
! pip install -U spacy 
! pip install -U textacy
! pip install newspaper3k
! pip install afinn
! pip install whatlies

In [None]:
# imports
import spacy
from spacy import cli
from spacy import displacy

import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, cdist, squareform
import seaborn as sns


# upcoming!
# from textacy.extract.keyterms import textrank
import gensim

import textacy

from newspaper import Article
import json

from afinn import Afinn


In [None]:
# use spacy cli
MODEL = "en_core_web_md"

# https://spacy.io/models/en
# only download when the model package is not already installed, so that
# re-running this cell (or Restart & Run All) skips the slow network download
# NOTE: if spaCy itself was upgraded and spacy.load() warns that the installed
# model is incompatible, run cli.download(MODEL) manually to refresh it
if not spacy.util.is_package(MODEL):
    cli.download(MODEL)

# nlp = language model
nlp = spacy.load(MODEL)

In [None]:
############################################ WARMUP
##
##
## Scrape the Wikipedia Entry for the Netflix series Squid Game
## https://en.wikipedia.org/wiki/Squid_Game
##
## use the pre-loaded spacy NER model to parse the entities
## parse the entities into a dataframe
## create a barplot summarizing count by entity type
## TRICKY:  make it a horizontal barplot with the most frequent entity at the top of the chart
## 

In [None]:
############################################ Word/Doc Vectors
##
##

In [None]:
# let's review a simple document


In [None]:
# spacy is a pipeline


In [None]:
# remember, we can slice up docs into tokens/spans


In [None]:
# tokens have all sorts of attributes that were learned


In [None]:
# worth noting, a document can be comprised of sentences
# remember that we used to download punkt from nltk?
# that was a sentence tokenizer (divide into sentences)
# spacy has this built in


In [None]:
####################################### quick departure
# above we checked against spacy's learned vocabulary
# this is a check to see if a token is out-of-vocabulary (OOV)

# model reviews:
# https://spacy.io/usage/models


In [None]:
# this is just for reference
# 

# v = nlp.vocab.vectors
# v.shape

In [None]:
# spacy stores text and tokens efficiently
# https://spacy.io/usage/spacy-101#vocab

# spacy provides a way to lookup the vectors
# nlp.vocab.strings["golf"]

# and we can reverse
# nlp.vocab.strings[18149141486079540445]

## spacy is very flexible, and while vectors can be added and edited, 
## we can just include our own, which we will do later

In [None]:
#######################################  Vectors/Embeddings
##
## You have heard me use this term quite a bit
## we have seen this via PCA ----> take a large feature space and re-represent this in a new space
##     the goal was to encode information and reduce noise, right?
##
## we saw this in t-SNE (2 embedding dimensions) and UMAP (2 or more dimensions, depending on our needs)
## 
## 
## Well in text, we have the same idea
## we could always use the tools above, but this is a "hot" field right now -> embeddings




![](https://miro.medium.com/max/2224/0*K5a1Ws_nsbEjhbYk.png)

> Above we can see that words can be represented in these high-dimensional spaces.  The aim is to encapsulate context.  Remember bag-of-words removes sequence/order!


![](https://jalammar.github.io/images/word2vec/king-analogy-viz.png)

![](https://miro.medium.com/max/1400/1*cuOmGT7NevP9oJFJfVpRKA.png)


In [None]:
# get some vectors

# king = nlp("king").vector
# man = nlp("man").vector
# woman = nlp("woman").vector
# queen = nlp("queen").vector
# jester = nlp("jester").vector
# court = nlp("court").vector
# golf = nlp("golf").vector

In [None]:
# what do we have


In [None]:
# a quick preview


In [None]:
# some math


In [None]:
# the comparison set

# lookups = np.stack([queen, jester, court, golf])
# lookups.shape

In [None]:
# we can use cdist manually
# comp_list = ['queen', 'jester', 'court', 'golf']

# the test calc needs to be 2-d



In [None]:
# the comps -- cdist expects the test vector to be 2-d, so we used expand dims above


In [None]:
# let's plot the distances as a barplot



In [None]:
# so what did we see
# it was a small test, but conceptually we saw how these vectors can be compared
# we used distance to compare the numeric vectors and manually find the most similar
# contrived example, but sets up the concepts

In [None]:
# there are some great toolkits that have been created
# to breakdown and explore all sorts of embeddings

# there is lots you can do here; this one comes from some of the dev rel folks at rasa,
# which we will briefly see next class!
# https://github.com/RasaHQ/whatlies

In [None]:
## let's go back to the tweet


In [None]:
# what do we get for a token that is OOV?

# nlp('🔥').vector




In [None]:
## what do you notice?
## this is a convention of spacy
## for OOV tokens, spacy doesn't fail, it simply returns an array of zeroes

In [None]:
# let's review a doc vector


In [None]:
# how do we think the doc vector is constructed?
# vecs = []
# for token in doc:
#   vecs.append(token.vector)

# make it a numpy array
# va = np.array(vecs)

# vam = va.mean(axis=0)
# vam.shape

# compare
# np.all(vam == dv)

In [None]:
# remember spans?
# same still applies - token vectors are averaged over the docs and spans to represent the document

In [None]:
#######################################  YOUR TURN
## 
## scrape the text from the three articles from the URLs below
## generate the doc vectors
## what is the most similar article to URL1 via cosine distance?
#
# URL1 = "https://www.boston.com/weather/weather/2021/11/26/boston-ma-snow-forecast-friday-nov-26-2021/"
# URL2 = "https://www.sportingnews.com/us/fantasy/news/nfl-week-12-weather-updates-lack-of-rain-wind-snow-in-forecast-eases-fantasy-football-start-em-sit-em-decisions/lueadzpkttxa1l2nzyr70hr18"
# URL3 = "https://www.marketwatch.com/story/the-u-s-stock-market-suffers-ugly-black-friday-selloff-here-are-the-biggest-losers-and-winners-11637952898"
## 


In [None]:
#######################################  UP NEXT
## 
## Team data challenge!
# 