<a href="https://colab.research.google.com/github/Chaliantosh/datascience_cheatsheets/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Tokenization Basics**

In [2]:
s1 = 'Apple is looking to buying U.K. startup for $1 billion !'
s2 = 'Hello all, we are here to help you! email support@udemy.com or visit us at http://www.udemy.com!'
s3 = '10km cab ride almost costs $20 in NYC'
s4 = "Let's watch movie together."

In [3]:
import spacy

In [4]:
#pretrained model in spacy sm indicates small
nlp = spacy.load(name='en_core_web_sm')

In [5]:
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[K     |████████████████████████████████| 33.5 MB 1.3 MB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [6]:
import en_core_web_md

In [7]:
#pretrained model in spacy md indicates medium
nlp_1 = spacy.load(name='en_core_web_md')

In [8]:
doc1 = nlp(s1)
print(s1)
for token in doc1:
  print(token)

Apple is looking to buying U.K. startup for $1 billion !
Apple
is
looking
to
buying
U.K.
startup
for
$
1
billion
!


In [9]:
doc2 = nlp(s2)
print(s2)
for token in doc2:
  print(token)

Hello all, we are here to help you! email support@udemy.com or visit us at http://www.udemy.com!
Hello
all
,
we
are
here
to
help
you
!
email
support@udemy.com
or
visit
us
at
http://www.udemy.com
!


In [10]:
doc3 = nlp(s3)
print(s3)
for token in doc3:
  print(token)

10km cab ride almost costs $20 in NYC
10
km
cab
ride
almost
costs
$
20
in
NYC


In [11]:
doc4 = nlp(s4)
print(s4)
for token in doc4:
  print(token)

Let's watch movie together.
Let
's
watch
movie
together
.


In [12]:
type(doc4)

spacy.tokens.doc.Doc

In [13]:
len(doc4)

6

In [14]:
doc4[2]

watch

In [15]:
doc4[2:4]

watch movie

**Stemming**

In [16]:
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly']

In [17]:
import nltk

In [18]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [19]:
p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer(language='english')

In [20]:
for word in words:
  print(word + ' ------ ' +p_stemmer.stem(word))

run ------ run
runner ------ runner
running ------ run
ran ------ ran
runs ------ run
easily ------ easili
fairly ------ fairli


In [21]:
for word in words:
  print(word + ' ------ ' +s_stemmer.stem(word))

run ------ run
runner ------ runner
running ------ run
ran ------ ran
runs ------ run
easily ------ easili
fairly ------ fair


**Lemmatization**

In [22]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [23]:
doc1 = nlp("The striped bats are hanging on their feet for best")

In [24]:
for token in doc1:
  print(token.text, '\t', token.lemma_)

The 	 the
striped 	 stripe
bats 	 bat
are 	 be
hanging 	 hang
on 	 on
their 	 their
feet 	 foot
for 	 for
best 	 good


In [25]:
#compare the above result with potter stemmer
s1 = "The striped bats are hanging on their feet for best"
for word in s1.split():
  print(word + ' ------ ' +p_stemmer.stem(word))

The ------ the
striped ------ stripe
bats ------ bat
are ------ are
hanging ------ hang
on ------ on
their ------ their
feet ------ feet
for ------ for
best ------ best


**Stopwords**

In [26]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [27]:
print(nlp.Defaults.stop_words)

{'only', 'this', 'namely', 'whence', 'twelve', 'would', 'within', 'along', 'me', 'must', 'is', 'and', 'more', 'noone', 'twenty', 'many', 'thereupon', 'else', 'thereafter', 'us', "n't", 'once', 'via', 'to', 'being', 'over', 're', 'own', '‘s', 'get', 'how', 'using', '‘ll', 'during', 'put', 'among', '’d', 'last', 'just', 'became', 'therein', 'themselves', 'by', 'elsewhere', 'whereupon', 'eight', 'had', 'keep', 'becoming', 'anyhow', 'regarding', 'bottom', 'may', 'toward', 'another', 'are', 'can', 'anywhere', 'after', 'nevertheless', 'of', 'serious', 'around', 'thus', 'seem', 'fifty', 'been', 'wherever', 'up', 'name', 'they', 'sometime', 'part', 'really', 'full', 'done', 'whenever', 'down', 'even', 'always', 'few', '‘re', 'nobody', 'forty', 'ten', 'very', 'your', 'because', 'whereafter', 'four', 'besides', 'ourselves', 'through', 'cannot', 'wherein', 'afterwards', "'ve", 'third', 'if', 'perhaps', 'least', 'whereas', 'anything', 'do', 'take', 'thereby', 'since', 'ever', 'everything', 'hundre

In [28]:
len(nlp.Defaults.stop_words)

326

In [29]:
#to check whether a word is stopword or not?
nlp.vocab['always'].is_stop

True

In [30]:
nlp.vocab['finance'].is_stop

False

In [31]:
nlp.vocab['asdf'].is_stop

False

In [32]:
#to add a word to stopword list
nlp.Defaults.stop_words.add('asdf')
nlp.vocab['asdf'].is_stop = True
nlp.vocab['asdf'].is_stop

True

In [33]:
#len will be 327 since 'asdf' has been added to stopwords list
len(nlp.Defaults.stop_words)

327

In [34]:
print(nlp.Defaults.stop_words)

{'only', 'this', 'namely', 'whence', 'twelve', 'would', 'within', 'along', 'me', 'must', 'is', 'and', 'more', 'noone', 'twenty', 'many', 'thereupon', 'else', 'thereafter', 'us', "n't", 'once', 'via', 'to', 'being', 'over', 're', 'own', '‘s', 'get', 'how', 'using', '‘ll', 'during', 'put', 'among', '’d', 'last', 'just', 'became', 'therein', 'themselves', 'by', 'elsewhere', 'whereupon', 'eight', 'had', 'keep', 'becoming', 'anyhow', 'regarding', 'bottom', 'may', 'toward', 'another', 'are', 'can', 'anywhere', 'after', 'nevertheless', 'of', 'serious', 'around', 'thus', 'seem', 'fifty', 'been', 'wherever', 'up', 'name', 'they', 'sometime', 'part', 'really', 'full', 'done', 'whenever', 'down', 'even', 'always', 'few', '‘re', 'nobody', 'forty', 'ten', 'very', 'your', 'because', 'whereafter', 'four', 'besides', 'ourselves', 'through', 'cannot', 'wherein', 'afterwards', "'ve", 'third', 'if', 'perhaps', 'least', 'whereas', 'anything', 'do', 'take', 'thereby', 'since', 'ever', 'everything', 'hundre

In [35]:
#to remove a word from stopword list
nlp.Defaults.stop_words.remove('only')
nlp.vocab['only'].is_stop = False
nlp.vocab['only'].is_stop

False

**Rule Based Matching**

In [44]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [45]:
#Import the matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab) # created matcher object and pass nlp.vocab

In [46]:
#create patterns
pattern_1 = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
pattern_2 = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]

In [48]:
#Add pattern to matcher object, rule consists of => An ID key, an on_match callback, one or more patterns
matcher.add('Hello World', [pattern_1, pattern_2])

In [50]:
#create a document
doc = nlp(" 'Hello World' are the first two printed words for most of the programmers, printing 'Hello-World' is most common for beginners")

In [52]:
#finding the matches
find_matches = matcher(doc)
print(find_matches)

[(8585552006568828647, 2, 4), (8585552006568828647, 19, 22)]


In [53]:
#define a function to find the matches
for match_id, start, end in find_matches:
  string_id = nlp.vocab.strings[match_id]
  span = doc[start:end]
  print(match_id, string_id, start, end, span.text)

8585552006568828647 Hello World 2 4 Hello World
8585552006568828647 Hello World 19 22 Hello-World


In [54]:
#remove the matches
matcher.remove('Hello World')

In [57]:
#Setting pattern options and quantifiers
#Redefine patterns
pattern_3 = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
pattern_4 = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'world'}]
#'OP': '*' ---> this is going to allow this pattern to match zero or more times for any punctuation
#Add a new set of patterns to matcher
matcher.add('Hello World', [pattern_3, pattern_4])
#create a document
doc_2 = nlp("You can print Hello World or hello world or Hello-World")
#finding the matches
find_matches = matcher(doc_2)
print(find_matches)

[(8585552006568828647, 3, 5), (8585552006568828647, 6, 8), (8585552006568828647, 9, 12)]


**Phrase Based Matching**

In [58]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [59]:
#Import the phrase matcher library
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [60]:
phrase_list = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

In [61]:
#convert each phrase to a document object
phrase_patterns = [nlp(text) for text in phrase_list]

In [62]:
phrase_patterns

[Barack Obama, Angela Merkel, Washington, D.C.]

In [65]:
#these are spacy docs that is why we don't have any quotes here
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [66]:
#pass each doc object into the matcher
matcher.add("TerminologyList", None, *phrase_patterns)

In [67]:
doc_3 = nlp("German Chancellor Angela Merkel and US President Barack Obama "
             "converse in the Oval Office inside the White House in Washington, D.C.")

In [68]:
#finding the matches
find_matches = matcher(doc_3)
print(find_matches)

[(3766102292120407359, 2, 4), (3766102292120407359, 7, 9), (3766102292120407359, 19, 22)]


In [69]:
#define a function to find the matches
for match_id, start, end in find_matches:
  string_id = nlp.vocab.strings[match_id] # get string representation
  span = doc_3[start:end] # get the matched span
  print(match_id, string_id, start, end, span.text)

3766102292120407359 TerminologyList 2 4 Angela Merkel
3766102292120407359 TerminologyList 7 9 Barack Obama
3766102292120407359 TerminologyList 19 22 Washington, D.C.


**POS Tagging**

In [73]:
s1 = "Apple is looking at buying U.K. startup for $1 billion"

In [74]:
import spacy
nlp = spacy.load(name='en_core_web_sm')

In [75]:
doc = nlp(s1)

In [76]:
for token in doc:
  print(token.text, token.pos_, token.tag_, spacy.explain(token.tag_))

Apple PROPN NNP noun, proper singular
is AUX VBZ verb, 3rd person singular present
looking VERB VBG verb, gerund or present participle
at ADP IN conjunction, subordinating or preposition
buying VERB VBG verb, gerund or present participle
U.K. PROPN NNP noun, proper singular
startup NOUN NN noun, singular or mass
for ADP IN conjunction, subordinating or preposition
$ SYM $ symbol, currency
1 NUM CD cardinal number
billion NUM CD cardinal number


In [77]:
for key, val in doc.count_by(spacy.attrs.POS).items():
  print(key, doc.vocab[key].text, val)

96 PROPN 2
87 AUX 1
100 VERB 2
85 ADP 2
92 NOUN 1
99 SYM 1
93 NUM 2


In [78]:
from spacy import displacy

In [79]:
displacy.render(docs=doc, style='dep', options={'distance': 80}, jupyter=True)

**Named Entity Recognition**

In [80]:
s1 = "Apple is looking at buying U.K. startup for $1 billion"
s2 = "San Francisco considers banning sidewalk delivery robots"
s3 = "facebook is hiring a new vice president in U.S."

In [81]:
import spacy
nlp = spacy.load(name='en_core_web_sm')

In [93]:
doc1 = nlp(s1)
for ent in doc1.ents:
  print(ent.text, ent.label_, str(spacy.explain(ent.label_)))

Apple ORG Companies, agencies, institutions, etc.
U.K. GPE Countries, cities, states
$1 billion MONEY Monetary values, including unit


In [84]:
doc2 = nlp(s2)
for ent in doc2.ents:
  print(ent.text, ent.label_, str(spacy.explain(ent.label_)))

San Francisco GPE Countries, cities, states


In [85]:
doc3 = nlp(s3)
for ent in doc3.ents:
  print(ent.text, ent.label_, str(spacy.explain(ent.label_)))
  #faacebook is not classified as an entity

U.S. GPE Countries, cities, states


In [86]:
ORG = doc3.vocab.strings['ORG']

In [87]:
from spacy.tokens import Span
new_ent = Span(doc3, 0, 1, label = ORG)

In [88]:
doc3.ents = list(doc3.ents) + [new_ent]

In [89]:
doc3.ents

(facebook, U.S.)

In [90]:
for ent in doc3.ents:
  print(ent.text, ent.label_, str(spacy.explain(ent.label_)))

facebook ORG Companies, agencies, institutions, etc.
U.S. GPE Countries, cities, states


In [91]:
from spacy import displacy

In [95]:
displacy.render(docs = doc1, style = 'ent', jupyter = True)

In [97]:
displacy.render(docs = doc1, style = 'ent', options={'ents': ['ORG']}, jupyter = True)

**Sentence Segmentation**

In [102]:
s1 = "This is a sentence. This is a second sentence. This is last sentence."
s2 = "This is a sentence; This is a second sentence; This is last sentence;"

In [103]:
import spacy
nlp = spacy.load(name='en_core_web_sm')

In [104]:
doc1 = nlp(s1)

In [105]:
for sent in doc1.sents:
  print(sent.text)

This is a sentence.
This is a second sentence.
This is last sentence.


In [106]:
s3 = "This is a sentence. This is a second U.K. sentence. This is last sentence."

In [109]:
doc3 = nlp(s3)
for sent in doc3.sents:
  print(sent.text)

This is a sentence.
This is a second U.K. sentence.
This is last sentence.


In [110]:
doc2 = nlp(s2)
for sent in doc2.sents:
  print(sent.text)
  #Spacy treats this entire sentence as a single sentence.

This is a sentence; This is a second sentence; This is last sentence;


In [120]:
def set_custom_boundaries(doc):
  for token in doc[:-1]:
    if token.text == ';':
      print(token.i)
      doc[token.i+1].is_sent_start = True
  return doc  

In [115]:
doc2 = nlp(s2)
for sent in doc2.sents:
  print(sent.text)
  #Spacy treats this entire sentence as a single sentence.

This is a sentence; This is a second sentence; This is last sentence;


In [121]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [123]:
nlp.add_pipe(set_custom_boundaries, before='attribute_ruler')

ValueError: ignored