In [1]:
import spacy.cli

In [2]:
# spacy.cli.download("en_core_web_sm")

In [3]:
import spacy

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
type(nlp)

spacy.lang.en.English

In [6]:
# When u call nlp on text, spacy first tokenizes the text to produce a doc obj
# The doc object is then processed in several different steps which is called a pipeline
# The pipeline used by default models consist of a tagger, a parser and an entity recognizer
# Each pipeline component returns the processed document, ehich is then passed on to the next component

In [7]:
# Text => NLP(Tokenizer -> (tagger, parser, ner..)) => Doc

In [8]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2545cb5dd80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2545cb5e860>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2545ca885f0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2545c16e500>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2545cc19c00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2545ca88580>)]

In [9]:
doc = nlp("Apple is looking at buying U.K startup for $6 Millions")

In [10]:
for token in doc:
    print(token.text, token.pos_, token.dep_)
# Actual text
# Parts of speech
# Synctatic dependancy label

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K PROPN dobj
startup VERB dep
for ADP prep
$ SYM nmod
6 NUM compound
Millions NOUN pobj


In [11]:
spacy.explain("nsubj")

'nominal subject'

In [12]:
spacy.explain("dobj")

'direct object'

In [13]:
spacy.explain("PROPN")

'proper noun'

In [14]:
spacy.explain("pobj")

'object of preposition'

In [15]:
for token in doc:
    print(f"{token.text}, {spacy.explain(token.pos_)} = {token.pos_}, {spacy.explain(token.dep_)} = {token.dep_}")

Apple, proper noun = PROPN, nominal subject = nsubj
is, auxiliary = AUX, auxiliary = aux
looking, verb = VERB, root = ROOT
at, adposition = ADP, prepositional modifier = prep
buying, verb = VERB, complement of preposition = pcomp
U.K, proper noun = PROPN, direct object = dobj
startup, verb = VERB, unclassified dependent = dep
for, adposition = ADP, prepositional modifier = prep
$, symbol = SYM, modifier of nominal = nmod
6, numeral = NUM, compound = compound
Millions, noun = NOUN, object of preposition = pobj


In [16]:
type(doc)

spacy.tokens.doc.Doc

In [17]:
doc[0]

Apple

In [18]:
doc[0].pos_

'PROPN'

In [19]:
doc[4].pos_

'VERB'

In [20]:
doc[0].dep_

'nsubj'

In [21]:
# Some Attributes
#         1. Text
#         2. Lemmetization
#         3. Is alpha
#         4. Is stop word

In [22]:
doc[4].text

'buying'

In [23]:
doc[4].lemma_

'buy'

In [24]:
doc[4].is_alpha

True

In [25]:
doc[9].text

'6'

In [26]:
doc[9].lemma_

'6'

In [27]:
doc[9].is_alpha

False

In [28]:
doc[1].text

'is'

In [29]:
doc[1].is_stop

True

In [30]:
# Tokenization

In [31]:
text = "India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy."

In [32]:
doc = nlp(text)

In [33]:
for token in doc:
    print(token.text, end=" | ")

India | , | officially | the | Republic | of | India | , | is | a | country | in | South | Asia | . | It | is | the | seventh | - | largest | country | by | area | ; | the | most | populous | country | as | of | June | 2023 | ; | and | from | the | time | of | its | independence | in | 1947 | , | the | world | 's | most | populous | democracy | . | 

In [34]:
for entity in doc.ents:
    print(f"{entity.text} - {entity.label_} - {spacy.explain(entity.label_)}")

India - GPE - Countries, cities, states
the Republic of India - GPE - Countries, cities, states
South Asia - LOC - Non-GPE locations, mountain ranges, bodies of water
seventh - ORDINAL - "first", "second", etc.
June 2023 - DATE - Absolute or relative dates or periods
1947 - DATE - Absolute or relative dates or periods


In [35]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [36]:
doc

India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy.

In [37]:
from spacy import displacy
displacy.render(doc, style="dep", jupyter=True)

In [38]:
# Noun Chunks

In [39]:
# Only gets nouns
doc2 = nlp("Red cars do not carry higher insurance rates")
for chunk in doc2.noun_chunks:
    print(chunk.text)

Red cars
higher insurance rates


In [40]:
# Stemming

In [41]:
#               Porter Stemmer

In [42]:
import nltk

In [43]:
from nltk.stem.porter import PorterStemmer

In [44]:
pstem = PorterStemmer()

In [45]:
words = ["run", "runner", "running", "runs", "easily", "fairly", "consolingly"]
for word in words:
    print(f"{word} -------> {pstem.stem(word)}")

run -------> run
runner -------> runner
running -------> run
runs -------> run
easily -------> easili
fairly -------> fairli
consolingly -------> consolingli


In [46]:
# Snowball Stemmer better than PorterStemmer

In [47]:
from nltk.stem.snowball import SnowballStemmer
snow_stemmer = SnowballStemmer(language="english")

In [48]:
words = ["run", "runner", "running", "runs", "easily", "fairly", "consolingly"]
for word in words:
    print(f"{word} -------> {snow_stemmer.stem(word)}")

run -------> run
runner -------> runner
running -------> run
runs -------> run
easily -------> easili
fairly -------> fair
consolingly -------> consol


In [49]:
text = """Java is an object-oriented programming language. Everything in Java is associated with classes and objects, along with its attributes and methods. For example: in real life, a car is an object. The car has attributes, such as weight and color, and methods, such as drive and brake."""

In [50]:
for word in text.split():
    print(word + "-------------------------->" + pstem.stem(word))

Java-------------------------->java
is-------------------------->is
an-------------------------->an
object-oriented-------------------------->object-ori
programming-------------------------->program
language.-------------------------->language.
Everything-------------------------->everyth
in-------------------------->in
Java-------------------------->java
is-------------------------->is
associated-------------------------->associ
with-------------------------->with
classes-------------------------->class
and-------------------------->and
objects,-------------------------->objects,
along-------------------------->along
with-------------------------->with
its-------------------------->it
attributes-------------------------->attribut
and-------------------------->and
methods.-------------------------->methods.
For-------------------------->for
example:-------------------------->example:
in-------------------------->in
real-------------------------->real
life,-------------------------->lif

In [51]:
for word in text.split():
    print(word + "-------------------------->" + snow_stemmer.stem(word))

Java-------------------------->java
is-------------------------->is
an-------------------------->an
object-oriented-------------------------->object-ori
programming-------------------------->program
language.-------------------------->language.
Everything-------------------------->everyth
in-------------------------->in
Java-------------------------->java
is-------------------------->is
associated-------------------------->associ
with-------------------------->with
classes-------------------------->class
and-------------------------->and
objects,-------------------------->objects,
along-------------------------->along
with-------------------------->with
its-------------------------->it
attributes-------------------------->attribut
and-------------------------->and
methods.-------------------------->methods.
For-------------------------->for
example:-------------------------->example:
in-------------------------->in
real-------------------------->real
life,-------------------------->lif

In [52]:
doc = nlp("The cats are playing with the mice in the garden")
for token in doc:
    print(f"{token}\t{token.pos_}\t{token.lemma_}")

The	DET	the
cats	NOUN	cat
are	AUX	be
playing	VERB	play
with	ADP	with
the	DET	the
mice	NOUN	mouse
in	ADP	in
the	DET	the
garden	NOUN	garden


In [53]:
# Stop words

In [54]:
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'most', 'hundred', 'while', 'enough', 'anything', 'is', 'although', 'between', 'cannot', 'which', 'but', 'also', 'back', 'my', 'amount', '’ll', '’s', 'various', 'our', 'became', 'hers', 'out', 'fifteen', 'formerly', 'onto', 'and', 'a', 'others', '‘re', 'ours', 'only', 'full', 'both', 'during', 'few', 'empty', 'before', 'make', "'m", 'there', 'they', 'his', '‘ll', 'namely', 'might', 'what', 'already', "'ve", 'has', '‘d', "n't", 'nobody', 'again', 'five', 'besides', 'top', 'whereafter', 'per', 'indeed', 'these', 'give', 'bottom', 'here', 'everywhere', 'n’t', 'must', 'ten', 'first', 'anyhow', 'now', 'front', 'be', 'otherwise', 'three', 'twelve', 'nine', 'about', 'would', 'every', 'thereby', 'wherever', '‘ve', 'further', 'along', 'nowhere', 'please', 'third', 'same', 'or', 'such', 'thence', 'whose', 'from', 'ourselves', 'everyone', 'beside', 'everything', 'against', 'too', 'when', 'though', 'via', 'quite', 'sixty', 'except', 'hence', 'are', 'seems', 'other', 'hereafter', 'do', 'down', 'wh

In [55]:
len(stop_words)

326

In [56]:
stop_words = list(stop_words)

In [57]:
type(stop_words)

list

In [58]:
print(sorted(stop_words))

["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'he

In [59]:
nlp.vocab["first"].is_stop

True

In [60]:
# Adding a stop word
nlp.Defaults.stop_words.add("apex")
nlp.vocab["apex"].is_stop = False

In [61]:
stop_words = nlp.Defaults.stop_words
print(len(stop_words))

327


In [62]:
nlp.vocab["apex"].is_stop

False

In [63]:
nlp.Defaults.stop_words.remove("apex")

In [64]:
stop_words = nlp.Defaults.stop_words
print(len(stop_words))

326


In [65]:
# Download stop_words

In [66]:
import nltk

In [67]:
# nltk.download("stopwords")

In [68]:
from nltk.corpus import stopwords

In [69]:
# Stop words in scikit-learn

In [70]:
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS

In [71]:
stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [72]:
len(stop_words)

318

In [73]:
print(sorted(list(stop_words)))

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give

In [75]:
# Rule based matching

# Through this method we can retrieve strings from texts based on the patterns

In [82]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

text = "The quick brown fox jumps over the lazy dog. The dog barks loudly."

doc = nlp(text)

matcher = Matcher(doc.vocab)

pattern = [{"LOWER": "brown"}, {"LOWER": "fox"}]

matcher.add("FoxPattern", [pattern])

matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start: end]
    print(f"Match Found: {matched_span.text}")

Match Found: brown fox


In [83]:
# Creating patterns

In [84]:
doc = nlp("The quick-brown-fox jumps over the lazy dog. The quick brown fox eats will. \
           the quickbrownfox is dead. The dog misses the quick brownfox and quick--brown--fox.")

In [85]:
p1 = [{'LOWER': 'quickbrownfox'}]
p2 = [{'LOWER': 'quick'}, {'IS_PUNCT': True}, {'LOWER': 'brown'}, {'IS_PUNCT': True}, {'LOWER': 'fox'}]
p3 = [{'LOWER': 'quick'}, {'LOWER': 'brown'}, {'LOWER': 'fox'}]
p4 = [{'LOWER': 'quick'}, {'LOWER': 'brownfox'}]

In [86]:
matcher.add('QBF', [p1, p2, p3, p4])

In [87]:
founded_matches = matcher(doc)
print(founded_matches)

[(12825528024649263697, 1, 6), (12825528024649263697, 13, 16), (16017301484306868055, 14, 16), (12825528024649263697, 21, 22), (12825528024649263697, 29, 31), (12825528024649263697, 32, 37)]


In [88]:
for match_id, start, end in founded_matches:
    matched_span = doc[start: end]
    print(f"Match Found: {matched_span.text}")

Match Found: quick-brown-fox
Match Found: quick brown fox
Match Found: brown fox
Match Found: quickbrownfox
Match Found: quick brownfox
Match Found: quick--brown--fox
