# Finding Words, Phrases and concepts

In [None]:
# Install spaCy; %pip installs into the environment of the running kernel
%pip install spacy

spacy.blank() creates an nlp object that contains the processing pipeline.

nlp also includes language-specific rules used for tokenizing the text into words and punctuation. 

In [5]:
# Import spaCy
import spacy

# Create a blank English nlp object; it carries the language-specific tokenization rules
nlp = spacy.blank("en")                     # use "de" for German, "es" for Spanish

When you call nlp on a string, spaCy first tokenizes the text and creates a document object. 

In [6]:
# Process a text: calling nlp on a string tokenizes it and creates a document object
doc = nlp("Sentence in English can be processed.")

# Documents, Spans and Tokens

In [7]:
# Select the first token by indexing into the doc (indices start at 0)
first_token = doc[0]

# Print the first token's text
print(first_token.text)

Sentence


## Accessing Slice of document

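You can access a document token by token, or take a slice of it.

For example, if we write `doc[2:5]`, it returns the tokens at indices 2, 3 and 4 (indexing starts at 0, so these are the 3rd to 5th words). The token at index 5 is not included.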

In [10]:
# Slice covering "Sentence in English": tokens 0, 1 and 2 (the end index is exclusive)
first_slice = doc[:3]
print(first_slice)

# Slice covering "English can be processed": tokens 2 through 5
second_slice = doc[2:6]
print(second_slice)

Sentence in English
English can be processed


## The Token class has different attributes

Attributes of a token can be accessed using the dot operator,
i.e. if t is a token, its attributes can be accessed as t.attribute_name  

The following are some of the attributes:

i &nbsp; &nbsp;&nbsp;                   index of token<br>
ent_type &nbsp;&nbsp;                Named Entity type(int)<br>
ent_type_  &nbsp;&nbsp;              Named Entity type(str)<br>
lower_ &nbsp;&nbsp;&nbsp;                  Lowercase of token<br>
is_alpha &nbsp;&nbsp;                Does the token consist of alphabetic characters?<br> 
is_digit &nbsp;&nbsp;                Does the token consist of digits?<br>
is_lower &nbsp;&nbsp;                Is the token in lowercase? <br>
is_upper &nbsp;&nbsp;                Is the token in uppercase?<br> 
is_title &nbsp;&nbsp;                Is the token in titlecase?<br>
is_punct &nbsp;&nbsp;                Is the token punctuation?<br>
is_space &nbsp;&nbsp;                Does the token consist of whitespace characters?<br> 
like_url &nbsp;&nbsp;                Does the token resemble a URL?<br>
like_num &nbsp;&nbsp;                Does the token represent a number?<br>
like_email &nbsp;&nbsp;              Does the token resemble an email address?<br>
pos_ &nbsp;&nbsp;&nbsp;                   Coarse-grained part-of-speech from the Universal POS tag set.<br>

For the full list of token attributes, see: https://spacy.io/api/token 

In [15]:
# Take the token at index 2 and show its index within the doc
token=doc[2]

token.i

2

In [19]:
# Named entity type of the token, as a string
token.ent_type_

''

In [20]:
# Lowercase form of the token text
token.lower_

'english'

In [21]:
# Does the token consist of alphabetic characters?
token.is_alpha

True

In [22]:
# Does the token consist of digits?
token.is_digit

False

In [23]:
# Is the token entirely lowercase?
token.is_lower

False

In [25]:
# Is the token in titlecase?
token.is_title

True

In [26]:
# Does the token consist of whitespace characters?
token.is_space

False

In [29]:
# Coarse-grained part-of-speech tag, as a string
# NOTE(review): nlp is still the blank pipeline at this point, so presumably no tag has been predicted yet
token.pos_

''

In [33]:
# Check whether the token at index 3 (the link) resembles a URL
doc=nlp("This is url https://spacy.io/api/token")
token=doc[3]
token.like_url

True

In [34]:
# Check whether the token at index 3 (the address) resembles an email address
doc=nlp("This is url mymail@gmail.com")
token=doc[3]
token.like_email

True

In [37]:
# Check whether the token at index 3 represents a number;
# the number is a token of its own, with the "%" sign as the following token
doc=nlp("This is url 44%")
token=doc[3]
token.like_num

True

# Find Percentage from given Text

Task:<br>
   Iterate to access every token.<br>
   Check if each token represents number<br>
   If token represents number check if next token is % sign<br>

In [42]:
# Collect the text of every number-like token that is immediately followed by a "%" token.
percent_list = []
for token in doc:
    # A number-like token at the very end of the doc has no following token;
    # skip it instead of indexing past the end, which would raise IndexError.
    if token.like_num and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        if next_token.text == "%":
            percent_list.append(token.text)
percent_list

['44']

# Trained Pipelines

spaCy provides a number of trained pipeline packages you can download using the spacy download command.<br>
The spacy.load method loads a pipeline package by name and returns an nlp object.<br>
The package provides the binary weights that enable spaCy to make predictions.<br>
It also includes the vocabulary, meta information about the pipeline and the configuration file used to train it. <br>

Trained English Pipelines:<br>
en_core_web_sm &nbsp;   small<br>
en_core_web_md &nbsp;   medium<br>
en_core_web_lg &nbsp;   large<br>

In [None]:
# Download the small English pipeline package (en_core_web_sm), the quickest to fetch;
# replace the name with en_core_web_md or en_core_web_lg for the medium / large package
!python -m spacy download en_core_web_sm   

# Predicting Part of speech 

Load Language Pipeline<br>
Process text<br>
Iterate to check each token<br>
Print token and predicted part of speech.<br>


In [None]:
# Load the trained pipeline
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over the text
doc = nlp("I should ask my sister. She is really good at it.")

# For every token, print:
#   text      - the token text
#   pos_      - part of speech
#   dep_      - syntactic dependency relation (predicted dependency label)
#   head.text - text of the syntactic head, i.e. the parent token this word is attached to
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

# Something Additional

In [None]:
# List every label with its meaning: first the dependency labels of the
# parser component, then the tags of the tagger component.
nlp = spacy.load("en_core_web_sm")
for component in ("parser", "tagger"):
    for label in nlp.get_pipe(component).labels:
        print(label, " -- ", spacy.explain(label))

<img src='syntatic-dependency.png' width="1200" height="400">

# Predicting Named Entities

Named entities are "real world objects" that are assigned a name – for example, a person, an organization or a country.<br>
We can get predicted named entities using ents attribute of doc.<br>
It returns an iterator of Span objects, so we can print the entity text and the entity label using the .label_ attribute<br>


In [60]:
doc = nlp("Shyam likes to play football")

# doc.ents holds the predicted entity spans; print the text and label of each
for ent in doc.ents:
    print(ent.text, ent.label_)

Shyam PERSON


# Visualizing named entity

In [97]:
from spacy import displacy

# Load the pipeline and process the sample text
nlp = spacy.load("en_core_web_sm")
text = "Aditi reads book everyday.She started reading Atomic habits on 5 Feb 2022. Book price is 79$"
doc = nlp(text)

# Render the processed doc with the named-entity ("ent") style
displacy.render(doc, style="ent")

# Get quick definitions of the most common tags and labels.

spacy.explain method



In [98]:
# Look up the description of the entity label "GPE"
spacy.explain("GPE")

'Countries, cities, states'

In [99]:
# Look up the description of the dependency label "dobj"
spacy.explain("dobj")

'direct object'

In [100]:
# Look up the description of the part-of-speech tag "NNP"
spacy.explain("NNP")

'noun, proper singular'

# Rule based Matching

1.import the matcher from spacy.matcher.<br>
2.Load a pipeline and create the nlp object.<br>
3.Initialize matcher with the shared vocabulary nlp.vocab <br>
4.The matcher.add method lets you add a pattern.<br>
 &nbsp;The first argument is a unique ID to identify which pattern was matched.<br>
 &nbsp;The second argument is a list of patterns.<br>

In [104]:
import spacy
from spacy.matcher import Matcher

# Load the pipeline, then build a Matcher on the pipeline's shared vocabulary
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# One-token pattern: a token whose text is exactly "python".
# add() takes the pattern ID first and a list of patterns second.
pattern = [{"TEXT": "python"}]
matcher.add("Python_Programming", [pattern])

# Process a text and run the matcher over the resulting doc
doc = nlp("I am using python for nlp.")
matches = matcher(doc)

In [105]:
# matches is a list with one (match_id, start, end) tuple per match:
#   match_id: hash value of the pattern name
#   start:    start index of the matched span
#   end:      end index of the matched span
matches

[(5497254226871022622, 3, 4)]

In [106]:
# Print the text of every matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

python


## Matching lexical attributes

In [107]:
# Pattern for phrases such as "Smart Hackathon 2018":
# lowercase form "smart", lowercase form "hackathon", then a token made of digits
pattern = [{"LOWER": "smart"}, {"LOWER": "hackathon"}, {"IS_DIGIT": True}]

# Register the pattern, then run the matcher on a sample text
matcher.add("Hackathon", [pattern])
doc = nlp("Smart Hackathon 2020")
matches = matcher(doc)

# Print the text of each matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

# Using operators and quantifiers

Operators and quantifiers let you define how often a token should be matched.<br>
They can be added using the "OP" key.<br>

Operators:

An "!" negates the token, so it's matched 0 times.

A "?" makes the token optional, and matches it 0 or 1 times.

A "+" matches a token 1 or more times.

And finally, an "*" matches 0 or more times.

In [116]:
# A form of "implement", an optional determiner, then a noun
pattern = [
    # LEMMA matches on the base word
    {"LEMMA": "implement"},
    # "?" makes the token optional (match 0 or 1 times): an optional article (determiner)
    {"POS": "DET", "OP": "?"},
    # part of speech: noun
    {"POS": "NOUN"},
]

In [114]:
# Register the `pattern` built in the preceding cell and match it against a sample sentence
matcher.add("implement_pattern", [pattern])
doc = nlp("I have implemented miniproject")
matches = matcher(doc)

# Print the text of each matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

implemented miniproject


# Practice making Pattern

In [110]:
# Pattern: adjective + noun + (optional) second noun

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# An adjective followed by one or two nouns
pattern = [
    {"POS": "ADJ"},
    {"POS": "NOUN"},
    {"POS": "NOUN", "OP": "?"},
]

# Register the pattern and run the matcher over the doc
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Show the text of every matched span
for match_id, start, end in matches:
    span = doc[start:end]
    print("Match found:", span.text)

In [115]:
# Pattern: a form of "download" (base word) + proper noun

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Any token whose lemma is "download", followed by a proper noun
pattern = [
    {"LEMMA": "download"},
    {"POS": "PROPN"},
]

# Register the pattern and run the matcher over the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Show the text of every matched span
for match_id, start, end in matches:
    span = doc[start:end]
    print("Match found:", span.text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip
