In [1]:
!pip install -U spacy

Collecting spacy
  Downloading spacy-3.2.3-cp39-cp39-win_amd64.whl (11.3 MB)
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.2.1
    Uninstalling spacy-3.2.1:
      Successfully uninstalled spacy-3.2.1
Successfully installed spacy-3.2.3


In [2]:
!pip install -U spacy-lookups-data

Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.3-py2.py3-none-any.whl (98.5 MB)
Installing collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.3


In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2022-03-06 18:16:36.458452: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-03-06 18:16:36.458797: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# TOKENIZATION

Tokenization is the task of splitting a text into meanongful segments called tokens. The input to the tokenizer is a unicode text and the output is a Doc object

In [5]:
import spacy

In [6]:
nlp= spacy.load('en_core_web_sm')

In [7]:
doc= nlp("Apple is looking at buying U.K. startup for $1 billion")

In [8]:
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


# Part of Speech (POS) Tagging

In [9]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [11]:
for token in doc:
    print(token.text, token.lemma_)

Apple Apple
is be
looking look
at at
buying buy
U.K. U.K.
startup startup
for for
$ $
1 1
billion billion


In [14]:
for token in doc:
    print(f'{token.text:{15}} {token.lemma_:{15}} {token.pos_:{10}} {token.is_stop}')

Apple           Apple           PROPN      False
is              be              AUX        True
looking         look            VERB       False
at              at              ADP        True
buying          buy             VERB       False
U.K.            U.K.            PROPN      False
startup         startup         VERB       False
for             for             ADP        True
$               $               SYM        False
1               1               NUM        False
billion         billion         NUM        False


# Dependency Parsing

In [15]:
for chunk in doc.noun_chunks:
    print(f'{chunk.text:{30}}  {chunk.root.text:{15}}  {chunk.root.dep_}')

Apple                           Apple            nsubj
U.K.                            U.K.             dobj


# Name Entity Recognization

In [16]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [18]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


# Sentence Segmentation

In [19]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [20]:
for sent in doc.sents:
    print(sent)

Apple is looking at buying U.K. startup for $1 billion


In [21]:
doc1= nlp("Welcome to AB Group. Thanks for watching. Please like and subscribe")

In [22]:
for sent in doc1.sents:
    print(sent)

Welcome to AB Group.
Thanks for watching.
Please like and subscribe


In [23]:
doc1= nlp("Welcome to.*.KGP Talkies.*.Thanks for watching")

In [24]:
for sent in doc1.sents:
    print(sent)

Welcome to.*.KGP Talkies.*.Thanks for watching


### Custom Function for Sentence Segmentation

In [25]:
def set_rule(doc):
    for token in doc[:-1]:
        if token.text == ".*.":
            doc[token.i+1].is_sent_start=True
    return doc

In [27]:
nlp.add_pipe(set_rule, before='parser')

# SEE RIGHT CODE FROM DOCUMENTATION

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <function set_rule at 0x000002360731C820> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

# VISUALIZATION

In [28]:
from spacy import displacy

In [29]:
doc

Apple is looking at buying U.K. startup for $1 billion

In [30]:
displacy.render(doc, style='dep')

In [31]:
# Compact Visualization

In [32]:
displacy.render(doc, style='dep', options={'compact':True})

In [33]:
displacy.render(doc, style='ent')