# spaCy is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython. 
## It uses object-oriented approach rather than using list of strings like nltk


In [19]:
#  Example sentance

sent ="Hey This is me . i am looking for you since past year . where are you now a days? how's your brother ? i am helping you .he steped out from camping . i seen many bridges . nothing can be done . rahul is honest boy . america has newyork state . elon is richest among all and he owns tesla and spacex . he studied from stanford university"


In [20]:
import spacy

nlp = spacy.load('en_core_web_sm')    # python -m spacy download en_core_web_sm       # small model for english 

#  also en-core-web-md and  en-core-web-lg available , but it has larger size and requires more time to load

In [None]:
doc  = nlp(sent)       # doc will be subscriptable not iterable like list 

## Similarity 

In [60]:
doc1 = nlp("he is brave")
doc2 =nlp("he is smart")
doc3 =nlp("usa is country")

print("similarity between doc1 and doc2 is ",doc1.similarity(doc2))
print("similarity between doc1 and doc3 is ",doc1.similarity(doc3))

similarity between doc1 and doc2 is  0.9302943431426128
similarity between doc1 and doc3 is  0.5579051030573283


  print("similarity between doc1 and doc2 is ",doc1.similarity(doc2))
  print("similarity between doc1 and doc3 is ",doc1.similarity(doc3))


## Sentence Tokenizer

In [9]:
for sent in doc.sents:
    print(sent)
    

Hey This is me .
i am looking for you since past year .
where are you now a days?
how's your brother ?
i am helping you .he
steped out from camping .
i seen many bridges .
nothing can be done .
rahul is honest boy .
america has newyork state .
elon is richest among all and he owns tesla and spacex .
he studied from stanford university


## Word Tokenizer

In [5]:
for token in doc:
    if token.i > 5:
        break
    print(token.text , token.i)    # token text , token index

Hey 0
This 1
is 2
me 3
. 4
i 5


## POS Tagging  (part Of Speech)

In [14]:
for token in doc:
    print( token.i , " - ", token.text , " - ", token.pos_)    # token text , token index
    if token.i > 5:
        break

0  -  Hey  -  INTJ
1  -  This  -  PRON
2  -  is  -  AUX
3  -  me  -  PRON
4  -  .  -  PUNCT
5  -  i  -  PRON
6  -  am  -  AUX


ADJ: adjective, e.g. big, old, green, incomprehensible, first

ADP: adposition, e.g. in, to, during

ADV: adverb, e.g. very, tomorrow, down, where, there

AUX: auxiliary, e.g. is, has (done), will (do), should (do)

CONJ: conjunction, e.g. and, or, but

CCONJ: coordinating conjunction, e.g. and, or, but

DET: determiner, e.g. a, an, the

INTJ: interjection, e.g. psst, ouch, bravo, hello

NOUN: noun, e.g. girl, cat, tree, air, beauty

NUM: numeral, e.g. 1, 2017, one, seventy-seven, IV, MMXIV

PART: particle, e.g. ’s, not,

PRON: pronoun, e.g I, you, he, she, myself, themselves, somebody

PROPN: proper noun, e.g. Mary, John, London, NATO, HBO

PUNCT: punctuation, e.g. ., (, ), ?

SCONJ: subordinating conjunction, e.g. if, while, that

SYM: symbol, e.g. $, %, §, ©, +, −, ×, ÷, =, :), 😝

VERB: verb, e.g. run, runs, running, eat, ate, eating

X: other, e.g. sfpksdpsxmsa

SPACE: space, e.g.

## Name Entity Recognition

In [58]:
for token in doc.ents:
    print( token.text ," - ", token.label_ )  
   

June 28, 1971  -  DATE
The Boring Company  -  ORG
Neuralink  -  ORG
US$221.4 billion  -  MONEY
July 2022,[5  -  DATE
Forbes  -  ORG
Pretoria  -  GPE
South Africa  -  GPE
the University of Pretoria  -  ORG
Canada  -  GPE
age 17  -  DATE
Canadian  -  NORP
Two years later  -  DATE
Queen's University  -  ORG
the University of Pennsylvania  -  ORG
Economics and Physics  -  ORG
California  -  GPE
1995  -  DATE
Stanford University  -  ORG
Kimbal  -  PERSON
Compaq  -  ORG
$307 million  -  MONEY
1999  -  DATE
The same year  -  DATE
Musk  -  PERSON
X.com  -  PRODUCT
2000  -  DATE
PayPal  -  ORG
eBay  -  ORG
2002  -  DATE
$1.5 billion  -  MONEY
2002  -  DATE
Musk  -  PERSON
Chief Engineer  -  PERSON
2004  -  DATE
Tesla Motors, Inc.  -  ORG
Tesla, Inc.  -  ORG
2008  -  DATE
2006  -  DATE
Tesla  -  ORG
Tesla Energy  -  ORG
2015  -  DATE
AI  -  ORG
2016  -  DATE
Neuralink  -  ORG
The Boring Company  -  ORG
American  -  NORP
Twitter  -  PRODUCT
2022  -  DATE
$44 billion  -  MONEY
Twitter  -  PRODUCT


## Lemmitization

In [43]:
for token in doc:
    if token.i > 20:
        break
    print(token.text , " ---> ", token.lemma_)   

Hey  --->  hey
This  --->  this
is  --->  be
me  --->  I
.  --->  .
i  --->  I
am  --->  be
looking  --->  look
for  --->  for
you  --->  you
since  --->  since
past  --->  past
year  --->  year
.  --->  .
where  --->  where
are  --->  be
you  --->  you
now  --->  now
a  --->  a
days  --->  day
?  --->  ?


##  visualise the dependency parse 

In [45]:
from spacy import displacy 

displacy.render(doc, style='dep' ,jupyter=True)

##  Word Vector

## Pipeline
##### using direct core model can take a long time to train so to train only onn those features that are needed for the model we use pipeline

### following can be added to the pipeline
#### AttributeRuler
#### DependencyParser
#### EditTreeLemmatizer
#### EntityLinker
#### EntityRecognizer
#### EntityRuler
#### Lemmatizer
#### Morphologizer
#### SentenceRecognizer
#### Sentencizer
#### SpanCategorizer
#### SpanRuler
#### Tagger
#### TextCategorizer
#### Tok2Vec
#### Tokenizer
#### TrainablePipe
#### Transformer
#### Other Functions

In [72]:
#  creating a new model

nlp1 = spacy.blank('en')

nlp1.add_pipe('sentencizer')


# analyzing model

nlp1.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

In [70]:
d1 =nlp1("Hey This is me . i am looking for you since past year . where are you now a days? how's your brother ? i am helping you .he steped out from camping . i seen many bridges . nothing can be done")

d1.sents

<generator at 0x19f8646f860>

In [64]:
#  importing from file

with open ("text.txt" , "r" , encoding="utf8") as f:
    text = f.read()
print(text[:100])

Elon Reeve Musk FRS (/ˈiːlɒn/ EE-lon; born June 28, 1971) is a business magnate and investor. He is 


In [57]:
doc = nlp(text)

In [14]:
print(len(doc))
print(len(text))


9815
52888


In [16]:
for token in doc[:10]:
    print(token)   

Elon
Reeve
Musk
FRS
(
/ˈiːlɒn/
EE
-
lon
;


In [17]:
for token in text[:10]:
    print(token)   

E
l
o
n
 
R
e
e
v
e


In [40]:
l = list(doc.sents)

for sent in l:
    print(sent)
    # print("left part : ", sent.left_edge.text)
    for token in sent:
        print(token," :" , token.ent_type_ , " - ", token.ent_type )
        print("lemma form : ", token.lemma_)

    print("-----------------------------------------------------")

Hey This is me .
Hey  :   -  0
lemma form :  hey
This  :   -  0
lemma form :  this
is  :   -  0
lemma form :  be
me  :   -  0
lemma form :  I
.  :   -  0
lemma form :  .
-----------------------------------------------------
i am looking for you since past year .
i  :   -  0
lemma form :  I
am  :   -  0
lemma form :  be
looking  :   -  0
lemma form :  look
for  :   -  0
lemma form :  for
you  :   -  0
lemma form :  you
since  :   -  0
lemma form :  since
past  : DATE  -  391
lemma form :  past
year  : DATE  -  391
lemma form :  year
.  :   -  0
lemma form :  .
-----------------------------------------------------
where are you now a days?
where  :   -  0
lemma form :  where
are  :   -  0
lemma form :  be
you  :   -  0
lemma form :  you
now  :   -  0
lemma form :  now
a  :   -  0
lemma form :  a
days  :   -  0
lemma form :  day
?  :   -  0
lemma form :  ?
-----------------------------------------------------
how's your brother ?
how  :   -  0
lemma form :  how
's  :   -  0
lemma form :  

In [48]:
displacy.render(l[0], style='dep' ,jupyter=True)

In [1]:
#  using matcher

from spacy.matcher import Matcher




In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [11]:
match = Matcher(nlp.vocab)

pattern = [{'LIKE_EMAIL': True}]

match.add("MY_PaTtErN", [pattern])

In [14]:
doc = nlp("yeh this is my mail rabu@hf.ui and meet me at 12:00 pm at www.google.com with vbrabu@hf.ui")
m = match(doc)

In [15]:
print(m)

[(8404999238479332955, 5, 6), (8404999238479332955, 15, 16)]


In [16]:
for i in m:
    print(doc[i[1]:i[2]])

rabu@hf.ui
vbrabu@hf.ui
