In [56]:
import pandas as pd
import numpy as np

import spacy
import nltk
from nltk.stem import PorterStemmer
from spacy.tokens import Span

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from spacy import displacy

In [3]:
# Blank English pipeline: it tokenizes text but has no pipeline components.
nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa. Then he said i can do this all day")

# One token per line.
for token in doc:
    print(token.text)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
i
can
do
this
all
day


In [4]:
# Names of the components registered on the current pipeline.
nlp.pipe_names

[]

In [5]:
# Replace the blank pipeline with the pre-trained `en_core_web_sm` pipeline.
nlp= spacy.load("en_core_web_sm")

In [6]:
# Component names of the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
# (name, component object) pairs of the loaded pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x273e333a750>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x273e3339c10>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x273e332dd90>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x273e366bfd0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x273e32258d0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x273e332db60>)]

In [8]:
doc = nlp("Captain america ate 100$ of samosa. Then he said i can do this all day.")

# Token text, part-of-speech tag and lemma, separated by "  |  ".
for token in doc:
    print(token, token.pos_, token.lemma_, sep="  |  ")

Captain  |  PROPN  |  Captain
america  |  PROPN  |  america
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  PROPN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
i  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


In [9]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Text of every entity found in the sentence.
for ent in doc.ents:
    print(ent.text)

Tesla Inc
$45 billion


In [10]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Entity text together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_, sep="  |  ")

Tesla Inc  |  ORG
$45 billion  |  MONEY


In [11]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Entity text, its label, and spaCy's description of that label.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [12]:
# Render the entities of the current `doc` with displaCy.
# NOTE(review): `displacy` is already imported in the first cell, so this
# import is redundant when the notebook is run top to bottom.
from spacy import displacy
displacy.render(doc, style="ent")

# Adding only NER to a blank pipeline

In [13]:
# Start from an empty English pipeline ...
nlp = spacy.blank("en")
# ... and load the trained pipeline that will donate its `ner` component.
source_nlp = spacy.load("en_core_web_sm")

# Add the `ner` component, sourced from the trained pipeline, to the blank one.
nlp.add_pipe("ner", source=source_nlp)
nlp.pipe_names

['ner']

# Stemming and Lemmatization

In [14]:
# NLTK Porter stemmer used by the stemming example below.
stemmer= PorterStemmer()

In [15]:
words = ["eating", "eats", "eat", "adjustable", "rafting", "ability", "meeting"]

# Each word next to the stem the Porter stemmer produces for it.
for word in words:
    print(word, stemmer.stem(word), sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


In [16]:
# Join the sample words into a single space-separated string.
words = [
    "eating",
    "eats",
    "eat",
    "adjustable",
    "rafting",
    "ability",
    "meeting",
]
str.join(" ", words)

'eating eats eat adjustable rafting ability meeting'

In [17]:
# Reload the full pipeline: `nlp` was last bound to a pipeline holding only `ner`.
nlp = spacy.load("en_core_web_sm")
doc = nlp("eating eats eat adjustable rafting ability meeting better")

# Each token next to its lemma.
for token in doc:
    print(token, token.lemma_, sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meeting
better  |  well


In [18]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("eating eats eat adjustable rafting ability meeting better")

# `lemma_` is the lemma as a string; `lemma` is the integer ID printed beside it.
for token in doc:
    print(token, token.lemma_, token.lemma, sep="  |  ")

eating  |  eat  |  9837207709914848172
eats  |  eat  |  9837207709914848172
eat  |  eat  |  9837207709914848172
adjustable  |  adjustable  |  6033511944150694480
rafting  |  raft  |  7154368781129989833
ability  |  ability  |  11565809527369121409
meeting  |  meeting  |  14798207169164081740
better  |  well  |  4525988469032889948


In [19]:
doc = nlp("Mando talked for 3 hours although talking is not his thing he became talkative")

# Token, lemma string and lemma ID for a full sentence.
for token in doc:
    print(token, token.lemma_, token.lemma, sep="  |  ")

Mando  |  Mando  |  7837215228004622142
talked  |  talk  |  13939146775466599234
for  |  for  |  16037325823156266367
3  |  3  |  602994839685422785
hours  |  hour  |  9748623380567160636
although  |  although  |  343236316598008647
talking  |  talk  |  13939146775466599234
is  |  be  |  10382539506755952630
not  |  not  |  447765159362469301
his  |  his  |  2661093235354845946
thing  |  thing  |  2473243759842082748
he  |  he  |  1655312771067108281
became  |  become  |  12558846041070486771
talkative  |  talkative  |  13364764166055324990


# POS Tagging

In [20]:
doc = nlp("Elon flew to mars yesterday. He carried biryani masala with him.")

# Token, POS name, POS integer ID and spaCy's description of the POS name.
for token in doc:
    print(token, token.pos_, token.pos, spacy.explain(token.pos_), sep="  |  ")

Elon  |  PROPN  |  96  |  proper noun
flew  |  VERB  |  100  |  verb
to  |  ADP  |  85  |  adposition
mars  |  NOUN  |  92  |  noun
yesterday  |  NOUN  |  92  |  noun
.  |  PUNCT  |  97  |  punctuation
He  |  PRON  |  95  |  pronoun
carried  |  VERB  |  100  |  verb
biryani  |  ADJ  |  84  |  adjective
masala  |  NOUN  |  92  |  noun
with  |  ADP  |  85  |  adposition
him  |  PRON  |  95  |  pronoun
.  |  PUNCT  |  97  |  punctuation


In [21]:
doc = nlp("Wow! Dr. Strange made 265 million $ on the very first day.")

# Token, POS name and its description.
for token in doc:
    print(token, token.pos_, spacy.explain(token.pos_), sep="  |  ")

Wow  |  INTJ  |  interjection
!  |  PUNCT  |  punctuation
Dr.  |  PROPN  |  proper noun
Strange  |  PROPN  |  proper noun
made  |  VERB  |  verb
265  |  NUM  |  numeral
million  |  NUM  |  numeral
$  |  NUM  |  numeral
on  |  ADP  |  adposition
the  |  DET  |  determiner
very  |  ADV  |  adverb
first  |  ADJ  |  adjective
day  |  NOUN  |  noun
.  |  PUNCT  |  punctuation


In [22]:
doc = nlp("Wow! Dr. Strange made 265 million $ on the very first day.")

# Same sentence, now showing `tag_` next to `pos_` plus the description of the tag.
for token in doc:
    print(token, token.pos_, token.tag_, spacy.explain(token.tag_), sep="  |  ")

Wow  |  INTJ  |  UH  |  interjection
!  |  PUNCT  |  .  |  punctuation mark, sentence closer
Dr.  |  PROPN  |  NNP  |  noun, proper singular
Strange  |  PROPN  |  NNP  |  noun, proper singular
made  |  VERB  |  VBD  |  verb, past tense
265  |  NUM  |  CD  |  cardinal number
million  |  NUM  |  CD  |  cardinal number
$  |  NUM  |  CD  |  cardinal number
on  |  ADP  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  DT  |  determiner
very  |  ADV  |  RB  |  adverb
first  |  ADJ  |  JJ  |  adjective (English), other noun-modifier (Chinese)
day  |  NOUN  |  NN  |  noun, singular or mass
.  |  PUNCT  |  .  |  punctuation mark, sentence closer


In [23]:
doc = nlp("He quit the job")

# Look at the tag assigned to the second token only.
second = doc[1]
print(second.text, second.tag_, spacy.explain(second.tag_), sep="  |  ")

quit  |  VBD  |  verb, past tense


In [24]:
earnings_text= """Microsoft Corp. today announced the following results for the quarter ended September 30, 2023, as compared to the corresponding period of last fiscal year:

·        Revenue was $56.5 billion and increased 13% (up 12% in constant currency)

·        Operating income was $26.9 billion and increased 25% (up 24% in constant currency)

·        Net income was $22.3 billion and increased 27% (up 26% in constant currency)

·        Diluted earnings per share was $2.99 and increased 27% (up 26% in constant currency)

"With copilots, we are making the age of AI real for people and businesses everywhere," said Satya Nadella, 
chairman and chief executive officer of Microsoft. "We are rapidly infusing AI across every layer of the tech stack and for every role and 
business process to drive productivity gains for our customers".
"""

In [25]:
# Show only the tokens whose POS is "SPACE", "X" or "PUNCT".
doc = nlp(earnings_text)

for token in doc:
    if token.pos_ not in ("SPACE", "X", "PUNCT"):
        continue
    print(token, token.pos_, token.tag_, spacy.explain(token.pos_), sep="  |  ")

,  |  PUNCT  |  ,  |  punctuation
,  |  PUNCT  |  ,  |  punctuation
:  |  PUNCT  |  :  |  punctuation


  |  SPACE  |  _SP  |  space
·  |  PUNCT  |  NFP  |  punctuation
         |  SPACE  |  _SP  |  space
(  |  PUNCT  |  -LRB-  |  punctuation
)  |  PUNCT  |  -RRB-  |  punctuation


  |  SPACE  |  _SP  |  space
·  |  PUNCT  |  NFP  |  punctuation
         |  SPACE  |  _SP  |  space
(  |  PUNCT  |  -LRB-  |  punctuation
)  |  PUNCT  |  -RRB-  |  punctuation


  |  SPACE  |  _SP  |  space
·  |  PUNCT  |  NFP  |  punctuation
         |  SPACE  |  _SP  |  space
(  |  PUNCT  |  -LRB-  |  punctuation
)  |  PUNCT  |  -RRB-  |  punctuation


  |  SPACE  |  _SP  |  space
·  |  PUNCT  |  NFP  |  punctuation
         |  SPACE  |  _SP  |  space
(  |  PUNCT  |  -LRB-  |  punctuation
)  |  PUNCT  |  -RRB-  |  punctuation


  |  SPACE  |  _SP  |  space
"  |  PUNCT  |  ``  |  punctuation
,  |  PUNCT  |  ,  |  punctuation
,  |  PUNCT  |  ,  |  punctuation
"  |  PUNCT  |  ''  |  punctuation
,  |  PUNCT  

In [26]:
# Show every token except those whose POS is "SPACE", "X" or "PUNCT".
doc = nlp(earnings_text)

for token in doc:
    if token.pos_ in ("SPACE", "X", "PUNCT"):
        continue
    print(token, token.pos_, token.tag_, spacy.explain(token.pos_), sep="  |  ")

Microsoft  |  PROPN  |  NNP  |  proper noun
Corp.  |  PROPN  |  NNP  |  proper noun
today  |  NOUN  |  NN  |  noun
announced  |  VERB  |  VBD  |  verb
the  |  DET  |  DT  |  determiner
following  |  VERB  |  VBG  |  verb
results  |  NOUN  |  NNS  |  noun
for  |  ADP  |  IN  |  adposition
the  |  DET  |  DT  |  determiner
quarter  |  NOUN  |  NN  |  noun
ended  |  VERB  |  VBD  |  verb
September  |  PROPN  |  NNP  |  proper noun
30  |  NUM  |  CD  |  numeral
2023  |  NUM  |  CD  |  numeral
as  |  SCONJ  |  IN  |  subordinating conjunction
compared  |  VERB  |  VBN  |  verb
to  |  ADP  |  IN  |  adposition
the  |  DET  |  DT  |  determiner
corresponding  |  ADJ  |  JJ  |  adjective
period  |  NOUN  |  NN  |  noun
of  |  ADP  |  IN  |  adposition
last  |  ADJ  |  JJ  |  adjective
fiscal  |  ADJ  |  JJ  |  adjective
year  |  NOUN  |  NN  |  noun
Revenue  |  NOUN  |  NN  |  noun
was  |  AUX  |  VBD  |  auxiliary
$  |  SYM  |  $  |  symbol
56.5  |  NUM  |  CD  |  numeral
billion  |  NUM  |  

In [27]:
doc = nlp(earnings_text)

# Keep the Token objects whose POS is not "SPACE", "X" or "PUNCT".
filtered_text = [
    token for token in doc if token.pos_ not in ("SPACE", "X", "PUNCT")
]

In [28]:
# Peek at the first ten kept tokens.
filtered_text[:10]

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter]

In [29]:
# Tally the tokens of `doc` per POS; the keys of the result are integer POS IDs.
count= doc.count_by(spacy.attrs.POS)
count

{96: 8,
 92: 37,
 100: 14,
 90: 7,
 85: 16,
 93: 17,
 97: 24,
 98: 1,
 84: 11,
 103: 12,
 87: 6,
 99: 4,
 89: 8,
 86: 6,
 95: 3,
 94: 1}

In [30]:
# Resolve the integer ID 96 back to its string name through the vocabulary.
doc.vocab[96].text

'PROPN'

In [31]:
# Print each POS name (looked up from its ID) with its token count.
for pos_id, n_tokens in count.items():
    print(doc.vocab[pos_id].text, n_tokens, sep="  |  ")

PROPN  |  8
NOUN  |  37
VERB  |  14
DET  |  7
ADP  |  16
NUM  |  17
PUNCT  |  24
SCONJ  |  1
ADJ  |  11
SPACE  |  12
AUX  |  6
SYM  |  4
CCONJ  |  8
ADV  |  6
PRON  |  3
PART  |  1


# Named Entity Recognition (NER)

In [32]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Entity text, label and label description.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [33]:
# Visualize the entities of the current `doc`.
displacy.render(doc, style="ent")

In [34]:
# Labels known to the pipeline's `ner` component.
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [35]:
doc = nlp("Micheal Bloomberg founded Bloomberg Inc in 1982")

# Entity text, label and label description.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Micheal Bloomberg  |  PERSON  |  People, including fictional
Bloomberg Inc  |  ORG  |  Companies, agencies, institutions, etc.
1982  |  DATE  |  Absolute or relative dates or periods


In [36]:
#Hugging Face named entity recognition

# SPAN

In [37]:
from spacy.tokens import Span

In [38]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Token offsets: "Tesla"=0, "Inc"=1, ..., "twitter"=6. Span ends are exclusive,
# so (0, 2) covers the whole company name "Tesla Inc" and (6, 7) covers "twitter".
s1 = Span(doc, 0, 2, label="ORG")
s2 = Span(doc, 6, 7, label="ORG")

# default="unmodified" keeps the model's other entities (e.g. the MONEY span)
# and only overrides the tokens covered by the spans passed in.
doc.set_ents([s1, s2], default="unmodified")

In [39]:
# List the entities now stored on `doc`.
for ent in doc.ents:
    print(ent.text, ent.label_, sep="  |  ")

Tesla Inc  |  ORG
twitter  |  ORG
$45 billion  |  MONEY


# BAG OF WORDS

In [40]:
# Fit a bag-of-words vocabulary on one sentence (`fit` returns the vectorizer itself).
v = CountVectorizer().fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [41]:
# Same sentence, with n-grams of length 1 to 3 in the vocabulary.
v = CountVectorizer(ngram_range=(1, 3)).fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [42]:
# Three example sentences used as the corpus for the preprocessing and n-gram cells.
corpus=[
    "Wunmi ate rice and beans.",
    "Funke is beautiful and smart.",
    "Nigerians are friendly and loves eating jollof rice."
]

In [43]:
# Pipeline used by `preprocess` for tokenizing and lemmatizing.
nlp = spacy.load("en_core_web_sm")

def preprocess(text: str) -> str:
    """Return `text` reduced to a space-joined string of lemmas.

    Stop words, punctuation and whitespace-only tokens are dropped; every
    remaining token is replaced by its lemma.

    Args:
        text: Raw input text.

    Returns:
        The lemmas of the kept tokens joined by single spaces (an empty
        string when nothing is kept).
    """
    return " ".join(
        token.lemma_
        for token in nlp(text)
        # Whitespace-only tokens (runs of spaces, newlines) are neither stop
        # words nor punctuation, so they need their own check.
        if not (token.is_stop or token.is_punct or token.is_space)
    )

In [44]:
# Try `preprocess` on a single sentence.
preprocess("Nigerians are friendly and loves eating jollof rice.")

'Nigerians friendly love eat jollof rice'

In [45]:
# Run `preprocess` over every sentence of the corpus, keeping the order.
processed_corpus = list(map(preprocess, corpus))
processed_corpus

['Wunmi eat rice bean',
 'Funke beautiful smart',
 'Nigerians friendly love eat jollof rice']

# BAG OF N-GRAMS

In [46]:
# Unigram + bigram vocabulary fitted on the preprocessed corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(processed_corpus)
v.vocabulary_

{'wunmi': 19,
 'eat': 3,
 'rice': 16,
 'bean': 0,
 'wunmi eat': 20,
 'eat rice': 5,
 'rice bean': 17,
 'funke': 8,
 'beautiful': 1,
 'smart': 18,
 'funke beautiful': 9,
 'beautiful smart': 2,
 'nigerians': 14,
 'friendly': 6,
 'love': 12,
 'jollof': 10,
 'nigerians friendly': 15,
 'friendly love': 7,
 'love eat': 13,
 'eat jollof': 4,
 'jollof rice': 11}

In [47]:
# Vocabulary terms listed in the order of their index in `v.vocabulary_`.
v.get_feature_names_out()

array(['bean', 'beautiful', 'beautiful smart', 'eat', 'eat jollof',
       'eat rice', 'friendly', 'friendly love', 'funke',
       'funke beautiful', 'jollof', 'jollof rice', 'love', 'love eat',
       'nigerians', 'nigerians friendly', 'rice', 'rice bean', 'smart',
       'wunmi', 'wunmi eat'], dtype=object)

In [48]:
# NOTE(review): this passes the raw sentence, while `v` was fitted on
# `processed_corpus` (lemmatized, stop words removed). Only `rice` and `wunmi`
# match the vocabulary here; wrap the sentence in `preprocess(...)` if the
# intent is to reproduce the features of the first corpus document.
v.transform(["Wunmi ate rice and beans."]).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]],
      dtype=int64)

In [49]:
# Count matrix for the preprocessed corpus: one row per document, one column per vocabulary term.
v.transform(processed_corpus).toarray()

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],
       [0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]],
      dtype=int64)

In [50]:
# Lemmatize the three corpus sentences as one text.
# The backslash continuation keeps the next line's leading spaces inside the
# string, which is why a whitespace-only token shows up in the output.
# NOTE(review): "eatting" is misspelled here (the corpus has "eating") and is
# lemmatized to "eatte" — confirm whether the misspelling is intentional.
doc= nlp("Wunmi ate rice and beans. Funke is beautiful and smart. \
         Nigerians are friendly and loves eatting jollof rice.")

# Token, lemma string and lemma integer ID.
for token in doc:
    print(token, "|", token.lemma_, "|", token.lemma)

Wunmi | Wunmi | 17622258296457287139
ate | eat | 9837207709914848172
rice | rice | 5999186075793416517
and | and | 2283656566040971221
beans | bean | 4990996348219001211
. | . | 12646065887601541794
Funke | Funke | 14706415696644233707
is | be | 10382539506755952630
beautiful | beautiful | 530855179026533974
and | and | 2283656566040971221
smart | smart | 2191600787884973499
. | . | 12646065887601541794
          |           | 7263008856362461915
Nigerians | Nigerians | 2558192113150403045
are | be | 10382539506755952630
friendly | friendly | 12034322203066787430
and | and | 2283656566040971221
loves | love | 3702023516439754181
eatting | eatte | 17271744586191902617
jollof | jollof | 2559203094161453931
rice | rice | 5999186075793416517
. | . | 12646065887601541794


# TFIDF VECTORIZER

In [60]:
text= ["""Incredible Gifts India Wooden Happy Birthday Unique Personalized Gift (5 X 4 Inch) Size:4 x 5   
Made Of Natural Imported Wood, Which Is Quite Solid With Light Particle Pattern & Is Soft Pale To Blond Colour. 
Your Uploaded Photo Will Look Amazing And Beautiful After Laser Engraving On It. This Is One Of The Most Popular Unique Gifts In Our Store. 
We Offer This In Multiple Sizes, 
Some Can Be Used As Table Top And The Big Sizes Can Be Used As Wall Hanging Which Just Blends With Your Home Decaration. 
You Just Need To Upload A Picture And Add Your Own Text And We Will Do The Rest For You. 
We Will Email You The Preview Before Making The Final Product. 
Do You Want The Best Moment Of Your Life To Be Engraved On A Wooden Plaque That Lasts For A Longer Time And Stays Close To You Forever? 
Then You Are At The Right Place. 
We Present To You Various Sizes Personalized Engraved Wooden Plaques Made With Birch Wood. 
Let Your Memories Be Engraved On Wooden Plaques And Stay With Your Forever.
"""]

In [61]:
# Fit a TF-IDF vectorizer on `text` and return the transformed sparse matrix.
# NOTE(review): `text` holds a single document, so every term ends up with an
# idf of 1.0 (see the idf listing further down); a multi-document corpus is
# needed for idf to discriminate between terms.
cv= TfidfVectorizer()
cv.fit_transform(text)

<1x109 sparse matrix of type '<class 'numpy.float64'>'
	with 109 stored elements in Compressed Sparse Row format>

In [68]:
# All terms learned by the TF-IDF vectorizer.
cv.get_feature_names_out()

array(['add', 'after', 'amazing', 'and', 'are', 'as', 'at', 'be',
       'beautiful', 'before', 'best', 'big', 'birch', 'birthday',
       'blends', 'blond', 'can', 'close', 'colour', 'decaration', 'do',
       'email', 'engraved', 'engraving', 'final', 'for', 'forever',
       'gift', 'gifts', 'hanging', 'happy', 'home', 'imported', 'in',
       'inch', 'incredible', 'india', 'is', 'it', 'just', 'laser',
       'lasts', 'let', 'life', 'light', 'longer', 'look', 'made',
       'making', 'memories', 'moment', 'most', 'multiple', 'natural',
       'need', 'of', 'offer', 'on', 'one', 'our', 'own', 'pale',
       'particle', 'pattern', 'personalized', 'photo', 'picture', 'place',
       'plaque', 'plaques', 'popular', 'present', 'preview', 'product',
       'quite', 'rest', 'right', 'size', 'sizes', 'soft', 'solid', 'some',
       'stay', 'stays', 'store', 'table', 'text', 'that', 'the', 'then',
       'this', 'time', 'to', 'top', 'unique', 'upload', 'uploaded',
       'used', 'various', '

In [69]:
# Mapping from each term to its feature index.
print(cv.vocabulary_)

{'incredible': 35, 'gifts': 28, 'india': 36, 'wooden': 106, 'happy': 30, 'birthday': 13, 'unique': 94, 'personalized': 64, 'gift': 27, 'inch': 34, 'size': 77, 'made': 47, 'of': 55, 'natural': 53, 'imported': 32, 'wood': 105, 'which': 102, 'is': 37, 'quite': 74, 'solid': 80, 'with': 104, 'light': 44, 'particle': 62, 'pattern': 63, 'soft': 79, 'pale': 61, 'to': 92, 'blond': 15, 'colour': 18, 'your': 108, 'uploaded': 96, 'photo': 65, 'will': 103, 'look': 46, 'amazing': 2, 'and': 3, 'beautiful': 8, 'after': 1, 'laser': 40, 'engraving': 23, 'on': 57, 'it': 38, 'this': 90, 'one': 58, 'the': 88, 'most': 51, 'popular': 70, 'in': 33, 'our': 59, 'store': 84, 'we': 101, 'offer': 56, 'multiple': 52, 'sizes': 78, 'some': 81, 'can': 16, 'be': 7, 'used': 97, 'as': 5, 'table': 85, 'top': 93, 'big': 11, 'wall': 99, 'hanging': 29, 'just': 39, 'blends': 14, 'home': 31, 'decaration': 19, 'you': 107, 'need': 54, 'upload': 95, 'picture': 66, 'add': 0, 'own': 60, 'text': 86, 'do': 20, 'rest': 75, 'for': 25, 

In [70]:
# Term stored at feature index 35.
cv.get_feature_names_out()[35]

'incredible'

In [71]:
# Terms stored at feature indices 35 through 49.
cv.get_feature_names_out()[35:50]

array(['incredible', 'india', 'is', 'it', 'just', 'laser', 'lasts', 'let',
       'life', 'light', 'longer', 'look', 'made', 'making', 'memories'],
      dtype=object)

In [73]:
# Print the idf score of every term.
feature_names = cv.get_feature_names_out()

# `feature_names` and `cv.idf_` share the same feature ordering, so the two
# arrays can be walked in parallel.
for word, idf_score in zip(feature_names, cv.idf_):
    print(word, ":", idf_score)

add : 1.0
after : 1.0
amazing : 1.0
and : 1.0
are : 1.0
as : 1.0
at : 1.0
be : 1.0
beautiful : 1.0
before : 1.0
best : 1.0
big : 1.0
birch : 1.0
birthday : 1.0
blends : 1.0
blond : 1.0
can : 1.0
close : 1.0
colour : 1.0
decaration : 1.0
do : 1.0
email : 1.0
engraved : 1.0
engraving : 1.0
final : 1.0
for : 1.0
forever : 1.0
gift : 1.0
gifts : 1.0
hanging : 1.0
happy : 1.0
home : 1.0
imported : 1.0
in : 1.0
inch : 1.0
incredible : 1.0
india : 1.0
is : 1.0
it : 1.0
just : 1.0
laser : 1.0
lasts : 1.0
let : 1.0
life : 1.0
light : 1.0
longer : 1.0
look : 1.0
made : 1.0
making : 1.0
memories : 1.0
moment : 1.0
most : 1.0
multiple : 1.0
natural : 1.0
need : 1.0
of : 1.0
offer : 1.0
on : 1.0
one : 1.0
our : 1.0
own : 1.0
pale : 1.0
particle : 1.0
pattern : 1.0
personalized : 1.0
photo : 1.0
picture : 1.0
place : 1.0
plaque : 1.0
plaques : 1.0
popular : 1.0
present : 1.0
preview : 1.0
product : 1.0
quite : 1.0
rest : 1.0
right : 1.0
size : 1.0
sizes : 1.0
soft : 1.0
solid : 1.0
some : 1.0
stay :

# SPACY WORD VECTOR (WORD EMBEDDING)

In [51]:
# Switch to the large English model (`en_core_web_lg`) for the vector example below.
nlp= spacy.load("en_core_web_lg")

In [52]:
# Parse the product description already stored in `text[0]` by the TF-IDF
# section instead of repeating the same multi-line literal here.
doc = nlp(text[0])

In [54]:
# First five components of the document vector.
doc.vector[:5]

array([-0.49471596, -0.44727975, -1.3359691 , -0.3698964 ,  2.3133612 ],
      dtype=float32)